From 47df1793171771ff9b1622d30433ef6edf0eb280 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 25 Apr 2008 02:01:44 +0000 Subject: [CIFS] Update cifs version number Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index e1dd9f32e1d..503f6aa734a 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -110,5 +110,5 @@ extern int cifs_ioctl(struct inode *inode, struct file *filep, extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.52" +#define CIFS_VERSION "1.53" #endif /* _CIFSFS_H */ -- cgit v1.2.3-70-g09d2 From 0206e61b467fde4d7b50f1a64355182a4fd9576b Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 25 Apr 2008 18:19:40 +0000 Subject: [CIFS] Fix spelling mistake Noticed by Joe Perches Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 9f49c2f3582..a0d26b540d4 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2050,7 +2050,7 @@ typedef struct { to 0xFFFF00 */ #define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 #define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ -#define CIFS_UNIX_TRANPSORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX -- cgit v1.2.3-70-g09d2 From d09e860cf07e7c9ee12920a09f5080e30a12a23a Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 26 Apr 2008 00:22:23 +0000 Subject: [CIFS] Adds to dns_resolver checking if the server name is an IP addr and skipping upcall in this case. Signed-off-by: Igor Mammedov Signed-off-by: sfrench@us.ibm.com --- fs/cifs/dns_resolve.c | 62 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 7cc86c41818..939e256f849 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -55,6 +55,32 @@ struct key_type key_type_dns_resolver = { .match = user_match, }; +/* Checks if supplied name is IP address + * returns: + * 1 - name is IP + * 0 - name is not IP + */ +static int is_ip(const char *name) +{ + int rc; + struct sockaddr_in sin_server; + struct sockaddr_in6 sin_server6; + + rc = cifs_inet_pton(AF_INET, name, + &sin_server.sin_addr.s_addr); + + if (rc <= 0) { + /* not ipv4 address, try ipv6 */ + rc = cifs_inet_pton(AF_INET6, name, + &sin_server6.sin6_addr.in6_u); + if (rc > 0) + return 1; + } else { + return 1; + } + /* we failed translating address */ + return 0; +} /* Resolves server name to ip address. 
* input: @@ -67,8 +93,9 @@ int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) { int rc = -EAGAIN; - struct key *rkey; + struct key *rkey = ERR_PTR(-EAGAIN); char *name; + char *data = NULL; int len; if (!ip_addr || !unc) @@ -97,26 +124,41 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) memcpy(name, unc+2, len); name[len] = 0; + if (is_ip(name)) { + cFYI(1, ("%s: it is IP, skipping dns upcall: %s", + __func__, name)); + data = name; + goto skip_upcall; + } + rkey = request_key(&key_type_dns_resolver, name, ""); if (!IS_ERR(rkey)) { - len = strlen(rkey->payload.data); - *ip_addr = kmalloc(len+1, GFP_KERNEL); - if (*ip_addr) { - memcpy(*ip_addr, rkey->payload.data, len); - (*ip_addr)[len] = '\0'; - cFYI(1, ("%s: resolved: %s to %s", __func__, + data = rkey->payload.data; + cFYI(1, ("%s: resolved: %s to %s", __func__, rkey->description, *ip_addr )); + } else { + cERROR(1, ("%s: unable to resolve: %s", __func__, name)); + goto out; + } + +skip_upcall: + if (data) { + len = strlen(data); + *ip_addr = kmalloc(len+1, GFP_KERNEL); + if (*ip_addr) { + memcpy(*ip_addr, data, len); + (*ip_addr)[len] = '\0'; rc = 0; } else { rc = -ENOMEM; } - key_put(rkey); - } else { - cERROR(1, ("%s: unable to resolve: %s", __func__, name)); + if (!IS_ERR(rkey)) + key_put(rkey); } +out: kfree(name); return rc; } -- cgit v1.2.3-70-g09d2 From 39da9847113a870b20fee9a7c216a848b9a5e9f7 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 28 Apr 2008 04:04:34 +0000 Subject: [CIFS] Fix statfs formatting Signed-off-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/CHANGES | 3 +++ fs/cifs/cifsfs.c | 66 ++++++++++++++++++++++++++++---------------------------- 2 files changed, 36 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 05c9da6181c..8355e918fdd 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,6 @@ +Version 1.53 +------------ + Version 1.52 ------------ Fix oops on second mount to server when null auth is used. diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 39c2cbdface..68f1cdf6aed 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -222,50 +222,50 @@ static int cifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - int xid; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; int rc = -EOPNOTSUPP; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; + int xid; xid = GetXid(); - cifs_sb = CIFS_SB(sb); - pTcon = cifs_sb->tcon; - buf->f_type = CIFS_MAGIC_NUMBER; - /* instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO */ - buf->f_namelen = PATH_MAX; /* PATH_MAX may be too long - it would - presumably be total path, but note - that some servers (includinng Samba 3) - have a shorter maximum path */ + /* + * PATH_MAX may be too long - it would presumably be total path, + * but note that some servers (includinng Samba 3) have a shorter + * maximum path. + * + * Instead could get the real value via SMB_QUERY_FS_ATTRIBUTE_INFO. 
+ */ + buf->f_namelen = PATH_MAX; buf->f_files = 0; /* undefined */ buf->f_ffree = 0; /* unlimited */ -/* BB we could add a second check for a QFS Unix capability bit */ -/* BB FIXME check CIFS_POSIX_EXTENSIONS Unix cap first FIXME BB */ - if ((pTcon->ses->capabilities & CAP_UNIX) && (CIFS_POSIX_EXTENSIONS & - le64_to_cpu(pTcon->fsUnixInfo.Capability))) - rc = CIFSSMBQFSPosixInfo(xid, pTcon, buf); - - /* Only need to call the old QFSInfo if failed - on newer one */ - if (rc) - if (pTcon->ses->capabilities & CAP_NT_SMBS) - rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */ - - /* Some old Windows servers also do not support level 103, retry with - older level one if old server failed the previous call or we - bypassed it because we detected that this was an older LANMAN sess */ + /* + * We could add a second check for a QFS Unix capability bit + */ + if ((tcon->ses->capabilities & CAP_UNIX) && + (CIFS_POSIX_EXTENSIONS & le64_to_cpu(tcon->fsUnixInfo.Capability))) + rc = CIFSSMBQFSPosixInfo(xid, tcon, buf); + + /* + * Only need to call the old QFSInfo if failed on newer one, + * e.g. by OS/2. + **/ + if (rc && (tcon->ses->capabilities & CAP_NT_SMBS)) + rc = CIFSSMBQFSInfo(xid, tcon, buf); + + /* + * Some old Windows servers also do not support level 103, retry with + * older level one if old server failed the previous call or we + * bypassed it because we detected that this was an older LANMAN sess + */ if (rc) - rc = SMBOldQFSInfo(xid, pTcon, buf); - /* int f_type; - __fsid_t f_fsid; - int f_namelen; */ - /* BB get from info in tcon struct at mount time call to QFSAttrInfo */ + rc = SMBOldQFSInfo(xid, tcon, buf); + FreeXid(xid); - return 0; /* always return success? what if volume is no - longer available? */ + return 0; } static int cifs_permission(struct inode *inode, int mask, struct nameidata *nd) -- cgit v1.2.3-70-g09d2 From 22ba0317c81ba263172baaefd2cb38de78c4598f Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 28 Apr 2008 18:38:49 +0300 Subject: udf: fs/udf/partition.c:udf_get_pblock() mustn't be inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the following build error with UML and gcc 4.3: <-- snip --> ... 
CC fs/udf/partition.o /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/udf/partition.c: In function ‘udf_get_pblock_virt15’: /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/udf/partition.c:32: sorry, unimplemented: inlining failed in call to ‘udf_get_pblock’: function body not available /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/udf/partition.c:102: sorry, unimplemented: called from here make[3]: *** [fs/udf/partition.o] Error 1 <-- snip --> Signed-off-by: Adrian Bunk Signed-off-by: Jan Kara --- fs/udf/partition.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 63610f026ae..96dfd207c3d 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -27,8 +27,8 @@ #include #include -inline uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, - uint16_t partition, uint32_t offset) +uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, + uint16_t partition, uint32_t offset) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; -- cgit v1.2.3-70-g09d2 From bf62fd887cab230f5952b611bde25e8e15acb454 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 28 Apr 2008 23:05:58 +0000 Subject: [CIFS] fixed compatibility issue with samba refferal request treeName part is canonicalized to '/' path separator Signed-off-by: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 95024c066d8..f6fdecf6598 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -93,15 +93,11 @@ static char *cifs_get_share_name(const char *node_name) /* find sharename end */ pSep++; pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); - if (!pSep) { - cERROR(1, ("%s:2 cant find share name in node name: %s", - __func__, node_name)); - kfree(UNC); - return NULL; + if (pSep) { + /* trim path up to sharename end + * now we have share name in UNC */ + *pSep = 0; } - /* trim path up to sharename end - * * now we have share name in UNC */ - *pSep = 0; return UNC; } @@ -188,7 +184,7 @@ static char *compose_mount_options(const char *sb_mountdata, tkn_e = strchr(tkn_e+1, '\\'); if (tkn_e) { strcat(mountdata, ",prefixpath="); - strcat(mountdata, tkn_e); + strcat(mountdata, tkn_e+1); } } @@ -244,7 +240,8 @@ static char *build_full_dfs_path_from_dentry(struct dentry *dentry) return NULL; if (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS) { - /* we should use full path name to correct working with DFS */ + int i; + /* we should use full path name for correct working with DFS */ l_max_len = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE+1) + strnlen(search_path, MAX_PATHCONF) + 1; tmp_path = kmalloc(l_max_len, GFP_KERNEL); @@ -253,8 +250,14 @@ static char *build_full_dfs_path_from_dentry(struct dentry *dentry) return NULL; } strncpy(tmp_path, cifs_sb->tcon->treeName, l_max_len); - strcat(tmp_path, search_path); tmp_path[l_max_len-1] = 0; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + for (i = 0; i < l_max_len; i++) { + if (tmp_path[i] == '\\') + tmp_path[i] = '/'; + } + strncat(tmp_path, search_path, l_max_len - strlen(tmp_path)); + full_path = tmp_path; kfree(search_path); } else { -- cgit v1.2.3-70-g09d2 From 4b18f2a9c3964f7612b7403dddc1d1ba5443ae24 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Apr 2008 00:06:05 +0000 Subject: [CIFS] convert usage of implicit booleans to bool Signed-off-by: Joe Perches Signed-off-by: Steve 
French --- fs/cifs/asn1.c | 10 +++---- fs/cifs/cifsacl.c | 16 +++++------ fs/cifs/cifsfs.c | 6 ++-- fs/cifs/cifsfs.h | 8 ------ fs/cifs/cifsglob.h | 44 ++++++++++++----------------- fs/cifs/cifsproto.h | 13 +++++---- fs/cifs/cifssmb.c | 38 ++++++++++++------------- fs/cifs/connect.c | 80 ++++++++++++++++++++++++++-------------------------- fs/cifs/dir.c | 18 ++++++------ fs/cifs/fcntl.c | 2 +- fs/cifs/file.c | 80 ++++++++++++++++++++++++++-------------------------- fs/cifs/inode.c | 42 +++++++++++++-------------- fs/cifs/link.c | 2 +- fs/cifs/misc.c | 33 +++++++++++----------- fs/cifs/readdir.c | 12 ++++---- fs/cifs/smbencrypt.c | 8 +++--- fs/cifs/xattr.c | 2 +- 17 files changed, 200 insertions(+), 214 deletions(-) (limited to 'fs') diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index bcda2c6b6a0..cb52cbbe45f 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -460,8 +460,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, unsigned char *sequence_end; unsigned long *oid = NULL; unsigned int cls, con, tag, oidlen, rc; - int use_ntlmssp = FALSE; - int use_kerberos = FALSE; + bool use_ntlmssp = false; + bool use_kerberos = false; *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ @@ -561,15 +561,15 @@ decode_negTokenInit(unsigned char *security_blob, int length, if (compare_oid(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) - use_kerberos = TRUE; + use_kerberos = true; else if (compare_oid(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) - use_kerberos = TRUE; + use_kerberos = true; else if (compare_oid(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) - use_ntlmssp = TRUE; + use_ntlmssp = true; kfree(oid); } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index e99d4faf5f0..6fe1bc5bb36 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -559,7 +559,7 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, const char *path, const __u16 *pfid) { struct cifsFileInfo *open_file = NULL; - int unlock_file = FALSE; + bool unlock_file = false; int xid; int rc = -EIO; __u16 fid; @@ -586,10 +586,10 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, cifs_sb = CIFS_SB(sb); if (open_file) { - unlock_file = TRUE; + unlock_file = true; fid = open_file->netfid; } else if (pfid == NULL) { - int oplock = FALSE; + bool oplock = false; /* open file */ rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, &fid, &oplock, NULL, @@ -604,7 +604,7 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); - if (unlock_file == TRUE) /* find_readable_file increments ref count */ + if (unlock_file == true) /* find_readable_file increments ref count */ atomic_dec(&open_file->wrtPending); else if (pfid == NULL) /* if opened above we have to close the handle */ CIFSSMBClose(xid, cifs_sb->tcon, fid); @@ -619,7 +619,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path) { struct cifsFileInfo *open_file; - int unlock_file = FALSE; + bool unlock_file = false; int xid; int rc = -EIO; __u16 fid; @@ -640,10 +640,10 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, open_file = find_readable_file(CIFS_I(inode)); if (open_file) { - unlock_file = TRUE; + unlock_file = true; fid = open_file->netfid; } else { - int oplock = FALSE; + int oplock = 0; /* open file */ rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, &fid, &oplock, 
NULL, @@ -658,7 +658,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); - if (unlock_file == TRUE) + if (unlock_file) atomic_dec(&open_file->wrtPending); else CIFSSMBClose(xid, cifs_sb->tcon, fid); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 68f1cdf6aed..427a7c69589 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -306,8 +306,8 @@ cifs_alloc_inode(struct super_block *sb) /* Until the file is open and we have gotten oplock info back from the server, can not assume caching of file data or metadata */ - cifs_inode->clientCanCacheRead = FALSE; - cifs_inode->clientCanCacheAll = FALSE; + cifs_inode->clientCanCacheRead = false; + cifs_inode->clientCanCacheAll = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ /* Can not set i_flags here - they get immediately overwritten @@ -940,7 +940,7 @@ static int cifs_oplock_thread(void *dummyarg) rc = CIFSSMBLock(0, pTcon, netfid, 0 /* len */ , 0 /* offset */, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, - 0 /* wait flag */); + false /* wait flag */); cFYI(1, ("Oplock release rc = %d", rc)); } } else diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 503f6aa734a..cd1301a09b3 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -24,14 +24,6 @@ #define ROOT_I 2 -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 69a2e194254..b7d9f698e63 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -57,14 +57,6 @@ #include "cifspdu.h" -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - #ifndef XATTR_DOS_ATTRIB #define XATTR_DOS_ATTRIB "user.DOSATTRIB" #endif @@ -147,7 +139,7 @@ struct TCP_Server_Info { enum protocolEnum protocolType; char versionMajor; char versionMinor; - unsigned svlocal:1; /* local server or remote */ + bool svlocal:1; /* local server or remote */ atomic_t socketUseCount; /* number of open cifs sessions on socket */ atomic_t inFlight; /* number of requests on the wire to server */ #ifdef CONFIG_CIFS_STATS2 @@ -286,10 +278,10 @@ struct cifsTconInfo { FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; - unsigned ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ - unsigned retry:1; - unsigned nocase:1; - unsigned unix_ext:1; /* if off disable Linux extensions to CIFS protocol + bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ + bool retry:1; + bool nocase:1; + bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol for this mount even if server would support */ /* BB add field for back pointer to sb struct(s)? 
*/ }; @@ -317,10 +309,10 @@ struct cifs_search_info { char *srch_entries_start; char *presume_name; unsigned int resume_name_len; - unsigned endOfSearch:1; - unsigned emptyDir:1; - unsigned unicode:1; - unsigned smallBuf:1; /* so we know which buf_release function to call */ + bool endOfSearch:1; + bool emptyDir:1; + bool unicode:1; + bool smallBuf:1; /* so we know which buf_release function to call */ }; struct cifsFileInfo { @@ -335,9 +327,9 @@ struct cifsFileInfo { struct inode *pInode; /* needed for oplock break */ struct mutex lock_mutex; struct list_head llist; /* list of byte range locks we have. */ - unsigned closePend:1; /* file is marked to close */ - unsigned invalidHandle:1; /* file closed via session abend */ - unsigned messageMode:1; /* for pipes: message vs byte mode */ + bool closePend:1; /* file is marked to close */ + bool invalidHandle:1; /* file closed via session abend */ + bool messageMode:1; /* for pipes: message vs byte mode */ atomic_t wrtPending; /* handle in use - defer close */ struct semaphore fh_sem; /* prevents reopen race after dead ses*/ char *search_resume_name; /* BB removeme BB */ @@ -356,9 +348,9 @@ struct cifsInodeInfo { __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ atomic_t inUse; /* num concurrent users (local openers cifs) of file*/ unsigned long time; /* jiffies of last update/check of inode */ - unsigned clientCanCacheRead:1; /* read oplock */ - unsigned clientCanCacheAll:1; /* read and writebehind oplock */ - unsigned oplockPending:1; + bool clientCanCacheRead:1; /* read oplock */ + bool clientCanCacheAll:1; /* read and writebehind oplock */ + bool oplockPending:1; struct inode vfs_inode; }; @@ -426,9 +418,9 @@ struct mid_q_entry { struct smb_hdr *resp_buf; /* response buffer */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ - unsigned largeBuf:1; /* if valid response, is pointer to large buf */ - unsigned multiRsp:1; /* multiple trans2 responses for one request */ - unsigned multiEnd:1; /* both received */ + bool largeBuf:1; /* if valid response, is pointer to large buf */ + bool multiRsp:1; /* multiple trans2 responses for one request */ + bool multiEnd:1; /* both received */ }; struct oplock_q_entry { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 50f9fdae19b..a249a29109a 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -59,8 +59,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); -extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); -extern int is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); +extern bool is_valid_oplock_break(struct smb_hdr *smb, + struct TCP_Server_Info *); +extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); #ifdef CONFIG_CIFS_EXPERIMENTAL extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *); @@ -187,12 +188,12 @@ extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, #endif /* possibly unneeded function */ extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName, __u64 size, - int setAllocationSizeFlag, + bool setAllocationSizeFlag, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, __u16 
fileHandle, __u32 opener_pid, - int AllocSizeFlag); + bool AllocSizeFlag); extern int CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *pTcon, char *full_path, __u64 mode, __u64 uid, __u64 gid, dev_t dev, @@ -291,11 +292,11 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 netfid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, - const int waitFlag); + const bool waitFlag); extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, const __u64 len, struct file_lock *, - const __u16 lock_type, const int waitFlag); + const __u16 lock_type, const bool waitFlag); extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4728fa982a4..cfd9750852b 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -95,7 +95,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); if (open_file) - open_file->invalidHandle = TRUE; + open_file->invalidHandle = true; } write_unlock(&GlobalSMBSeslock); /* BB Add call to invalidate_inodes(sb) for all superblocks mounted @@ -141,7 +141,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon->ses->server->tcpStatus == CifsNeedReconnect) { /* on "soft" mounts we wait once */ - if ((tcon->retry == FALSE) || + if (!tcon->retry || (tcon->ses->status == CifsExiting)) { cFYI(1, ("gave up waiting on " "reconnect in smb_init")); @@ -289,7 +289,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon->ses->server->tcpStatus == CifsNeedReconnect) { /* on "soft" mounts we wait once */ - if ((tcon->retry == FALSE) || + if (!tcon->retry || (tcon->ses->status == CifsExiting)) { cFYI(1, ("gave up waiting on " "reconnect in smb_init")); @@ -1686,7 +1686,7 @@ int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const __u64 len, const __u64 offset, const __u32 numUnlock, - const __u32 numLock, const __u8 lockType, const int waitFlag) + const __u32 numLock, const __u8 lockType, const bool waitFlag) { int rc = 0; LOCK_REQ *pSMB = NULL; @@ -1695,7 +1695,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, int timeout = 0; __u16 count; - cFYI(1, ("CIFSSMBLock timeout %d numLock %d", waitFlag, numLock)); + cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock)); rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); if (rc) @@ -1706,7 +1706,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { timeout = CIFS_ASYNC_OP; /* no response expected */ pSMB->Timeout = 0; - } else if (waitFlag == TRUE) { + } else if (waitFlag) { timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */ pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */ } else { @@ -1756,7 +1756,7 @@ int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, const __u64 len, struct file_lock *pLockData, const __u16 lock_type, - const int waitFlag) + const bool waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -3581,9 +3581,9 @@ findFirstRetry: rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc == 0) { if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) - 
psrch_inf->unicode = TRUE; + psrch_inf->unicode = true; else - psrch_inf->unicode = FALSE; + psrch_inf->unicode = false; psrch_inf->ntwrk_buf_start = (char *)pSMBr; psrch_inf->smallBuf = 0; @@ -3594,9 +3594,9 @@ findFirstRetry: le16_to_cpu(pSMBr->t2.ParameterOffset)); if (parms->EndofSearch) - psrch_inf->endOfSearch = TRUE; + psrch_inf->endOfSearch = true; else - psrch_inf->endOfSearch = FALSE; + psrch_inf->endOfSearch = false; psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); @@ -3624,7 +3624,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, cFYI(1, ("In FindNext")); - if (psrch_inf->endOfSearch == TRUE) + if (psrch_inf->endOfSearch) return -ENOENT; rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -3682,7 +3682,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, cifs_stats_inc(&tcon->num_fnext); if (rc) { if (rc == -EBADF) { - psrch_inf->endOfSearch = TRUE; + psrch_inf->endOfSearch = true; rc = 0; /* search probably was closed at end of search*/ } else cFYI(1, ("FindNext returned = %d", rc)); @@ -3692,9 +3692,9 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, if (rc == 0) { /* BB fixme add lock for file (srch_info) struct here */ if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) - psrch_inf->unicode = TRUE; + psrch_inf->unicode = true; else - psrch_inf->unicode = FALSE; + psrch_inf->unicode = false; response_data = (char *) &pSMBr->hdr.Protocol + le16_to_cpu(pSMBr->t2.ParameterOffset); parms = (T2_FNEXT_RSP_PARMS *)response_data; @@ -3709,9 +3709,9 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, psrch_inf->ntwrk_buf_start = (char *)pSMB; psrch_inf->smallBuf = 0; if (parms->EndofSearch) - psrch_inf->endOfSearch = TRUE; + psrch_inf->endOfSearch = true; else - psrch_inf->endOfSearch = FALSE; + psrch_inf->endOfSearch = false; psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); psrch_inf->index_of_last_entry += @@ -4586,7 +4586,7 @@ QFSPosixRetry: int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName, - __u64 size, int SetAllocation, + __u64 size, bool SetAllocation, const struct nls_table *nls_codepage, int remap) { struct smb_com_transaction2_spi_req *pSMB = NULL; @@ -4675,7 +4675,7 @@ SetEOFRetry: int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, - __u16 fid, __u32 pid_of_opener, int SetAllocation) + __u16 fid, __u32 pid_of_opener, bool SetAllocation) { struct smb_com_transaction2_sfi_req *pSMB = NULL; char *data_offset; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e1710673016..6c4332f8da6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -71,23 +71,23 @@ struct smb_vol { mode_t file_mode; mode_t dir_mode; unsigned secFlg; - unsigned rw:1; - unsigned retry:1; - unsigned intr:1; - unsigned setuids:1; - unsigned override_uid:1; - unsigned override_gid:1; - unsigned noperm:1; - unsigned no_psx_acl:1; /* set if posix acl support should be disabled */ - unsigned cifs_acl:1; - unsigned no_xattr:1; /* set if xattr (EA) support should be disabled*/ - unsigned server_ino:1; /* use inode numbers from server ie UniqueId */ - unsigned direct_io:1; - unsigned remap:1; /* set to remap seven reserved chars in filenames */ - unsigned posix_paths:1; /* unset to not ask for posix pathnames. 
*/ - unsigned no_linux_ext:1; - unsigned sfu_emul:1; - unsigned nullauth:1; /* attempt to authenticate with null user */ + bool rw:1; + bool retry:1; + bool intr:1; + bool setuids:1; + bool override_uid:1; + bool override_gid:1; + bool noperm:1; + bool no_psx_acl:1; /* set if posix acl support should be disabled */ + bool cifs_acl:1; + bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ + bool server_ino:1; /* use inode numbers from server ie UniqueId */ + bool direct_io:1; + bool remap:1; /* set to remap seven reserved chars in filenames */ + bool posix_paths:1; /* unset to not ask for posix pathnames. */ + bool no_linux_ext:1; + bool sfu_emul:1; + bool nullauth:1; /* attempt to authenticate with null user */ unsigned nocase; /* request case insensitive filenames */ unsigned nobrl; /* disable sending byte range locks to srv */ unsigned int rsize; @@ -345,8 +345,8 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; char temp; - int isLargeBuf = FALSE; - int isMultiRsp; + bool isLargeBuf = false; + bool isMultiRsp; int reconnect; current->flags |= PF_MEMALLOC; @@ -390,8 +390,8 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) } else /* if existing small buf clear beginning */ memset(smallbuf, 0, sizeof(struct smb_hdr)); - isLargeBuf = FALSE; - isMultiRsp = FALSE; + isLargeBuf = false; + isMultiRsp = false; smb_buffer = smallbuf; iov.iov_base = smb_buffer; iov.iov_len = 4; @@ -517,7 +517,7 @@ incomplete_rcv: reconnect = 0; if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { - isLargeBuf = TRUE; + isLargeBuf = true; memcpy(bigbuf, smallbuf, 4); smb_buffer = bigbuf; } @@ -582,16 +582,18 @@ incomplete_rcv: (mid_entry->command == smb_buffer->Command)) { if (check2ndT2(smb_buffer,server->maxBuf) > 0) { /* We have a multipart transact2 resp */ - isMultiRsp = TRUE; + isMultiRsp = true; if (mid_entry->resp_buf) { /* merge response - fix up 1st*/ if (coalesce_t2(smb_buffer, mid_entry->resp_buf)) { - mid_entry->multiRsp = 1; + mid_entry->multiRsp = + true; break; } else { /* all parts received */ - mid_entry->multiEnd = 1; + mid_entry->multiEnd = + true; goto multi_t2_fnd; } } else { @@ -603,17 +605,15 @@ incomplete_rcv: /* Have first buffer */ mid_entry->resp_buf = smb_buffer; - mid_entry->largeBuf = 1; + mid_entry->largeBuf = + true; bigbuf = NULL; } } break; } mid_entry->resp_buf = smb_buffer; - if (isLargeBuf) - mid_entry->largeBuf = 1; - else - mid_entry->largeBuf = 0; + mid_entry->largeBuf = isLargeBuf; multi_t2_fnd: task_to_wake = mid_entry->tsk; mid_entry->midState = MID_RESPONSE_RECEIVED; @@ -638,8 +638,8 @@ multi_t2_fnd: smallbuf = NULL; } wake_up_process(task_to_wake); - } else if ((is_valid_oplock_break(smb_buffer, server) == FALSE) - && (isMultiRsp == FALSE)) { + } else if (!is_valid_oplock_break(smb_buffer, server) && + !isMultiRsp) { cERROR(1, ("No task to wake, unknown frame received! " "NumMids %d", midCount.counter)); cifs_dump_mem("Received Data is: ", (char *)smb_buffer, @@ -825,7 +825,7 @@ cifs_parse_mount_options(char *options, const char *devname, vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP); /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ - vol->rw = TRUE; + vol->rw = true; /* default is always to request posix paths. 
*/ vol->posix_paths = 1; @@ -1181,7 +1181,7 @@ cifs_parse_mount_options(char *options, const char *devname, } else if (strnicmp(data, "guest", 5) == 0) { /* ignore */ } else if (strnicmp(data, "rw", 2) == 0) { - vol->rw = TRUE; + vol->rw = true; } else if ((strnicmp(data, "suid", 4) == 0) || (strnicmp(data, "nosuid", 6) == 0) || (strnicmp(data, "exec", 4) == 0) || @@ -1197,7 +1197,7 @@ cifs_parse_mount_options(char *options, const char *devname, is ok to just ignore them */ continue; } else if (strnicmp(data, "ro", 2) == 0) { - vol->rw = FALSE; + vol->rw = false; } else if (strnicmp(data, "hard", 4) == 0) { vol->retry = 1; } else if (strnicmp(data, "soft", 4) == 0) { @@ -2602,7 +2602,7 @@ sesssetup_nomem: /* do not return an error on nomem for the info strings, static int CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, - struct cifsSesInfo *ses, int *pNTLMv2_flag, + struct cifsSesInfo *ses, bool *pNTLMv2_flag, const struct nls_table *nls_codepage) { struct smb_hdr *smb_buffer; @@ -2625,7 +2625,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (ses == NULL) return -EINVAL; domain = ses->domainName; - *pNTLMv2_flag = FALSE; + *pNTLMv2_flag = false; smb_buffer = cifs_buf_get(); if (smb_buffer == NULL) { return -ENOMEM; @@ -2778,7 +2778,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, CIFS_CRYPTO_KEY_SIZE); if (SecurityBlob2->NegotiateFlags & cpu_to_le32(NTLMSSP_NEGOTIATE_NTLMV2)) - *pNTLMv2_flag = TRUE; + *pNTLMv2_flag = true; if ((SecurityBlob2->NegotiateFlags & cpu_to_le32(NTLMSSP_NEGOTIATE_ALWAYS_SIGN)) @@ -2939,7 +2939,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, } static int CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, - char *ntlm_session_key, int ntlmv2_flag, + char *ntlm_session_key, bool ntlmv2_flag, const struct nls_table *nls_codepage) { struct smb_hdr *smb_buffer; @@ -3569,7 +3569,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, { int rc = 0; char ntlm_session_key[CIFS_SESS_KEY_SIZE]; - int ntlmv2_flag = FALSE; + bool ntlmv2_flag = false; int first_time = 0; /* what if server changes its buffer size after dropping the session? 
*/ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 0f5c62ba403..6ed775986be 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -130,7 +130,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, struct cifsFileInfo *pCifsFile = NULL; struct cifsInodeInfo *pCifsInode; int disposition = FILE_OVERWRITE_IF; - int write_only = FALSE; + bool write_only = false; xid = GetXid(); @@ -152,7 +152,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (oflags & FMODE_WRITE) { desiredAccess |= GENERIC_WRITE; if (!(oflags & FMODE_READ)) - write_only = TRUE; + write_only = true; } if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) @@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, d_instantiate(direntry, newinode); } if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || - ((nd->flags & LOOKUP_OPEN) == FALSE)) { + (!(nd->flags & LOOKUP_OPEN))) { /* mknod case - do not leave file open */ CIFSSMBClose(xid, pTcon, fileHandle); } else if (newinode) { @@ -266,8 +266,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, pCifsFile->netfid = fileHandle; pCifsFile->pid = current->tgid; pCifsFile->pInode = newinode; - pCifsFile->invalidHandle = FALSE; - pCifsFile->closePend = FALSE; + pCifsFile->invalidHandle = false; + pCifsFile->closePend = false; init_MUTEX(&pCifsFile->fh_sem); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); @@ -280,7 +280,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, pCifsInode = CIFS_I(newinode); if (pCifsInode) { /* if readable file instance put first in list*/ - if (write_only == TRUE) { + if (write_only) { list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); } else { @@ -288,12 +288,12 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, &pCifsInode->openFileList); } if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = TRUE; - pCifsInode->clientCanCacheRead = TRUE; + pCifsInode->clientCanCacheAll = true; + pCifsInode->clientCanCacheRead = true; cFYI(1, ("Exclusive Oplock inode %p", newinode)); } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = TRUE; + pCifsInode->clientCanCacheRead = true; } write_unlock(&GlobalSMBSeslock); } diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c index 7d1d5aa4c43..5a57581eb4b 100644 --- a/fs/cifs/fcntl.c +++ b/fs/cifs/fcntl.c @@ -68,7 +68,7 @@ int cifs_dir_notify(struct file *file, unsigned long arg) { int xid; int rc = -EINVAL; - int oplock = FALSE; + int oplock = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; char *full_path = NULL; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 40b690073fc..31a0a33b9d9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -51,8 +51,8 @@ static inline struct cifsFileInfo *cifs_init_private( INIT_LIST_HEAD(&private_data->llist); private_data->pfile = file; /* needed for writepage */ private_data->pInode = inode; - private_data->invalidHandle = FALSE; - private_data->closePend = FALSE; + private_data->invalidHandle = false; + private_data->closePend = false; /* we have to track num writers to the inode, since writepages does not tell us which handle the write is for so there can be a close (overlapping with write) of the filehandle that @@ -148,12 +148,12 @@ client_can_cache: full_path, buf, inode->i_sb, xid, NULL); if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = TRUE; - pCifsInode->clientCanCacheRead = TRUE; + pCifsInode->clientCanCacheAll = true; + 
pCifsInode->clientCanCacheRead = true; cFYI(1, ("Exclusive Oplock granted on inode %p", file->f_path.dentry->d_inode)); } else if ((*oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = TRUE; + pCifsInode->clientCanCacheRead = true; return rc; } @@ -247,7 +247,7 @@ int cifs_open(struct inode *inode, struct file *file) if (oplockEnabled) oplock = REQ_OPLOCK; else - oplock = FALSE; + oplock = 0; /* BB pass O_SYNC flag through on file attributes .. BB */ @@ -339,7 +339,7 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile) return rc; } -static int cifs_reopen_file(struct file *file, int can_flush) +static int cifs_reopen_file(struct file *file, bool can_flush) { int rc = -EACCES; int xid, oplock; @@ -360,7 +360,7 @@ static int cifs_reopen_file(struct file *file, int can_flush) xid = GetXid(); down(&pCifsFile->fh_sem); - if (pCifsFile->invalidHandle == FALSE) { + if (!pCifsFile->invalidHandle) { up(&pCifsFile->fh_sem); FreeXid(xid); return 0; @@ -404,7 +404,7 @@ reopen_error_exit: if (oplockEnabled) oplock = REQ_OPLOCK; else - oplock = FALSE; + oplock = 0; /* Can not refresh inode by passing in file_info buf to be returned by SMBOpen and then calling get_inode_info with returned buf @@ -422,7 +422,7 @@ reopen_error_exit: cFYI(1, ("oplock: %d", oplock)); } else { pCifsFile->netfid = netfid; - pCifsFile->invalidHandle = FALSE; + pCifsFile->invalidHandle = false; up(&pCifsFile->fh_sem); pCifsInode = CIFS_I(inode); if (pCifsInode) { @@ -432,8 +432,8 @@ reopen_error_exit: CIFS_I(inode)->write_behind_rc = rc; /* temporarily disable caching while we go to server to get inode info */ - pCifsInode->clientCanCacheAll = FALSE; - pCifsInode->clientCanCacheRead = FALSE; + pCifsInode->clientCanCacheAll = false; + pCifsInode->clientCanCacheRead = false; if (pTcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); @@ -448,16 +448,16 @@ reopen_error_exit: we can not go to the server to get the new inod info */ if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = TRUE; - pCifsInode->clientCanCacheRead = TRUE; + pCifsInode->clientCanCacheAll = true; + pCifsInode->clientCanCacheRead = true; cFYI(1, ("Exclusive Oplock granted on inode %p", file->f_path.dentry->d_inode)); } else if ((oplock & 0xF) == OPLOCK_READ) { - pCifsInode->clientCanCacheRead = TRUE; - pCifsInode->clientCanCacheAll = FALSE; + pCifsInode->clientCanCacheRead = true; + pCifsInode->clientCanCacheAll = false; } else { - pCifsInode->clientCanCacheRead = FALSE; - pCifsInode->clientCanCacheAll = FALSE; + pCifsInode->clientCanCacheRead = false; + pCifsInode->clientCanCacheAll = false; } cifs_relock_file(pCifsFile); } @@ -484,7 +484,7 @@ int cifs_close(struct inode *inode, struct file *file) if (pSMBFile) { struct cifsLockInfo *li, *tmp; - pSMBFile->closePend = TRUE; + pSMBFile->closePend = true; if (pTcon) { /* no sense reconnecting to close a file that is already closed */ @@ -553,8 +553,8 @@ int cifs_close(struct inode *inode, struct file *file) cFYI(1, ("closing last open instance for inode %p", inode)); /* if the file is not open we do not know if we can cache info on this inode, much less write behind and read ahead */ - CIFS_I(inode)->clientCanCacheRead = FALSE; - CIFS_I(inode)->clientCanCacheAll = FALSE; + CIFS_I(inode)->clientCanCacheRead = false; + CIFS_I(inode)->clientCanCacheAll = false; } read_unlock(&GlobalSMBSeslock); if ((rc == 0) && CIFS_I(inode)->write_behind_rc) @@ -583,9 +583,9 @@ int cifs_closedir(struct inode *inode, struct file *file) pTcon = cifs_sb->tcon; 
cFYI(1, ("Freeing private data in close dir")); - if ((pCFileStruct->srch_inf.endOfSearch == FALSE) && - (pCFileStruct->invalidHandle == FALSE)) { - pCFileStruct->invalidHandle = TRUE; + if (!pCFileStruct->srch_inf.endOfSearch && + !pCFileStruct->invalidHandle) { + pCFileStruct->invalidHandle = true; rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); cFYI(1, ("Closing uncompleted readdir with rc %d", rc)); @@ -637,12 +637,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) __u32 numLock = 0; __u32 numUnlock = 0; __u64 length; - int wait_flag = FALSE; + bool wait_flag = false; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; __u16 netfid; __u8 lockType = LOCKING_ANDX_LARGE_FILES; - int posix_locking; + bool posix_locking; length = 1 + pfLock->fl_end - pfLock->fl_start; rc = -EACCES; @@ -659,7 +659,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) cFYI(1, ("Flock")); if (pfLock->fl_flags & FL_SLEEP) { cFYI(1, ("Blocking lock")); - wait_flag = TRUE; + wait_flag = true; } if (pfLock->fl_flags & FL_ACCESS) cFYI(1, ("Process suspended by mandatory locking - " @@ -794,7 +794,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) stored_rc = CIFSSMBLock(xid, pTcon, netfid, li->length, li->offset, - 1, 0, li->type, FALSE); + 1, 0, li->type, false); if (stored_rc) rc = stored_rc; @@ -866,7 +866,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, filemap_fdatawait from here so tell reopen_file not to flush data to server now */ - rc = cifs_reopen_file(file, FALSE); + rc = cifs_reopen_file(file, false); if (rc != 0) break; } @@ -966,7 +966,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, filemap_fdatawait from here so tell reopen_file not to flush data to server now */ - rc = cifs_reopen_file(file, FALSE); + rc = cifs_reopen_file(file, false); if (rc != 0) break; } @@ -1093,7 +1093,7 @@ refind_writable: read_unlock(&GlobalSMBSeslock); /* Had to unlock since following call can block */ - rc = cifs_reopen_file(open_file->pfile, FALSE); + rc = cifs_reopen_file(open_file->pfile, false); if (!rc) { if (!open_file->closePend) return open_file; @@ -1608,7 +1608,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, int buf_type = CIFS_NO_BUFFER; if ((open_file->invalidHandle) && (!open_file->closePend)) { - rc = cifs_reopen_file(file, TRUE); + rc = cifs_reopen_file(file, true); if (rc != 0) break; } @@ -1693,7 +1693,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, while (rc == -EAGAIN) { if ((open_file->invalidHandle) && (!open_file->closePend)) { - rc = cifs_reopen_file(file, TRUE); + rc = cifs_reopen_file(file, true); if (rc != 0) break; } @@ -1850,7 +1850,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, while (rc == -EAGAIN) { if ((open_file->invalidHandle) && (!open_file->closePend)) { - rc = cifs_reopen_file(file, TRUE); + rc = cifs_reopen_file(file, true); if (rc != 0) break; } @@ -2009,10 +2009,10 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode) refreshing the inode only on increases in the file size but this is tricky to do without racing with writebehind page caching in the current Linux kernel design */ -int is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) +bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) { if (!cifsInode) - return 1; + return true; if (is_inode_writable(cifsInode)) { /* This inode is open for write at 
least once */ @@ -2022,15 +2022,15 @@ int is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { /* since no page cache to corrupt on directio we can change size safely */ - return 1; + return true; } if (i_size_read(&cifsInode->vfs_inode) < end_of_file) - return 1; + return true; - return 0; + return false; } else - return 1; + return true; } static int cifs_prepare_write(struct file *file, struct page *page, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e1031b9e2c5..8eaa9e72fe8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -281,7 +281,7 @@ static int decode_sfu_inode(struct inode *inode, __u64 size, struct cifs_sb_info *cifs_sb, int xid) { int rc; - int oplock = FALSE; + int oplock = 0; __u16 netfid; struct cifsTconInfo *pTcon = cifs_sb->tcon; char buf[24]; @@ -389,7 +389,7 @@ int cifs_get_inode_info(struct inode **pinode, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); const unsigned char *full_path = NULL; char *buf = NULL; - int adjustTZ = FALSE; + bool adjustTZ = bool; bool is_dfs_referral = false; pTcon = cifs_sb->tcon; @@ -425,7 +425,7 @@ try_again_CIFSSMBQPathInfo: pfindData, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - adjustTZ = TRUE; + adjustTZ = true; } } /* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */ @@ -703,7 +703,7 @@ psx_del_no_retry: } else if (rc == -ENOENT) { d_drop(direntry); } else if (rc == -ETXTBSY) { - int oplock = FALSE; + int oplock = 0; __u16 netfid; rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE, @@ -736,7 +736,7 @@ psx_del_no_retry: rc = -EOPNOTSUPP; if (rc == -EOPNOTSUPP) { - int oplock = FALSE; + int oplock = 0; __u16 netfid; /* rc = CIFSSMBSetAttrLegacy(xid, pTcon, full_path, @@ -774,7 +774,7 @@ psx_del_no_retry: if (direntry->d_inode) drop_nlink(direntry->d_inode); } else if (rc == -ETXTBSY) { - int oplock = FALSE; + int oplock = 0; __u16 netfid; rc = CIFSSMBOpen(xid, pTcon, full_path, @@ -1149,7 +1149,7 @@ int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, cFYI(1, ("rename rc %d", rc)); if ((rc == -EIO) || (rc == -EEXIST)) { - int oplock = FALSE; + int oplock = 0; __u16 netfid; /* BB FIXME Is Generic Read correct for rename? 
*/ @@ -1186,7 +1186,7 @@ int cifs_revalidate(struct dentry *direntry) struct cifsInodeInfo *cifsInode; loff_t local_size; struct timespec local_mtime; - int invalidate_inode = FALSE; + bool invalidate_inode = false; if (direntry->d_inode == NULL) return -ENOENT; @@ -1268,7 +1268,7 @@ int cifs_revalidate(struct dentry *direntry) only ones who could have modified the file and the server copy is staler than ours */ } else { - invalidate_inode = TRUE; + invalidate_inode = true; } } @@ -1402,8 +1402,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) int rc = -EACCES; struct cifsFileInfo *open_file = NULL; FILE_BASIC_INFO time_buf; - int set_time = FALSE; - int set_dosattr = FALSE; + bool set_time = false; + bool set_dosattr = false; __u64 mode = 0xFFFFFFFFFFFFFFFFULL; __u64 uid = 0xFFFFFFFFFFFFFFFFULL; __u64 gid = 0xFFFFFFFFFFFFFFFFULL; @@ -1464,7 +1464,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) __u16 nfid = open_file->netfid; __u32 npid = open_file->pid; rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, - nfid, npid, FALSE); + nfid, npid, false); atomic_dec(&open_file->wrtPending); cFYI(1, ("SetFSize for attrs rc = %d", rc)); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { @@ -1484,14 +1484,14 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) it was found or because there was an error setting it by handle */ rc = CIFSSMBSetEOF(xid, pTcon, full_path, - attrs->ia_size, FALSE, + attrs->ia_size, false, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { __u16 netfid; - int oplock = FALSE; + int oplock = 0; rc = SMBLegacyOpen(xid, pTcon, full_path, FILE_OPEN, @@ -1516,7 +1516,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) /* Server is ok setting allocation size implicitly - no need to call: - CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, TRUE, + CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, true, cifs_sb->local_nls); */ @@ -1564,7 +1564,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) #endif /* not writeable */ if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) { - set_dosattr = TRUE; + set_dosattr = true; time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs | ATTR_READONLY); @@ -1574,7 +1574,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) not be able to write to it - so if any write bit is enabled for user or group or other we need to at least try to remove r/o dos attr */ - set_dosattr = TRUE; + set_dosattr = true; time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs & (~ATTR_READONLY)); /* Windows ignores set to zero */ @@ -1588,14 +1588,14 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) } if (attrs->ia_valid & ATTR_ATIME) { - set_time = TRUE; + set_time = true; time_buf.LastAccessTime = cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); } else time_buf.LastAccessTime = 0; if (attrs->ia_valid & ATTR_MTIME) { - set_time = TRUE; + set_time = true; time_buf.LastWriteTime = cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); } else @@ -1606,7 +1606,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) server times */ if (set_time && (attrs->ia_valid & ATTR_CTIME)) { - set_time = TRUE; + set_time = true; /* Although Samba throws this field away it may be useful to Windows - but we do not want to set ctime unless some other @@ -1630,7 +1630,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) rc = -EOPNOTSUPP; 
if (rc == -EOPNOTSUPP) { - int oplock = FALSE; + int oplock = 0; __u16 netfid; cFYI(1, ("calling SetFileInfo since SetPathInfo for " diff --git a/fs/cifs/link.c b/fs/cifs/link.c index d4e7ec93285..1c2c3ce5020 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -230,7 +230,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen) struct inode *inode = direntry->d_inode; int rc = -EACCES; int xid; - int oplock = FALSE; + int oplock = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; char *full_path = NULL; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2a42d9fedbb..1d69b8014e0 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -496,7 +496,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) } return 0; } -int + +bool is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) { struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; @@ -522,17 +523,17 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pnotify->Action)); /* BB removeme BB */ /* cifs_dump_mem("Rcvd notify Data: ",buf, sizeof(struct smb_hdr)+60); */ - return TRUE; + return true; } if (pSMBr->hdr.Status.CifsError) { cFYI(1, ("notify err 0x%d", pSMBr->hdr.Status.CifsError)); - return TRUE; + return true; } - return FALSE; + return false; } if (pSMB->hdr.Command != SMB_COM_LOCKING_ANDX) - return FALSE; + return false; if (pSMB->hdr.Flags & SMBFLG_RESPONSE) { /* no sense logging error on invalid handle on oplock break - harmless race between close request and oplock @@ -541,21 +542,21 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) if ((NT_STATUS_INVALID_HANDLE) == le32_to_cpu(pSMB->hdr.Status.CifsError)) { cFYI(1, ("invalid handle on oplock break")); - return TRUE; + return true; } else if (ERRbadfid == le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { - return TRUE; + return true; } else { - return FALSE; /* on valid oplock brk we get "request" */ + return false; /* on valid oplock brk we get "request" */ } } if (pSMB->hdr.WordCount != 8) - return FALSE; + return false; cFYI(1, ("oplock type 0x%d level 0x%d", pSMB->LockType, pSMB->OplockLevel)); if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) - return FALSE; + return false; /* look up tcon based on tid & uid */ read_lock(&GlobalSMBSeslock); @@ -573,11 +574,11 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) ("file id match, oplock break")); pCifsInode = CIFS_I(netfile->pInode); - pCifsInode->clientCanCacheAll = FALSE; + pCifsInode->clientCanCacheAll = false; if (pSMB->OplockLevel == 0) pCifsInode->clientCanCacheRead - = FALSE; - pCifsInode->oplockPending = TRUE; + = false; + pCifsInode->oplockPending = true; AllocOplockQEntry(netfile->pInode, netfile->netfid, tcon); @@ -585,17 +586,17 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) ("about to wake up oplock thread")); if (oplockThread) wake_up_process(oplockThread); - return TRUE; + return true; } } read_unlock(&GlobalSMBSeslock); cFYI(1, ("No matching file for oplock break")); - return TRUE; + return true; } } read_unlock(&GlobalSMBSeslock); cFYI(1, ("Can not process oplock break for non-existent connection")); - return TRUE; + return true; } void diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 32b445edc88..34ec32100c7 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -447,8 +447,8 @@ static int initiate_cifs_search(const int xid, struct file *file) if (file->private_data == NULL) return -ENOMEM; cifsFile = file->private_data; - 
cifsFile->invalidHandle = TRUE; - cifsFile->srch_inf.endOfSearch = FALSE; + cifsFile->invalidHandle = true; + cifsFile->srch_inf.endOfSearch = false; cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); if (cifs_sb == NULL) @@ -485,7 +485,7 @@ ffirst_retry: cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb)); if (rc == 0) - cifsFile->invalidHandle = FALSE; + cifsFile->invalidHandle = false; if ((rc == -EOPNOTSUPP) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; @@ -670,7 +670,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cFYI(1, ("search backing up - close and restart search")); - cifsFile->invalidHandle = TRUE; + cifsFile->invalidHandle = true; CIFSFindClose(xid, pTcon, cifsFile->netfid); kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = NULL; @@ -692,7 +692,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, } while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && - (rc == 0) && (cifsFile->srch_inf.endOfSearch == FALSE)) { + (rc == 0) && !cifsFile->srch_inf.endOfSearch) { cFYI(1, ("calling findnext2")); rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, &cifsFile->srch_inf); @@ -1038,7 +1038,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) break; } } /* else { - cifsFile->invalidHandle = TRUE; + cifsFile->invalidHandle = true; CIFSFindClose(xid, pTcon, cifsFile->netfid); } kfree(cifsFile->search_resume_name); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 58bbfd992cc..ff3232fa101 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -35,11 +35,11 @@ #include "cifs_debug.h" #include "cifsencrypt.h" -#ifndef FALSE -#define FALSE 0 +#ifndef false +#define false 0 #endif -#ifndef TRUE -#define TRUE 1 +#ifndef true +#define true 1 #endif /* following came from the other byteorder.h to avoid include conflicts */ diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 8cd6a445b01..e9527eedc63 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -264,7 +264,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, #ifdef CONFIG_CIFS_EXPERIMENTAL else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { __u16 fid; - int oplock = FALSE; + int oplock = 0; struct cifs_ntsd *pacl = NULL; __u32 buflen = 0; if (experimEnabled) -- cgit v1.2.3-70-g09d2 From 9b1ec9eceabe0c90d12116871f692263b69d476d Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 29 Apr 2008 20:15:43 +0000 Subject: [CIFS] Remove duplicate call to mode_to_acl The current logic in cifs_setattr calls mode_to_acl twice on mode changes if cifsacl is enabled. Remove the duplicate call. 
Signed-off-by: Jeff Layton CC: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/inode.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 8eaa9e72fe8..a1f24bdb656 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1581,10 +1581,6 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) if (time_buf.Attributes == 0) time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); } -#ifdef CONFIG_CIFS_EXPERIMENTAL - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) - mode_to_acl(direntry->d_inode, full_path, mode); -#endif } if (attrs->ia_valid & ATTR_ATIME) { -- cgit v1.2.3-70-g09d2 From 5ade9deaaa3e1f7291467d97b238648e43eae15e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 2 May 2008 20:56:23 +0000 Subject: [CIFS] fix typo Signed-off-by: Steve French --- fs/cifs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a1f24bdb656..0d9d2e6d7af 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -389,7 +389,7 @@ int cifs_get_inode_info(struct inode **pinode, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); const unsigned char *full_path = NULL; char *buf = NULL; - bool adjustTZ = bool; + bool adjustTZ = false; bool is_dfs_referral = false; pTcon = cifs_sb->tcon; -- cgit v1.2.3-70-g09d2 From eb28062f131b0a1da32b2554fd819af5221040de Mon Sep 17 00:00:00 2001 From: Bryan Wu Date: Sun, 4 May 2008 23:12:55 +0800 Subject: task_nommu: fix compile failing bug because of spilt file.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CC fs/proc/task_nommu.o fs/proc/task_nommu.c: In function ‘task_mem’: fs/proc/task_nommu.c:55: error: dereferencing pointer to incomplete type make[2]: *** [fs/proc/task_nommu.o] Error 1 make[1]: *** [fs/proc] Error 2 make: *** [fs] Error 2 Signed-off-by: Bryan Wu Signed-off-by: Linus Torvalds --- fs/proc/task_nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b733f10845..4b4f9cc2f18 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -1,6 +1,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From dca3c33652e437ed02c30ed3eca3cecd0cc00838 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Tue, 29 Apr 2008 17:02:20 +0200 Subject: [PATCH] fix reservation discarding in affs - remove affs_put_inode, so preallocations aren't discared unnecessarily often. - remove affs_drop_inode, it's called with a spinlock held, so it can't use a mutex. - make i_opencnt atomic - avoid direct b_count manipulations - a few allocation failure fixes, so that these are more gracefully handled now. Signed-off-by: Roman Zippel Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/affs/affs.h | 4 +--- fs/affs/file.c | 25 +++++++++++++++++-------- fs/affs/inode.c | 34 ++++++++-------------------------- fs/affs/namei.c | 6 ++++-- fs/affs/super.c | 18 +++++++++++------- 5 files changed, 41 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/affs/affs.h b/fs/affs/affs.h index d5bd497ab9c..223b1917093 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -48,7 +48,7 @@ struct affs_ext_key { * affs fs inode data in memory */ struct affs_inode_info { - u32 i_opencnt; + atomic_t i_opencnt; struct semaphore i_link_lock; /* Protects internal inode access. */ struct semaphore i_ext_lock; /* Protects internal inode access. 
*/ #define i_hash_lock i_ext_lock @@ -170,8 +170,6 @@ extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, extern unsigned long affs_parent_ino(struct inode *dir); extern struct inode *affs_new_inode(struct inode *dir); extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); -extern void affs_put_inode(struct inode *inode); -extern void affs_drop_inode(struct inode *inode); extern void affs_delete_inode(struct inode *inode); extern void affs_clear_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, diff --git a/fs/affs/file.c b/fs/affs/file.c index 1a4f092f24e..6eac7bdeec9 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -48,8 +48,9 @@ affs_file_open(struct inode *inode, struct file *filp) { if (atomic_read(&filp->f_count) != 1) return 0; - pr_debug("AFFS: open(%d)\n", AFFS_I(inode)->i_opencnt); - AFFS_I(inode)->i_opencnt++; + pr_debug("AFFS: open(%lu,%d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + atomic_inc(&AFFS_I(inode)->i_opencnt); return 0; } @@ -58,10 +59,16 @@ affs_file_release(struct inode *inode, struct file *filp) { if (atomic_read(&filp->f_count) != 0) return 0; - pr_debug("AFFS: release(%d)\n", AFFS_I(inode)->i_opencnt); - AFFS_I(inode)->i_opencnt--; - if (!AFFS_I(inode)->i_opencnt) + pr_debug("AFFS: release(%lu, %d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + + if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { + mutex_lock(&inode->i_mutex); + if (inode->i_size != AFFS_I(inode)->mmu_private) + affs_truncate(inode); affs_free_prealloc(inode); + mutex_unlock(&inode->i_mutex); + } return 0; } @@ -180,7 +187,7 @@ affs_get_extblock(struct inode *inode, u32 ext) /* inline the simplest case: same extended block as last time */ struct buffer_head *bh = AFFS_I(inode)->i_ext_bh; if (ext == AFFS_I(inode)->i_ext_last) - atomic_inc(&bh->b_count); + get_bh(bh); else /* we have to do more (not inlined) */ bh = affs_get_extblock_slow(inode, ext); @@ -306,7 +313,7 @@ store_ext: affs_brelse(AFFS_I(inode)->i_ext_bh); AFFS_I(inode)->i_ext_last = ext; AFFS_I(inode)->i_ext_bh = bh; - atomic_inc(&bh->b_count); + get_bh(bh); return bh; @@ -324,7 +331,6 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); - BUG_ON(block > (sector_t)0x7fffffffUL); if (block >= AFFS_I(inode)->i_blkcnt) { @@ -827,6 +833,8 @@ affs_truncate(struct inode *inode) res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); if (!res) res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + else + inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); return; } else if (inode->i_size == AFFS_I(inode)->mmu_private) @@ -862,6 +870,7 @@ affs_truncate(struct inode *inode) blk++; } else AFFS_HEAD(ext_bh)->first_data = 0; + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i); size = AFFS_SB(sb)->s_hashsize; if (size > blkcnt - blk + i) size = blkcnt - blk + i; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 27fe6cbe43a..a13b334a391 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -58,7 +58,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; AFFS_I(inode)->i_protect = prot; - AFFS_I(inode)->i_opencnt = 0; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_lc_size = 0; @@ -108,8 +108,6 @@ struct inode 
*affs_iget(struct super_block *sb, unsigned long ino) inode->i_mode |= S_IFDIR; } else inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR; - if (tail->link_chain) - inode->i_nlink = 2; /* Maybe it should be controlled by mount parameter? */ //inode->i_mode |= S_ISVTX; inode->i_op = &affs_dir_inode_operations; @@ -244,32 +242,13 @@ out: return error; } -void -affs_put_inode(struct inode *inode) -{ - pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); - affs_free_prealloc(inode); -} - -void -affs_drop_inode(struct inode *inode) -{ - mutex_lock(&inode->i_mutex); - if (inode->i_size != AFFS_I(inode)->mmu_private) - affs_truncate(inode); - mutex_unlock(&inode->i_mutex); - - generic_drop_inode(inode); -} - void affs_delete_inode(struct inode *inode) { pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); truncate_inode_pages(&inode->i_data, 0); inode->i_size = 0; - if (S_ISREG(inode->i_mode)) - affs_truncate(inode); + affs_truncate(inode); clear_inode(inode); affs_free_block(inode->i_sb, inode->i_ino); } @@ -277,9 +256,12 @@ affs_delete_inode(struct inode *inode) void affs_clear_inode(struct inode *inode) { - unsigned long cache_page = (unsigned long) AFFS_I(inode)->i_lc; + unsigned long cache_page; pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); + + affs_free_prealloc(inode); + cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { pr_debug("AFFS: freeing ext cache\n"); AFFS_I(inode)->i_lc = NULL; @@ -316,7 +298,7 @@ affs_new_inode(struct inode *dir) inode->i_ino = block; inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - AFFS_I(inode)->i_opencnt = 0; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_lc_size = 0; @@ -369,12 +351,12 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 switch (type) { case ST_LINKFILE: case ST_LINKDIR: - inode_bh = bh; retval = -ENOSPC; block = affs_alloc_block(dir, dir->i_ino); if (!block) goto err; retval = -EIO; + inode_bh = bh; bh = affs_getzeroblk(sb, block); if (!bh) goto err; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 2218f1ee71c..cfcf1b6cf82 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -234,7 +234,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: unlink(dir=%d, \"%.*s\")\n", (u32)dir->i_ino, + pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); @@ -302,7 +303,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: rmdir(dir=%u, \"%.*s\")\n", (u32)dir->i_ino, + pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); diff --git a/fs/affs/super.c b/fs/affs/super.c index 01d25d53254..d214837d5e4 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -71,12 +71,18 @@ static struct kmem_cache * affs_inode_cachep; static struct inode *affs_alloc_inode(struct super_block *sb) { - struct affs_inode_info *ei; - ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); - if (!ei) + struct affs_inode_info *i; + + i = kmem_cache_alloc(affs_inode_cachep, 
GFP_KERNEL); + if (!i) return NULL; - ei->vfs_inode.i_version = 1; - return &ei->vfs_inode; + + i->vfs_inode.i_version = 1; + i->i_lc = NULL; + i->i_ext_bh = NULL; + i->i_pa_cnt = 0; + + return &i->vfs_inode; } static void affs_destroy_inode(struct inode *inode) @@ -114,8 +120,6 @@ static const struct super_operations affs_sops = { .alloc_inode = affs_alloc_inode, .destroy_inode = affs_destroy_inode, .write_inode = affs_write_inode, - .put_inode = affs_put_inode, - .drop_inode = affs_drop_inode, .delete_inode = affs_delete_inode, .clear_inode = affs_clear_inode, .put_super = affs_put_super, -- cgit v1.2.3-70-g09d2 From 33dcdac2df54e66c447ae03f58c95c7251aa5649 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Apr 2008 17:46:26 +0200 Subject: [PATCH] kill ->put_inode And with that last patch to affs killing the last put_inode instance we can finally, after many years of transition kill this racy and awkward interface. (It's kinda funny that even the description in Documentation/filesystems/vfs.txt was entirely wrong..) Also remove a very misleading comment above the defintion of struct super_operations. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 -- Documentation/filesystems/vfs.txt | 4 ---- fs/inode.c | 3 --- include/linux/fs.h | 5 ----- 4 files changed, 14 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index c2992bc54f2..8b22d7d8b99 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -92,7 +92,6 @@ prototypes: void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -115,7 +114,6 @@ alloc_inode: no no no destroy_inode: no dirty_inode: no (must not sleep) write_inode: no -put_inode: no drop_inode: no !!!inode_lock!!! delete_inode: no put_super: yes yes no diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 81e5be6e6e3..b7522c6cbae 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -205,7 +205,6 @@ struct super_operations { void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -246,9 +245,6 @@ or bottom half). inode to disc. The second parameter indicates whether the write should be synchronous or not, not all filesystems check this flag. - put_inode: called when the VFS inode is removed from the inode - cache. - drop_inode: called when the last access to the inode is dropped, with the inode_lock spinlock held. 
diff --git a/fs/inode.c b/fs/inode.c index bf647813042..18bdce14b70 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1153,9 +1153,6 @@ void iput(struct inode *inode) BUG_ON(inode->i_state == I_CLEAR); - if (op && op->put_inode) - op->put_inode(inode); - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) iput_final(inode); } diff --git a/include/linux/fs.h b/include/linux/fs.h index a1ba005d08e..7e0fa9e6447 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1289,17 +1289,12 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); -/* - * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called - * without the big kernel lock held in all filesystems. - */ struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); -- cgit v1.2.3-70-g09d2 From 0b2bac2f1ea0d33a3621b27ca68b9ae760fca2e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 May 2008 13:58:34 -0400 Subject: [PATCH] fix SMP ordering hole in fcntl_setlk() fcntl_setlk()/close() race prevention has a subtle hole - we need to make sure that if we *do* have an fcntl/close race on SMP box, the access to descriptor table and inode->i_flock won't get reordered. As it is, we get STORE inode->i_flock, LOAD descriptor table entry vs. STORE descriptor table entry, LOAD inode->i_flock with not a single lock in common on both sides. We do have BKL around the first STORE, but check in locks_remove_posix() is outside of BKL and for a good reason - we don't want BKL on common path of close(2). Solution is to hold ->file_lock around fcheck() in there; that orders us wrt removal from descriptor table that preceded locks_remove_posix() on close path and we either come first (in which case eviction will be handled by the close side) or we'll see the effect of close and do eviction ourselves. Note that even though it's read-only access, we do need ->file_lock here - rcu_read_lock() won't be enough to order the things. Signed-off-by: Al Viro --- fs/locks.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 663c069b59b..0ac6b92cb0b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1753,6 +1753,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct file_lock *file_lock = locks_alloc_lock(); struct flock flock; struct inode *inode; + struct file *f; int error; if (file_lock == NULL) @@ -1825,7 +1826,15 @@ again: * Attempt to detect a close/fcntl race and recover by * releasing the lock that was just acquired. */ - if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + /* + * we need that spin_lock here - it prevents reordering between + * update of inode->i_flock and check for it done in close(). + * rcu_read_lock() wouldn't do. 
+ */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { flock.l_type = F_UNLCK; goto again; } @@ -1881,6 +1890,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct file_lock *file_lock = locks_alloc_lock(); struct flock64 flock; struct inode *inode; + struct file *f; int error; if (file_lock == NULL) @@ -1953,7 +1963,10 @@ again: * Attempt to detect a close/fcntl race and recover by * releasing the lock that was just acquired. */ - if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { flock.l_type = F_UNLCK; goto again; } -- cgit v1.2.3-70-g09d2 From 6ce07c7b61e74af35a05060a2d6341f68fd92c9e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 May 2008 13:13:37 -0700 Subject: VFS: fix unused variable warning Commit 33dcdac2df54e66c447ae03f58c95c7251aa5649 ("kill ->put_inode") removed the final use of i_op->put_inode, but left the now totally unused "op" variable in iput(). Get rid of it. Signed-off-by: Linus Torvalds --- fs/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 18bdce14b70..c36d9480335 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1149,8 +1149,6 @@ static inline void iput_final(struct inode *inode) void iput(struct inode *inode) { if (inode) { - const struct super_operations *op = inode->i_sb->s_op; - BUG_ON(inode->i_state == I_CLEAR); if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) -- cgit v1.2.3-70-g09d2 From dea570e08a69b14808b2cab56d6b0dda72145fa6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 6 May 2008 22:05:51 +0000 Subject: [CIFS] Remove over-indented code in find_unc(). Signed-off-by: Cyrill Gorcunov Signed-off-by: Steve French --- fs/cifs/connect.c | 64 +++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6c4332f8da6..f2259db075c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1362,45 +1362,43 @@ find_unc(__be32 new_target_ip_addr, char *uncName, char *userName) { struct list_head *tmp; struct cifsTconInfo *tcon; + __be32 old_ip; read_lock(&GlobalSMBSeslock); + list_for_each(tmp, &GlobalTreeConnectionList) { cFYI(1, ("Next tcon")); tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); - if (tcon->ses) { - if (tcon->ses->server) { - cFYI(1, - ("old ip addr: %x == new ip %x ?", - tcon->ses->server->addr.sockAddr.sin_addr. - s_addr, new_target_ip_addr)); - if (tcon->ses->server->addr.sockAddr.sin_addr. - s_addr == new_target_ip_addr) { - /* BB lock tcon, server and tcp session and increment use count here? 
*/ - /* found a match on the TCP session */ - /* BB check if reconnection needed */ - cFYI(1, - ("IP match, old UNC: %s new: %s", - tcon->treeName, uncName)); - if (strncmp - (tcon->treeName, uncName, - MAX_TREE_SIZE) == 0) { - cFYI(1, - ("and old usr: %s new: %s", - tcon->treeName, uncName)); - if (strncmp - (tcon->ses->userName, - userName, - MAX_USERNAME_SIZE) == 0) { - read_unlock(&GlobalSMBSeslock); - /* matched smb session - (user name */ - return tcon; - } - } - } - } - } + if (!tcon->ses || !tcon->ses->server) + continue; + + old_ip = tcon->ses->server->addr.sockAddr.sin_addr.s_addr; + cFYI(1, ("old ip addr: %x == new ip %x ?", + old_ip, new_target_ip_addr)); + + if (old_ip != new_target_ip_addr) + continue; + + /* BB lock tcon, server, tcp session and increment use count? */ + /* found a match on the TCP session */ + /* BB check if reconnection needed */ + cFYI(1, ("IP match, old UNC: %s new: %s", + tcon->treeName, uncName)); + + if (strncmp(tcon->treeName, uncName, MAX_TREE_SIZE)) + continue; + + cFYI(1, ("and old usr: %s new: %s", + tcon->treeName, uncName)); + + if (strncmp(tcon->ses->userName, userName, MAX_USERNAME_SIZE)) + continue; + + /* matched smb session (user name) */ + read_unlock(&GlobalSMBSeslock); + return tcon; } + read_unlock(&GlobalSMBSeslock); return NULL; } -- cgit v1.2.3-70-g09d2 From cf432eb50ffd03572c08a006f44e0069957cf300 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 6 May 2008 22:27:16 +0000 Subject: [CIFS] cleanup cifsd completion Was a holdover from the old kernel_thread based cifsd code. We needed to know that the thread had set the task variable before proceeding. Now that kthread_run returns the new task, this doesn't appear to be needed anymore. As best I can tell, this sleep was intended to try to prevent cifs_umount from freeing the cifsSesInfo struct before cifsd had exited. Now that cifsd is using the kthread API, we know that when kthread_stop returns that cifsd has exited, so I don't think this is needed any longer. 
Signed-off-by: Jeff Layton Acked-by: Christop Hellwig Signed-off-by: Steve French --- fs/cifs/connect.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f2259db075c..957998e8477 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -49,8 +49,6 @@ #define CIFS_PORT 445 #define RFC1001_PORT 139 -static DECLARE_COMPLETION(cifsd_complete); - extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); @@ -356,7 +354,6 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) atomic_inc(&tcpSesAllocCount); length = tcpSesAllocCount.counter; write_unlock(&GlobalSMBSeslock); - complete(&cifsd_complete); if (length > 1) mempool_resize(cifs_req_poolp, length + cifs_min_rcv, GFP_KERNEL); @@ -1980,7 +1977,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, kfree(srvTcp->hostname); goto out; } - wait_for_completion(&cifsd_complete); rc = 0; memcpy(srvTcp->workstation_RFC1001_name, volume_info.source_rfc1001_name, 16); @@ -3553,8 +3549,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) cifs_sb->prepathlen = 0; cifs_sb->prepath = NULL; kfree(tmp); - if (ses) - schedule_timeout_interruptible(msecs_to_jiffies(500)); if (ses) sesInfoFree(ses); -- cgit v1.2.3-70-g09d2 From 7f3d4ee108c184ab215036051087aaaaa8de7661 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 7 May 2008 09:22:39 +0200 Subject: vfs: splice remove_suid() cleanup generic_file_splice_write() duplicates remove_suid() just because it doesn't hold i_mutex. But it grabs i_mutex inside splice_from_pipe() anyway, so this is rather pointless. Move locking to generic_file_splice_write() and call remove_suid() and __splice_from_pipe() instead. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 29 +++++++++++++---------------- include/linux/fs.h | 1 - mm/filemap.c | 2 +- 3 files changed, 14 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 633f58ebfb7..cece15b4ef7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -811,24 +811,19 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, { struct address_space *mapping = out->f_mapping; struct inode *inode = mapping->host; - int killsuid, killpriv; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; ssize_t ret; - int err = 0; - - killpriv = security_inode_need_killpriv(out->f_path.dentry); - killsuid = should_remove_suid(out->f_path.dentry); - if (unlikely(killsuid || killpriv)) { - mutex_lock(&inode->i_mutex); - if (killpriv) - err = security_inode_killpriv(out->f_path.dentry); - if (!err && killsuid) - err = __remove_suid(out->f_path.dentry, killsuid); - mutex_unlock(&inode->i_mutex); - if (err) - return err; - } - ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + inode_double_lock(inode, pipe->inode); + ret = remove_suid(out->f_path.dentry); + if (likely(!ret)) + ret = __splice_from_pipe(pipe, &sd, pipe_to_file); + inode_double_unlock(inode, pipe->inode); if (ret > 0) { unsigned long nr_pages; @@ -840,6 +835,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, * sync it. 
*/ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + mutex_lock(&inode->i_mutex); err = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e0fa9e6447..f413085f748 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1816,7 +1816,6 @@ extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); -extern int __remove_suid(struct dentry *, int); extern int should_remove_suid(struct dentry *); extern int remove_suid(struct dentry *); diff --git a/mm/filemap.c b/mm/filemap.c index 239d36163bb..2dead9adf8b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1655,7 +1655,7 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -int __remove_suid(struct dentry *dentry, int kill) +static int __remove_suid(struct dentry *dentry, int kill) { struct iattr newattrs; -- cgit v1.2.3-70-g09d2 From 221e583a735fc5d879d83c2a76b8ee5afcbdf146 Mon Sep 17 00:00:00 2001 From: Rasmus Rohde Date: Wed, 30 Apr 2008 17:22:06 +0200 Subject: udf: Make udf exportable Cc: Christoph Hellwig Signed-off-by: Rasmus Rohde Signed-off-by: Jan Kara --- fs/udf/namei.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++- fs/udf/super.c | 1 + fs/udf/udfdecl.h | 1 + include/linux/exportfs.h | 21 +++++++ 4 files changed, 161 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ba5537d4bc1..47a6589e10b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -32,6 +32,7 @@ #include #include #include +#include static inline int udf_match(int len1, const char *name1, int len2, const char *name2) @@ -158,6 +159,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, sector_t offset; struct extent_position epos = {}; struct udf_inode_info *dinfo = UDF_I(dir); + int isdotdot = dentry->d_name.len == 2 && + dentry->d_name.name[0] == '.' 
&& dentry->d_name.name[1] == '.'; size = udf_ext0_offset(dir) + dir->i_size; f_pos = udf_ext0_offset(dir); @@ -225,6 +228,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, continue; } + if ((cfi->fileCharacteristics & FID_FILE_CHAR_PARENT) && + isdotdot) { + brelse(epos.bh); + return fi; + } + if (!lfi) continue; @@ -286,9 +295,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, } } unlock_kernel(); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static struct fileIdentDesc *udf_add_entry(struct inode *dir, @@ -1232,6 +1240,134 @@ end_rename: return retval; } +static struct dentry *udf_get_parent(struct dentry *child) +{ + struct dentry *parent; + struct inode *inode = NULL; + struct dentry dotdot; + struct fileIdentDesc cfi; + struct udf_fileident_bh fibh; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + + lock_kernel(); + if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) + goto out_unlock; + + if (fibh.sbh != fibh.ebh) + brelse(fibh.ebh); + brelse(fibh.sbh); + + inode = udf_iget(child->d_inode->i_sb, + lelb_to_cpu(cfi.icb.extLocation)); + if (!inode) + goto out_unlock; + unlock_kernel(); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + + return parent; +out_unlock: + unlock_kernel(); + return ERR_PTR(-EACCES); +} + + +static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, + u16 partref, __u32 generation) +{ + struct inode *inode; + struct dentry *result; + kernel_lb_addr loc; + + if (block == 0) + return ERR_PTR(-ESTALE); + + loc.logicalBlockNum = block; + loc.partitionReferenceNum = partref; + inode = udf_iget(sb, loc); + + if (inode == NULL) + return ERR_PTR(-ENOMEM); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + return result; +} + +static struct dentry *udf_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_len != 3 && fh_len != 5) || + (fh_type != FILEID_UDF_WITH_PARENT && + fh_type != FILEID_UDF_WITHOUT_PARENT)) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref, + fid->udf.generation); +} + +static struct dentry *udf_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT) + return NULL; + + return udf_nfs_get_inode(sb, fid->udf.parent_block, + fid->udf.parent_partref, + fid->udf.parent_generation); +} +static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, + int connectable) +{ + int len = *lenp; + struct inode *inode = de->d_inode; + kernel_lb_addr location = UDF_I(inode)->i_location; + struct fid *fid = (struct fid *)fh; + int type = FILEID_UDF_WITHOUT_PARENT; + + if (len < 3 || (connectable && len < 5)) + return 255; + + *lenp = 3; + fid->udf.block = location.logicalBlockNum; + fid->udf.partref = location.partitionReferenceNum; + fid->udf.generation = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + spin_lock(&de->d_lock); + inode = de->d_parent->d_inode; + location = UDF_I(inode)->i_location; + fid->udf.parent_block = location.logicalBlockNum; + fid->udf.parent_partref = location.partitionReferenceNum; + fid->udf.parent_generation = inode->i_generation; + spin_unlock(&de->d_lock); + *lenp = 5; + type = FILEID_UDF_WITH_PARENT; + } + + return type; +} + +const struct 
export_operations udf_export_ops = { + .encode_fh = udf_encode_fh, + .fh_to_dentry = udf_fh_to_dentry, + .fh_to_parent = udf_fh_to_parent, + .get_parent = udf_get_parent, +}; + const struct inode_operations udf_dir_inode_operations = { .lookup = udf_lookup, .create = udf_create, diff --git a/fs/udf/super.c b/fs/udf/super.c index b564fc140fe..260f4b82c79 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1933,6 +1933,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Fill in the rest of the superblock */ sb->s_op = &udf_sb_ops; + sb->s_export_op = &udf_export_ops; sb->dq_op = NULL; sb->s_dirt = 0; sb->s_magic = UDF_SUPER_MAGIC; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index f3f45d02927..8fa9c2d7091 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -73,6 +73,7 @@ struct task_struct; struct buffer_head; struct super_block; +extern const struct export_operations udf_export_ops; extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index de8387b7ceb..f5abd130663 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -33,6 +33,19 @@ enum fid_type { * 32 bit parent directory inode number. */ FILEID_INO32_GEN_PARENT = 2, + + /* + * 32 bit block number, 16 bit partition reference, + * 16 bit unused, 32 bit generation number. + */ + FILEID_UDF_WITHOUT_PARENT = 0x51, + + /* + * 32 bit block number, 16 bit partition reference, + * 16 bit unused, 32 bit generation number, + * 32 bit parent block number, 32 bit parent generation number + */ + FILEID_UDF_WITH_PARENT = 0x52, }; struct fid { @@ -43,6 +56,14 @@ struct fid { u32 parent_ino; u32 parent_gen; } i32; + struct { + u32 block; + u16 partref; + u16 parent_partref; + u32 generation; + u32 parent_block; + u32 parent_generation; + } udf; __u32 raw[0]; }; }; -- cgit v1.2.3-70-g09d2 From 9afadc4b1fd25337003832c9a4668f9bd42cdda9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 May 2008 18:26:17 +0200 Subject: udf: Fix memory corruption when fs mounted with noadinicb option When UDF filesystem is mounted with noadinicb mount option, it happens that we extend an empty directory with a block. A code in udf_add_entry() didn't count with this possibility and used uninitialized data leading to memory and filesystem corruption. Add a check whether file already has some extents before operating on them. Signed-off-by: Jan Kara --- fs/udf/namei.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 47a6589e10b..3d94bc1cfba 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -315,7 +315,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, uint16_t liu; int block; kernel_lb_addr eloc; - uint32_t elen; + uint32_t elen = 0; sector_t offset; struct extent_position epos = {}; struct udf_inode_info *dinfo; @@ -406,7 +406,8 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, } add: - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + /* Is there any extent whose size we need to round up? 
*/ + if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) epos.offset -= sizeof(short_ad); -- cgit v1.2.3-70-g09d2 From eeae1d48c011839d9e1cdc1e8aacf0193c9d8197 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2008 13:26:27 +0200 Subject: block: use unitialized_var() in bio_alloc_bioset() Better than setting idx to some random value and it silences the same bogus gcc warning. Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 799f86deff2..2fa04ff8670 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -158,7 +158,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (likely(nr_iovecs)) { - unsigned long idx = 0; /* shut up gcc */ + unsigned long uninitialized_var(idx); bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) { -- cgit v1.2.3-70-g09d2 From ffee0259c9edcc4c0f06b60df51c461eeecad4c0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 30 Apr 2008 09:08:54 +0200 Subject: docbook: fix bio missing parameter Fix fs/bio.c kernel-doc parameter warning: Warning(linux-2.6.25-git14//fs/bio.c:972): No description found for parameter 'reading' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- fs/bio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 2fa04ff8670..78562574cb5 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -963,6 +963,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) * @data: pointer to buffer to copy * @len: length in bytes * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ * * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. -- cgit v1.2.3-70-g09d2 From 75065ff619e42fe35178eda863cbcddd57776794 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 May 2008 14:06:19 +0200 Subject: Revert "relay: fix splice problem" This reverts commit c3270e577c18b3d0e984c3371493205a4807db9d. --- fs/splice.c | 2 +- kernel/relay.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index cece15b4ef7..78150038b58 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1072,7 +1072,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos = sd.pos; + *ppos += ret; return ret; } diff --git a/kernel/relay.c b/kernel/relay.c index 7de644cdec4..bc24dcdc570 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len && !spliced) { + while (len) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; -- cgit v1.2.3-70-g09d2 From ba719baeabbff5476eeb91c223e6921ba29e1490 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Tue, 6 May 2008 20:42:38 -0700 Subject: sys_pipe(): fix file descriptor leaks Remember to close the files if copy_to_user() failed. Spotted by dm.n9107@gmail.com. 
Signed-off-by: Ulrich Drepper Cc: DM Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/kernel/sys_cris.c | 5 ++++- arch/m32r/kernel/sys_m32r.c | 5 ++++- fs/pipe.c | 6 +++++- 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index 8b9984197ed..d124066e172 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -40,8 +40,11 @@ asmlinkage int sys_pipe(unsigned long __user * fildes) error = do_pipe(fd); unlock_kernel(); if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) + if (copy_to_user(fildes, fd, 2*sizeof(int))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index 6d7a80fdad4..319c79720b8 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -90,8 +90,11 @@ sys_pipe(unsigned long r0, unsigned long r1, unsigned long r2, error = do_pipe(fd); if (!error) { - if (copy_to_user((void __user *)r0, fd, 2*sizeof(int))) + if (copy_to_user((void __user *)r0, fd, 2*sizeof(int))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } diff --git a/fs/pipe.c b/fs/pipe.c index 3499f9ff631..ec228bc9f88 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1086,8 +1087,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes) error = do_pipe(fd); if (!error) { - if (copy_to_user(fildes, fd, sizeof(fd))) + if (copy_to_user(fildes, fd, sizeof(fd))) { + sys_close(fd[0]); + sys_close(fd[1]); error = -EFAULT; + } } return error; } -- cgit v1.2.3-70-g09d2 From 19566ca6dc26600bae4b75701d4dced8d8540f16 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Thu, 8 May 2008 22:36:27 +0800 Subject: fs/proc/task_mmu.c: remove duplicated include files Removed duplicated include files and in fs/proc/task_mmu.c. 
Signed-off-by: Huang Weiyi Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2b8e769f51..88717c0f941 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -5,11 +5,9 @@ #include #include #include -#include #include #include #include -#include #include #include -- cgit v1.2.3-70-g09d2 From 7c5e628f95b440b69332b1ed3eb112648fc8f7ff Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 8 May 2008 20:48:42 +0000 Subject: [CIFS] Fixed build warning in is_ip Signed-off-by: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 +- fs/cifs/connect.c | 3 +++ fs/cifs/netmisc.c | 32 +------------------------------- 3 files changed, 5 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a249a29109a..d481f6c5a2b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -70,7 +70,7 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, enum securityEnum *secType); -extern int cifs_inet_pton(int, char *source, void *dst); +extern int cifs_inet_pton(const int, const char *source, void *dst); extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 957998e8477..791ca5c1a11 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1302,6 +1302,9 @@ cifs_parse_mount_options(char *options, const char *devname, "begin with // or \\\\ \n"); return 1; } + value = strpbrk(vol->UNC+2, "/\\"); + if (value) + *value = '\\'; } else { printk(KERN_WARNING "CIFS: UNC name too long\n"); return 1; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 3b5a5ce882b..00f4cff400b 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -132,47 +132,17 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = { {0, 0} }; - -/* if the mount helper is missing we need to reverse the 1st slash - from '/' to backslash in order to format the UNC properly for - ip address parsing and for tree connect (unless the user - remembered to put the UNC name in properly). 
Fortunately we do - not have to call this twice (we check for IPv4 addresses - first, so it is already converted by the time we - try IPv6 addresses */ -static int canonicalize_unc(char *cp) -{ - int i; - - for (i = 0; i <= 46 /* INET6_ADDRSTRLEN */ ; i++) { - if (cp[i] == 0) - break; - if (cp[i] == '\\') - break; - if (cp[i] == '/') { - cFYI(DBG2, ("change slash to \\ in malformed UNC")); - cp[i] = '\\'; - return 1; - } - } - return 0; -} - /* Convert string containing dotted ip address to binary form */ /* returns 0 if invalid address */ int -cifs_inet_pton(int address_family, char *cp, void *dst) +cifs_inet_pton(const int address_family, const char *cp, void *dst) { int ret = 0; /* calculate length by finding first slash or NULL */ if (address_family == AF_INET) { ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); - if (ret == 0) { - if (canonicalize_unc(cp)) - ret = in4_pton(cp, -1, dst, '\\', NULL); - } } else if (address_family == AF_INET6) { ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); } -- cgit v1.2.3-70-g09d2 From af4b3c355cbd38703471e55d11f42d8640db4118 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 9 May 2008 03:48:05 +0000 Subject: [CIFS] fix build warning Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6fe1bc5bb36..34902cff540 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -589,7 +589,7 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, unlock_file = true; fid = open_file->netfid; } else if (pfid == NULL) { - bool oplock = false; + int oplock = 0; /* open file */ rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, &fid, &oplock, NULL, -- cgit v1.2.3-70-g09d2 From 1b20d672188bf80baef60d515a123f556871a5ce Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 9 May 2008 18:17:21 +0000 Subject: [CIFS] cifs_find_tcp_session cleanup This patch cleans up cifs_find_tcp_session so it become less indented. Also the error of skipping IPv6 matched addresses fixed. Signed-off-by: Cyrill Gorcunov Signed-off-by: Steve French --- fs/cifs/connect.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 791ca5c1a11..6b520aad7af 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1318,42 +1318,43 @@ cifs_parse_mount_options(char *options, const char *devname, static struct cifsSesInfo * cifs_find_tcp_session(struct in_addr *target_ip_addr, - struct in6_addr *target_ip6_addr, - char *userName, struct TCP_Server_Info **psrvTcp) + struct in6_addr *target_ip6_addr, + char *userName, struct TCP_Server_Info **psrvTcp) { struct list_head *tmp; struct cifsSesInfo *ses; + *psrvTcp = NULL; - read_lock(&GlobalSMBSeslock); + read_lock(&GlobalSMBSeslock); list_for_each(tmp, &GlobalSMBSessionList) { ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); - if (ses->server) { - if ((target_ip_addr && - (ses->server->addr.sockAddr.sin_addr.s_addr - == target_ip_addr->s_addr)) || (target_ip6_addr - && memcmp(&ses->server->addr.sockAddr6.sin6_addr, - target_ip6_addr, sizeof(*target_ip6_addr)))) { - /* BB lock server and tcp session and increment - use count here?? 
*/ - - /* found a match on the TCP session */ - *psrvTcp = ses->server; - - /* BB check if reconnection needed */ - if (strncmp - (ses->userName, userName, - MAX_USERNAME_SIZE) == 0){ - read_unlock(&GlobalSMBSeslock); - /* Found exact match on both TCP and - SMB sessions */ - return ses; - } - } + if (!ses->server) + continue; + + if (target_ip_addr && + ses->server->addr.sockAddr.sin_addr.s_addr != target_ip_addr->s_addr) + continue; + else if (target_ip6_addr && + memcmp(&ses->server->addr.sockAddr6.sin6_addr, + target_ip6_addr, sizeof(*target_ip6_addr))) + continue; + /* BB lock server and tcp session and increment use count here?? */ + + /* found a match on the TCP session */ + *psrvTcp = ses->server; + + /* BB check if reconnection needed */ + if (strncmp(ses->userName, userName, MAX_USERNAME_SIZE) == 0) { + read_unlock(&GlobalSMBSeslock); + /* Found exact match on both TCP and + SMB sessions */ + return ses; } /* else tcp and smb sessions need reconnection */ } read_unlock(&GlobalSMBSeslock); + return NULL; } -- cgit v1.2.3-70-g09d2 From 02eadeffda169a45946c79270ec19f45eeafb8e7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 May 2008 21:26:11 +0000 Subject: [CIFS] add local struct inode pointer to cifs_setattr Clean up cifs_setattr a bit by adding a local inode pointer, and changing all of the direntry->d_inode references to it. This also adds a bit of micro-optimization. d_inode shouldn't change over the life of this function, so we only need to dereference it once. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- fs/cifs/inode.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6b520aad7af..96505086759 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1339,7 +1339,7 @@ cifs_find_tcp_session(struct in_addr *target_ip_addr, memcmp(&ses->server->addr.sockAddr6.sin6_addr, target_ip6_addr, sizeof(*target_ip6_addr))) continue; - /* BB lock server and tcp session and increment use count here?? */ + /* BB lock server and tcp session; increment use count here?? */ /* found a match on the TCP session */ *psrvTcp = ses->server; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0d9d2e6d7af..d904a037c83 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1408,18 +1408,19 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) __u64 uid = 0xFFFFFFFFFFFFFFFFULL; __u64 gid = 0xFFFFFFFFFFFFFFFFULL; struct cifsInodeInfo *cifsInode; + struct inode *inode = direntry->d_inode; xid = GetXid(); cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", direntry->d_name.name, attrs->ia_valid)); - cifs_sb = CIFS_SB(direntry->d_inode->i_sb); + cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { /* check if we have permission to change attrs */ - rc = inode_change_ok(direntry->d_inode, attrs); + rc = inode_change_ok(inode, attrs); if (rc < 0) { FreeXid(xid); return rc; @@ -1432,7 +1433,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) FreeXid(xid); return -ENOMEM; } - cifsInode = CIFS_I(direntry->d_inode); + cifsInode = CIFS_I(inode); if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { /* @@ -1443,9 +1444,9 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) will be truncated anyway? Also, should we error out here if the flush returns error? 
*/ - rc = filemap_write_and_wait(direntry->d_inode->i_mapping); + rc = filemap_write_and_wait(inode->i_mapping); if (rc != 0) { - CIFS_I(direntry->d_inode)->write_behind_rc = rc; + cifsInode->write_behind_rc = rc; rc = 0; } } @@ -1521,9 +1522,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) */ if (rc == 0) { - rc = cifs_vmtruncate(direntry->d_inode, attrs->ia_size); - cifs_truncate_page(direntry->d_inode->i_mapping, - direntry->d_inode->i_size); + rc = cifs_vmtruncate(inode, attrs->ia_size); + cifs_truncate_page(inode->i_mapping, inode->i_size); } else goto cifs_setattr_exit; } @@ -1557,7 +1557,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) rc = 0; #ifdef CONFIG_CIFS_EXPERIMENTAL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) - rc = mode_to_acl(direntry->d_inode, full_path, mode); + rc = mode_to_acl(inode, full_path, mode); else if ((mode & S_IWUGO) == 0) { #else if ((mode & S_IWUGO) == 0) { @@ -1665,7 +1665,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) /* do not need local check to inode_check_ok since the server does that */ if (!rc) - rc = inode_setattr(direntry->d_inode, attrs); + rc = inode_setattr(inode, attrs); cifs_setattr_exit: kfree(full_path); FreeXid(xid); -- cgit v1.2.3-70-g09d2 From 67750fb9e07940c078d1edb16fd736ccc92a4a4e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 May 2008 22:28:02 +0000 Subject: [CIFS] when not using unix extensions, check for and set ATTR_READONLY on create and mkdir When creating a directory on a CIFS share without POSIX extensions, and the given mode has no write bits set, set the ATTR_READONLY bit. When creating a file, set ATTR_READONLY if the create mode has no write bits set and we're not using unix extensions. There are some comments about this being problematic due to the VFS splitting creates into 2 parts. I'm not sure what that's actually talking about, but I'm assuming that it has something to do with how mknod is implemented. In the simple case where we have no unix extensions and we're just creating a regular file, there's no reason we can't set ATTR_READONLY. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 1 + fs/cifs/cifssmb.c | 16 ++++++---------- fs/cifs/dir.c | 16 +++++++++++++--- fs/cifs/inode.c | 15 +++++++++++---- 4 files changed, 31 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index a0d26b540d4..c43bf4b7a55 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -340,6 +340,7 @@ #define OPEN_NO_RECALL 0x00400000 #define OPEN_FREE_SPACE_QUERY 0x00800000 /* should be zero */ #define CREATE_OPTIONS_MASK 0x007FFFFF +#define CREATE_OPTION_READONLY 0x10000000 #define CREATE_OPTION_SPECIAL 0x20000000 /* system. 
NB not sent over wire */ /* ImpersonationLevel flags */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index cfd9750852b..95fbba4ea7d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1224,11 +1224,8 @@ OldOpenRetry: else /* BB FIXME BB */ pSMB->FileAttributes = cpu_to_le16(0/*ATTR_NORMAL*/); - /* if ((omode & S_IWUGO) == 0) - pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY);*/ - /* Above line causes problems due to vfs splitting create into two - pieces - need to set mode after file created not while it is - being created */ + if (create_options & CREATE_OPTION_READONLY) + pSMB->FileAttributes |= cpu_to_le16(ATTR_READONLY); /* BB FIXME BB */ /* pSMB->CreateOptions = cpu_to_le32(create_options & @@ -1331,17 +1328,16 @@ openRetry: pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM); else pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL); + /* XP does not handle ATTR_POSIX_SEMANTICS */ /* but it helps speed up case sensitive checks for other servers such as Samba */ if (tcon->ses->capabilities & CAP_UNIX) pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS); - /* if ((omode & S_IWUGO) == 0) - pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY);*/ - /* Above line causes problems due to vfs splitting create into two - pieces - need to set mode after file created not while it is - being created */ + if (create_options & CREATE_OPTION_READONLY) + pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY); + pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL); pSMB->CreateDisposition = cpu_to_le32(openDisposition); pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 6ed775986be..e4e0078a052 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -119,6 +119,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, { int rc = -ENOENT; int xid; + int create_options = CREATE_NOT_DIR; int oplock = 0; int desiredAccess = GENERIC_READ | GENERIC_WRITE; __u16 fileHandle; @@ -176,9 +177,19 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, FreeXid(xid); return -ENOMEM; } + + mode &= ~current->fs->umask; + + /* + * if we're not using unix extensions, see if we need to set + * ATTR_READONLY on the create call + */ + if (!pTcon->unix_ext && (mode & S_IWUGO) == 0) + create_options |= CREATE_OPTION_READONLY; + if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, + desiredAccess, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); else @@ -187,7 +198,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, if (rc == -EIO) { /* old server, retry the open legacy style */ rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, - desiredAccess, CREATE_NOT_DIR, + desiredAccess, create_options, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } @@ -197,7 +208,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, /* If Open reported that we actually created a file then we now have to set the mode if possible */ if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { - mode &= ~current->fs->umask; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, (__u64)current->fsuid, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index d904a037c83..fcbdbb6ad7b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -974,8 +974,8 @@ mkdir_get_info: * 
failed to get it from the server or was set bogus */ if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) direntry->d_inode->i_nlink = 2; + mode &= ~current->fs->umask; if (pTcon->unix_ext) { - mode &= ~current->fs->umask; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, @@ -994,9 +994,16 @@ mkdir_get_info: CIFS_MOUNT_MAP_SPECIAL_CHR); } } else { - /* BB to be implemented via Windows secrty descriptors - eg CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, - -1, -1, local_nls); */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && + (mode & S_IWUGO) == 0) { + FILE_BASIC_INFO pInfo; + memset(&pInfo, 0, sizeof(pInfo)); + pInfo.Attributes = cpu_to_le32(ATTR_READONLY); + CIFSSMBSetTimes(xid, pTcon, full_path, + &pInfo, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + } if (direntry->d_inode) { direntry->d_inode->i_mode = mode; direntry->d_inode->i_mode |= S_IFDIR; -- cgit v1.2.3-70-g09d2 From e691b9d1a096fbaaff9d6d6aef1adc593b786e62 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 11 May 2008 15:53:33 +0000 Subject: [CIFS] don't allow demultiplex thread to exit until kthread_stop is called cifs_demultiplex_thread can exit under several conditions: 1) if it's signaled 2) if there's a problem with session setup 3) if kthread_stop is called on it The first two are problems. If kthread_stop is called on the thread, there is no guarantee that it will still be up. We need to have the thread stay up until kthread_stop is called on it. One option would be to not even try to tear things down until after kthread_stop is called. However, in the case where there is a problem setting up the session, there's no real reason to try continuing the loop. This patch allows the thread to clean up and prepare for exit under all three conditions, but it has the thread go to sleep until kthread_stop is called. This allows us to simplify the shutdown code somewhat since we can be reasonably sure that the thread won't exit after being signaled but before kthread_stop is called. It also removes the places where the thread itself set the tsk variable since it appeared that it could have a potential race where the thread might never be shut down. 
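A minimal sketch of the kthread pattern described above (illustrative only, not part of the patch; my_demux_thread() and do_teardown() are made-up names): the thread performs its own cleanup and then parks until kthread_stop() is called, so the caller of kthread_stop() can rely on the task still existing.

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_demux_thread(void *data)
{
	/* ... main receive loop, exited on error or signal ... */

	do_teardown(data);			/* hypothetical cleanup */

	/* park here; only kthread_stop() ends the thread */
	set_current_state(TASK_UNINTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_UNINTERRUPTIBLE);
	}
	set_current_state(TASK_RUNNING);
	return 0;
}

kthread_stop() wakes the parked task with wake_up_process(), so the schedule() above returns and the loop exits cleanly.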
Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/connect.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 96505086759..f428bf3bf1a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -348,7 +348,6 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) int reconnect; current->flags |= PF_MEMALLOC; - server->tsk = current; /* save process info to wake at shutdown */ cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); write_lock(&GlobalSMBSeslock); atomic_inc(&tcpSesAllocCount); @@ -651,10 +650,20 @@ multi_t2_fnd: spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; - server->tsk = NULL; + spin_unlock(&GlobalMid_Lock); + + /* don't exit until kthread_stop is called */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + /* check if we have blocked requests that need to free */ /* Note that cifs_max_pending is normally 50, but can be set at module install time to as little as two */ + spin_lock(&GlobalMid_Lock); if (atomic_read(&server->inFlight) >= cifs_max_pending) atomic_set(&server->inFlight, cifs_max_pending - 1); /* We do not want to set the max_pending too low or we @@ -2187,15 +2196,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, srvTcp->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); if (srvTcp->tsk) { - struct task_struct *tsk; /* If we could verify that kthread_stop would always wake up processes blocked in tcp in recv_mesg then we could remove the send_sig call */ force_sig(SIGKILL, srvTcp->tsk); - tsk = srvTcp->tsk; - if (tsk) - kthread_stop(tsk); + kthread_stop(srvTcp->tsk); } } /* If find_unc succeeded then rc == 0 so we can not end */ @@ -2211,23 +2217,17 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, if ((temp_rc == -ESHUTDOWN) && (pSesInfo->server) && (pSesInfo->server->tsk)) { - struct task_struct *tsk; force_sig(SIGKILL, pSesInfo->server->tsk); - tsk = pSesInfo->server->tsk; - if (tsk) - kthread_stop(tsk); + kthread_stop(pSesInfo->server->tsk); } } else { cFYI(1, ("No session or bad tcon")); if ((pSesInfo->server) && (pSesInfo->server->tsk)) { - struct task_struct *tsk; force_sig(SIGKILL, pSesInfo->server->tsk); - tsk = pSesInfo->server->tsk; - if (tsk) - kthread_stop(tsk); + kthread_stop(pSesInfo->server->tsk); } } sesInfoFree(pSesInfo); -- cgit v1.2.3-70-g09d2 From c3921ab71507b108d51a0f1ee960f80cd668a93d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 May 2008 16:04:48 -0700 Subject: Add new 'cond_resched_bkl()' helper function It acts exactly like a regular 'cond_resched()', but will not get optimized away when CONFIG_PREEMPT is set. Normal kernel code is already preemptable in the presense of CONFIG_PREEMPT, so cond_resched() is optimized away (see commit 02b67cc3ba36bdba351d6c3a00593f4ec550d9d3 "sched: do not do cond_resched() when CONFIG_PREEMPT"). But when wanting to conditionally reschedule while holding a lock, you need to use "cond_sched_lock(lock)", and the new function is the BKL equivalent of that. Also make fs/locks.c use it. 
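A rough usage sketch for the new helper (not taken from the patch; scan_table() and nr_entries are invented): long-running work done under the BKL can insert explicit reschedule points that are not optimized away when CONFIG_PREEMPT is set, just as cond_resched_lock() does for code holding a spinlock.

#include <linux/smp_lock.h>
#include <linux/sched.h>

static void scan_table(int nr_entries)
{
	int i;

	lock_kernel();
	for (i = 0; i < nr_entries; i++) {
		/* per-entry work carried out under the BKL */
		cond_resched_bkl();	/* kept even with CONFIG_PREEMPT=y */
	}
	unlock_kernel();
}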
Signed-off-by: Linus Torvalds --- fs/locks.c | 2 +- include/linux/sched.h | 6 +++++- kernel/sched.c | 2 -- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 0ac6b92cb0b..11dbf08651b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -773,7 +773,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) - cond_resched(); + cond_resched_bkl(); find_conflict: for_each_lock(inode, before) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 0c35b0343a7..4ab9f32f923 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2037,13 +2037,13 @@ static inline int need_resched(void) * cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_softirq() will enable bhs before scheduling. */ +extern int _cond_resched(void); #ifdef CONFIG_PREEMPT static inline int cond_resched(void) { return 0; } #else -extern int _cond_resched(void); static inline int cond_resched(void) { return _cond_resched(); @@ -2051,6 +2051,10 @@ static inline int cond_resched(void) #endif extern int cond_resched_lock(spinlock_t * lock); extern int cond_resched_softirq(void); +static inline int cond_resched_bkl(void) +{ + return _cond_resched(); +} /* * Does a critical section need to be broken due to another diff --git a/kernel/sched.c b/kernel/sched.c index c51b6565e07..8841a915545 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5525,7 +5525,6 @@ static void __cond_resched(void) } while (need_resched()); } -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && @@ -5536,7 +5535,6 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); -#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, -- cgit v1.2.3-70-g09d2 From 091806edd458486af13ad83c9802f5b8b54d6d19 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 29 Apr 2008 12:35:48 -0500 Subject: [GFS2] filesystem consistency error from do_strip This patch fixes a GFS2 filesystem consistency error reported from function do_strip. The problem was caused by a timing window that allowed two vfs inodes to be created in memory that point to the same file. The problem is fixed by making the vfs's iget_test, iget_set mechanism check and set a new bit in the in-core gfs2_inode structure while the vfs inode spin_lock is held. 
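For context, a simplified sketch of how the test/set pair is handed to the VFS (the real lookup helpers in fs/gfs2/inode.c also carry the extra data needed for the skip_freeing case):

    /* look up, or create and initialize, the inode for this disk address */
    inode = iget5_locked(sb, (unsigned long)no_addr,
                         iget_test,   /* runs under the VFS inode spinlock: check GIF_USER */
                         iget_set,    /* new inode: record i_no_addr and set GIF_USER */
                         &no_addr);
    if (!inode)
            return ERR_PTR(-ENOMEM);

Because both callbacks run while the VFS holds its inode spinlock, checking and setting GIF_USER there closes the window in which two in-core inodes could be created for the same on-disk inode.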
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 2 +- fs/gfs2/incore.h | 1 + fs/gfs2/inode.c | 10 +++++----- fs/gfs2/meta_io.c | 6 ++++-- fs/gfs2/ops_super.c | 16 +++++++++------- 5 files changed, 20 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d31badadef8..07d84d16cda 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -249,7 +249,7 @@ static int inode_go_lock(struct gfs2_holder *gh) struct gfs2_inode *ip = gl->gl_object; int error = 0; - if (!ip) + if (!ip || (gh->gh_flags & GL_SKIP)) return 0; if (test_bit(GIF_INVALID, &ip->i_flags)) { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 9c2c0b90b22..eabe5eac41d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -236,6 +236,7 @@ enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, GIF_SW_PAGED = 3, + GIF_USER = 4, /* user inode, not metadata addr space */ }; struct gfs2_dinode_host { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3a9ef526c30..09453d057e4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -47,8 +47,7 @@ static int iget_test(struct inode *inode, void *opaque) struct gfs2_inode *ip = GFS2_I(inode); u64 *no_addr = opaque; - if (ip->i_no_addr == *no_addr && - inode->i_private != NULL) + if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags)) return 1; return 0; @@ -61,6 +60,7 @@ static int iget_set(struct inode *inode, void *opaque) inode->i_ino = (unsigned long)*no_addr; ip->i_no_addr = *no_addr; + set_bit(GIF_USER, &ip->i_flags); return 0; } @@ -86,7 +86,7 @@ static int iget_skip_test(struct inode *inode, void *opaque) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_skip_data *data = opaque; - if (ip->i_no_addr == data->no_addr && inode->i_private != NULL){ + if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ data->skipped = 1; return 0; @@ -105,6 +105,7 @@ static int iget_skip_set(struct inode *inode, void *opaque) return 1; inode->i_ino = (unsigned long)(data->no_addr); ip->i_no_addr = data->no_addr; + set_bit(GIF_USER, &ip->i_flags); return 0; } @@ -166,7 +167,7 @@ void gfs2_set_iop(struct inode *inode) * Returns: A VFS inode, or an error */ -struct inode *gfs2_inode_lookup(struct super_block *sb, +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, u64 no_addr, u64 no_formal_ino, int skip_freeing) @@ -187,7 +188,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - inode->i_private = ip; ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 85aea27b4a8..78d75f892f8 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -69,13 +69,15 @@ static const struct address_space_operations aspace_aops = { struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) { struct inode *aspace; + struct gfs2_inode *ip; aspace = new_inode(sdp->sd_vfs); if (aspace) { mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); aspace->i_mapping->a_ops = &aspace_aops; aspace->i_size = ~0ULL; - aspace->i_private = NULL; + ip = GFS2_I(aspace); + clear_bit(GIF_USER, &ip->i_flags); insert_inode_hash(aspace); } return aspace; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 2278c68b7e3..0b7cc920eb8 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -52,7 +52,7 @@ static int gfs2_write_inode(struct inode *inode, int sync) struct gfs2_inode *ip = GFS2_I(inode); /* Check this is a "normal" inode */ - if (inode->i_private) { + if (test_bit(GIF_USER, &ip->i_flags)) { if (current->flags & PF_MEMALLOC) return 0; if (sync) @@ -297,8 +297,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) */ static void gfs2_drop_inode(struct inode *inode) { - if (inode->i_private && inode->i_nlink) { - struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inode *ip = GFS2_I(inode); + + if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); @@ -314,12 +315,13 @@ static void gfs2_drop_inode(struct inode *inode) static void gfs2_clear_inode(struct inode *inode) { + struct gfs2_inode *ip = GFS2_I(inode); + /* This tells us its a "real" inode and not one which only * serves to contain an address space (see rgrp.c, meta_io.c) * which therefore doesn't have its own glocks. */ - if (inode->i_private) { - struct gfs2_inode *ip = GFS2_I(inode); + if (test_bit(GIF_USER, &ip->i_flags)) { ip->i_gl->gl_object = NULL; gfs2_glock_schedule_for_reclaim(ip->i_gl); gfs2_glock_put(ip->i_gl); @@ -419,7 +421,7 @@ static void gfs2_delete_inode(struct inode *inode) struct gfs2_holder gh; int error; - if (!inode->i_private) + if (!test_bit(GIF_USER, &ip->i_flags)) goto out; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); -- cgit v1.2.3-70-g09d2 From ad99f77778e83358c371dab7a50bde69270ed6b8 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Thu, 1 May 2008 11:55:38 +0100 Subject: [GFS2] Fix cast from unsigned int to s64 This fixes bz 444829 where allocating a new block caused gfs2 file systems to report 0 bytes used in df. It was caused by a broken cast from an unsigned int in gfs2_block_alloc() to a negative s64 in gfs2_statfs_change(). This patch casts the unsigned int to an s64 before the unary minus is applied. 
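The arithmetic at issue, as a standalone illustration rather than code from the patch (assuming the usual 32-bit unsigned int and the kernel's s64 signed 64-bit typedef):

    unsigned int n = 1;
    s64 wrong = -n;        /* -n is evaluated as unsigned int: 0xffffffff, i.e. 4294967295 */
    s64 right = -(s64)n;   /* widen to s64 first, then negate: -1 as intended */

Without the cast, gfs2_statfs_change() saw a huge positive delta instead of a small negative one, which matches the df symptom described above.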
Signed-off-by: Andrew Price Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7e8f0b1d6c6..6387523a315 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1495,7 +1495,7 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) al->al_alloced += *n; - gfs2_statfs_change(sdp, 0, -*n, 0); + gfs2_statfs_change(sdp, 0, -(s64)*n, 0); gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); spin_lock(&sdp->sd_rindex_spin); -- cgit v1.2.3-70-g09d2 From 00377d8e3842776d1da633ad9c79a16ecb548b92 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 9 May 2008 17:59:51 +0200 Subject: [GFS2] Prefer strlcpy() over snprintf() strlcpy is faster than snprintf when you don't use the returned value. Signed-off-by: Jean Delvare Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_fstype.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ef9c6c4f80f..b2028c82e8d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -142,8 +142,8 @@ static int init_names(struct gfs2_sbd *sdp, int silent) if (!table[0]) table = sdp->sd_vfs->s_id; - snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); - snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); + strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); + strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); table = sdp->sd_table_name; while ((table = strchr(table, '/'))) -- cgit v1.2.3-70-g09d2 From 88f85a55c0645c2b7e03bf34d2a90eddf6de34fa Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 12 May 2008 16:42:43 -0500 Subject: JFS: 0 is not valid errno value so return NULL from jfs_lookup Signed-off-by: Marcin Slusarz Signed-off-by: Dave Kleikamp Cc: jfs-discussion@lists.sourceforge.net Cc: Alexander Viro --- fs/jfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 0ba6778edaa..2aba8238681 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc free_UCSname(&key); if (rc == -ENOENT) { d_add(dentry, NULL); - return ERR_PTR(0); + return NULL; } else if (rc) { jfs_err("jfs_lookup: dtSearch returned %d", rc); return ERR_PTR(rc); -- cgit v1.2.3-70-g09d2 From d0a9c078db4769f7305ff9774558776d12bfb25b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 May 2008 22:23:49 +0000 Subject: [CIFS] CIFS currently allows for permissions to be changed on files, even when unix extensions and cifsacl support are disabled. These permissions changes are "ephemeral" however. They are lost whenever a share is mounted and unmounted, or when memory pressure forces the inode out of the cache. Because of this, we'd like to introduce a behavior change to make CIFS behave more like local DOS/Windows filesystems. When unix extensions and cifsacl support aren't enabled, then don't silently ignore changes to permission bits that can't be reflected on the server. Still, there may be people relying on the current behavior for certain applications. This patch adds a new "dynperm" (and a corresponding "nodynperm") mount option that will be intended to make the client fall back to legacy behavior when setting these modes. 
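As a usage illustration (server, share and user names here are placeholders, not taken from the patch), the option would be selected at mount time in the usual way, e.g. "mount -t cifs //server/share /mnt -o user=someone,dynperm" to keep the current ephemeral, client-side mode changes, or "nodynperm" to request the stricter behavior described above. This patch itself only adds the option parsing and the CIFS_MOUNT_DYNPERM flag; the behavior change that consults the flag is left to follow-on patches.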
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_fs_sb.h | 1 + fs/cifs/connect.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 8ad2330ba06..877c85409f1 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -30,6 +30,7 @@ #define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */ #define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */ #define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */ +#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ struct cifs_sb_info { struct cifsTconInfo *tcon; /* primary mount */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f428bf3bf1a..8e2fa6d46c7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -75,6 +75,7 @@ struct smb_vol { bool setuids:1; bool override_uid:1; bool override_gid:1; + bool dynperm:1; bool noperm:1; bool no_psx_acl:1; /* set if posix acl support should be disabled */ bool cifs_acl:1; @@ -1246,6 +1247,10 @@ cifs_parse_mount_options(char *options, const char *devname, vol->setuids = 1; } else if (strnicmp(data, "nosetuids", 9) == 0) { vol->setuids = 0; + } else if (strnicmp(data, "dynperm", 7) == 0) { + vol->dynperm = true; + } else if (strnicmp(data, "nodynperm", 9) == 0) { + vol->dynperm = false; } else if (strnicmp(data, "nohard", 6) == 0) { vol->retry = 0; } else if (strnicmp(data, "nosoft", 6) == 0) { @@ -2125,6 +2130,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; if (volume_info.override_gid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; + if (volume_info.dynperm) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; if (volume_info.direct_io) { cFYI(1, ("mounting share using direct i/o")); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; -- cgit v1.2.3-70-g09d2 From 6353450a2deefaa79cdb4fd2b72830c7db610256 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 May 2008 19:56:05 -0700 Subject: fix memory leak in CIFSFindNext When CIFSFindNext gets back an -EBADF from a call, it sets the return code of the function to 0 and eventually exits. Doing this makes the cleanup at the end of the function skip freeing the SMB buffer, so we need to make sure we free the buffer explicitly when doing this. If we don't you end up with errors like this when unplugging the cifs kernel module: slab error in kmem_cache_destroy(): cache `cifs_request': Can't free all objects [] kmem_cache_destroy+0x61/0xf3 [] cifs_destroy_request_bufs+0x14/0x28 [cifs] [] exit_cifs+0x1e/0x80 [cifs] [] sys_delete_module+0x192/0x1b8 [] audit_syscall_entry+0x14b/0x17d [] syscall_call+0x7/0xb ======================= Signed-off-by: Jeff Layton --- fs/cifs/CHANGES | 3 ++- fs/cifs/cifssmb.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 8355e918fdd..502a4c2b841 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -12,7 +12,8 @@ Add ability to modify cifs acls for handling chmod (when mounted with cifsacl flag). Fix prefixpath path separator so we can handle mounts with prefixpaths longer than one directory (one path component) when mounted to Windows servers. Fix slow file open when cifsacl -enabled. +enabled. Fix memory leak in FindNext when the SMB call returns -EBADF. 
+ Version 1.51 ------------ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 95fbba4ea7d..641cc8ffc51 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3679,6 +3679,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, if (rc) { if (rc == -EBADF) { psrch_inf->endOfSearch = true; + cifs_buf_release(pSMB); rc = 0; /* search probably was closed at end of search*/ } else cFYI(1, ("FindNext returned = %d", rc)); -- cgit v1.2.3-70-g09d2 From ed5f037005d728de19a0f63678ac35b42064966d Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Tue, 13 May 2008 04:01:01 +0000 Subject: [CIFS] CIFSSMBPosixLock should return -EINVAL on error all other codepaths in this function return negative values on errors Signed-off-by: Marcin Slusarz Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 641cc8ffc51..1cbe61524ef 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1767,7 +1767,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, cFYI(1, ("Posix Lock")); if (pLockData == NULL) - return EINVAL; + return -EINVAL; rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); -- cgit v1.2.3-70-g09d2 From 582d21e5e319d38c0485d8b9e92f6f2341f7c79b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 13 May 2008 04:54:12 +0000 Subject: [CIFS] cleanup old checkpatch warnings Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 6 +++--- fs/cifs/cifssmb.c | 51 +++++++++++++++++++++++++++++++++------------------ fs/cifs/connect.c | 7 ++++--- fs/cifs/netmisc.c | 6 +++--- fs/cifs/ntlmssp.h | 4 ++-- 5 files changed, 45 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index d481f6c5a2b..08248e85b78 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -93,7 +93,7 @@ extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, - FILE_ALL_INFO * pfile_info, + FILE_ALL_INFO *pfile_info, struct super_block *sb, int xid, const __u16 *pfid); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, @@ -130,7 +130,7 @@ extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, - FILE_ALL_INFO * findData, + FILE_ALL_INFO *findData, int legacy /* whether to use old info level */, const struct nls_table *nls_codepage, int remap); extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon, @@ -141,7 +141,7 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon, extern int CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, - FILE_UNIX_BASIC_INFO * pFindData, + FILE_UNIX_BASIC_INFO *pFindData, const struct nls_table *nls_codepage, int remap); extern int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1cbe61524ef..3c05c2de50e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1247,7 +1247,7 @@ OldOpenRetry: } else { /* BB verify if wct == 15 */ -/* *pOplock = pSMBr->OplockLevel; */ /* BB take from action field BB */ +/* *pOplock = pSMBr->OplockLevel; */ /* BB take from action field*/ *netfid = pSMBr->Fid; /* cifs fid stays in le */ /* Let caller know file was created so we can set the mode. 
*/ @@ -1944,7 +1944,7 @@ renameRetry: /* protocol requires ASCII signature byte on Unicode string */ pSMB->OldFileName[name_len + 1] = 0x00; name_len2 = - cifsConvertToUCS((__le16 *) &pSMB->OldFileName[name_len + 2], + cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], toName, PATH_MAX, nls_codepage, remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ @@ -2925,7 +2925,8 @@ setAclRetry: } params = 6 + name_len; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB size from sess */ + /* BB find max SMB size from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -3322,7 +3323,8 @@ QPathInfoRetry: params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -3388,7 +3390,7 @@ QPathInfoRetry: int CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, - FILE_UNIX_BASIC_INFO * pFindData, + FILE_UNIX_BASIC_INFO *pFindData, const struct nls_table *nls_codepage, int remap) { /* SMB_QUERY_FILE_UNIX_BASIC */ @@ -3922,7 +3924,8 @@ getDFSRetry: pSMB->DataCount = 0; pSMB->DataOffset = 0; pSMB->MaxParameterCount = 0; - pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4230,7 +4233,8 @@ QFSAttributeRetry: params = 2; /* level */ pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4299,7 +4303,8 @@ QFSDeviceRetry: params = 2; /* level */ pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4370,7 +4375,8 @@ QFSUnixRetry: pSMB->DataCount = 0; pSMB->DataOffset = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(100); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4445,7 +4451,8 @@ SETFSUnixRetry: offset = param_offset + params; pSMB->MaxParameterCount = cpu_to_le16(4); - pSMB->MaxDataCount = cpu_to_le16(100); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); pSMB->SetupCount = 1; pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FS_INFORMATION); @@ -4513,7 +4520,8 @@ QFSPosixRetry: pSMB->DataCount = 0; pSMB->DataOffset = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = 
cpu_to_le16(100); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(100); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4703,7 +4711,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, count = sizeof(struct file_end_of_file_info); pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->SetupCount = 1; pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); @@ -4790,7 +4799,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, count = sizeof(FILE_BASIC_INFO); pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */ + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->SetupCount = 1; pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); @@ -4857,7 +4867,8 @@ SetTimesRetry: params = 6 + name_len; count = sizeof(FILE_BASIC_INFO); pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -4987,7 +4998,8 @@ setPermsRetry: params = 6 + name_len; count = sizeof(FILE_UNIX_BASIC_INFO); pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -5170,7 +5182,8 @@ QAllEAsRetry: params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -5318,7 +5331,8 @@ QEARetry: params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */ + /* BB find exact max SMB PDU from sess structure BB */ + pSMB->MaxDataCount = cpu_to_le16(4000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; @@ -5476,7 +5490,8 @@ SetEARetry: count = sizeof(*parm_data) + ea_value_len + name_len; pSMB->MaxParameterCount = cpu_to_le16(2); - pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB size from sess */ + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8e2fa6d46c7..7c2e5ea0330 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1820,7 +1820,7 @@ convert_delimiter(char *path, char delim) if (path == NULL) return; - if (delim == '/') + if (delim == '/') old_delim = '\\'; else old_delim = '/'; @@ -2321,9 +2321,10 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, user = ses->userName; domain = ses->domainName; 
smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { + + if (smb_buffer == NULL) return -ENOMEM; - } + smb_buffer_response = smb_buffer; pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 00f4cff400b..8703d68f5b2 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -141,11 +141,11 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst) int ret = 0; /* calculate length by finding first slash or NULL */ - if (address_family == AF_INET) { + if (address_family == AF_INET) ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); - } else if (address_family == AF_INET6) { + else if (address_family == AF_INET6) ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); - } + cFYI(DBG2, ("address conversion returned %d for %s", ret, cp)); if (ret > 0) ret = 1; diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 7170a9b70f1..c377d8065d9 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -64,7 +64,7 @@ typedef struct _SECURITY_BUFFER { } __attribute__((packed)) SECURITY_BUFFER; typedef struct _NEGOTIATE_MESSAGE { - __u8 Signature[sizeof (NTLMSSP_SIGNATURE)]; + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; __le32 MessageType; /* 1 */ __le32 NegotiateFlags; SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ @@ -74,7 +74,7 @@ typedef struct _NEGOTIATE_MESSAGE { } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; typedef struct _CHALLENGE_MESSAGE { - __u8 Signature[sizeof (NTLMSSP_SIGNATURE)]; + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; __le32 MessageType; /* 2 */ SECURITY_BUFFER TargetName; __le32 NegotiateFlags; -- cgit v1.2.3-70-g09d2 From b2e03ca7485cac033a0667d9e45e28d32fdee9a5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 13 May 2008 08:22:10 -0500 Subject: JFS: switch to seq_files Signed-off-by: Alexey Dobriyan Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_debug.c | 62 +++++++++++++++++++++------------------------- fs/jfs/jfs_debug.h | 10 ++++---- fs/jfs/jfs_logmgr.c | 35 ++++++++++++-------------- fs/jfs/jfs_metapage.c | 36 +++++++++++++-------------- fs/jfs/jfs_txnmgr.c | 68 +++++++++++++++++++++++---------------------------- fs/jfs/jfs_xtree.c | 36 +++++++++++++-------------- 6 files changed, 114 insertions(+), 133 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index bf6ab19b86e..6a73de84bce 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "jfs_incore.h" #include "jfs_filsys.h" @@ -30,29 +31,19 @@ static struct proc_dir_entry *base; #ifdef CONFIG_JFS_DEBUG -static int loglevel_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int jfs_loglevel_proc_show(struct seq_file *m, void *v) { - int len; - - len = sprintf(page, "%d\n", jfsloglevel); - - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; + seq_printf(m, "%d\n", jfsloglevel); + return 0; +} - return len; +static int jfs_loglevel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_loglevel_proc_show, NULL); } -static int loglevel_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t jfs_loglevel_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; @@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer, jfsloglevel = c - '0'; return count; } + +static const 
struct file_operations jfs_loglevel_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_loglevel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = jfs_loglevel_proc_write, +}; #endif static struct { const char *name; - read_proc_t *read_fn; - write_proc_t *write_fn; + const struct file_operations *proc_fops; } Entries[] = { #ifdef CONFIG_JFS_STATISTICS - { "lmstats", jfs_lmstats_read, }, - { "txstats", jfs_txstats_read, }, - { "xtstat", jfs_xtstat_read, }, - { "mpstat", jfs_mpstat_read, }, + { "lmstats", &jfs_lmstats_proc_fops, }, + { "txstats", &jfs_txstats_proc_fops, }, + { "xtstat", &jfs_xtstat_proc_fops, }, + { "mpstat", &jfs_mpstat_proc_fops, }, #endif #ifdef CONFIG_JFS_DEBUG - { "TxAnchor", jfs_txanchor_read, }, - { "loglevel", loglevel_read, loglevel_write } + { "TxAnchor", &jfs_txanchor_proc_fops, }, + { "loglevel", &jfs_loglevel_proc_fops } #endif }; #define NPROCENT ARRAY_SIZE(Entries) @@ -93,13 +92,8 @@ void jfs_proc_init(void) return; base->owner = THIS_MODULE; - for (i = 0; i < NPROCENT; i++) { - struct proc_dir_entry *p; - if ((p = create_proc_entry(Entries[i].name, 0, base))) { - p->read_proc = Entries[i].read_fn; - p->write_proc = Entries[i].write_fn; - } - } + for (i = 0; i < NPROCENT; i++) + proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); } void jfs_proc_clean(void) diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index 044c1e654cc..eafd1300a00 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -62,7 +62,7 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; -extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); +extern const struct file_operations jfs_txanchor_proc_fops; /* information message: e.g., configuration, major event */ #define jfs_info(fmt, arg...) 
do { \ @@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); * ---------- */ #ifdef CONFIG_JFS_STATISTICS -extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *); -extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *); -extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *); -extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *); +extern const struct file_operations jfs_lmstats_proc_fops; +extern const struct file_operations jfs_txstats_proc_fops; +extern const struct file_operations jfs_mpstat_proc_fops; +extern const struct file_operations jfs_xtstat_proc_fops; #define INCREMENT(x) ((x)++) #define DECREMENT(x) ((x)--) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 325a9679b95..cd2ec2988b5 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -69,6 +69,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" @@ -2503,13 +2504,9 @@ exit: } #ifdef CONFIG_JFS_STATISTICS -int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_lmstats_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS Logmgr stats\n" "================\n" "commits = %d\n" @@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, lmStat.pagedone, lmStat.full_page, lmStat.partial_page); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_lmstats_proc_show, NULL); } + +const struct file_operations jfs_lmstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_lmstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index d1e64f2f2fc..854ff0ec574 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -19,10 +19,12 @@ #include #include +#include #include #include #include #include +#include #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" @@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) } #ifdef CONFIG_JFS_STATISTICS -int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_mpstat_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS Metapage statistics\n" "=======================\n" "page allocations = %d\n" @@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, mpStat.pagealloc, mpStat.pagefree, mpStat.lockwait); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_mpstat_proc_show, NULL); } + +const struct file_operations jfs_mpstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_mpstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 
e7c60ae6b5b..f26e4d03ada 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -3009,11 +3010,8 @@ int jfs_sync(void *arg) } #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) -int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_txanchor_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; char *freewait; char *freelockwait; char *lowlockwait; @@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, lowlockwait = waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; - len += sprintf(buffer, + seq_printf(m, "JFS TxAnchor\n" "============\n" "freetid = %d\n" @@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, TxAnchor.tlocksInUse, jfs_tlocks_low, list_empty(&TxAnchor.unlock_queue) ? "" : "not "); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txanchor_proc_show, NULL); } + +const struct file_operations jfs_txanchor_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txanchor_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) -int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_txstats_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS TxStats\n" "===========\n" "calls to txBegin = %d\n" @@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, TxStat.txBeginAnon_lockslow, TxStat.txLockAlloc, TxStat.txLockAlloc_freelock); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else - *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_txstats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_txstats_proc_show, NULL); } + +const struct file_operations jfs_txstats_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_txstats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5a61ebf2cbc..ae3acafb447 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -20,7 +20,9 @@ */ #include +#include #include +#include #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" @@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } #ifdef CONFIG_JFS_STATISTICS -int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, - int *eof, void *data) +static int jfs_xtstat_proc_show(struct seq_file *m, void *v) { - int len = 0; - off_t begin; - - len += sprintf(buffer, + seq_printf(m, "JFS Xtree statistics\n" "====================\n" "searches = %d\n" @@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, xtStat.search, xtStat.fastSearch, xtStat.split); + return 0; +} - begin = offset; - *start = buffer + begin; - len -= begin; - - if (len > length) - len = length; - else 
- *eof = 1; - - if (len < 0) - len = 0; - - return len; +static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_xtstat_proc_show, NULL); } + +const struct file_operations jfs_xtstat_proc_fops = { + .owner = THIS_MODULE, + .open = jfs_xtstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif -- cgit v1.2.3-70-g09d2 From a0612b1f0b3d851458dafe5886e33d58c1967440 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 12 May 2008 14:01:49 -0700 Subject: uml: hppfs fixes hppfs tidying and fixes noticed during hch's get_inode work - style fixes a copy_to_user got its return value checked hppfs_write no longer fiddles file->f_pos because it gets and returns pos in its arguments hppfs_delete_inode dputs the underlyng procfs dentry stored in its private data and mntputs the vfsmnt stashed in s_fs_info hppfs_put_super no longer needs to mntput the s_fs_info, so it no longer needs to exist hppfs_readlink and hppfs_follow_link were doing a bunch of stuff with a struct file which they didn't use there is now a ->permission which calls generic_permission get_inode was always returning 0 for some reason - it now returns an inode if nothing bad happened Signed-off-by: Jeff Dike Cc: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hppfs/hppfs_kern.c | 82 +++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 8601d8ef3b5..65077aa90f0 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -33,7 +33,7 @@ struct hppfs_private { }; struct hppfs_inode_info { - struct dentry *proc_dentry; + struct dentry *proc_dentry; struct inode vfs_inode; }; @@ -52,7 +52,7 @@ static int is_pid(struct dentry *dentry) int i; sb = dentry->d_sb; - if ((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root)) + if (dentry->d_parent != sb->s_root) return 0; for (i = 0; i < dentry->d_name.len; i++) { @@ -136,7 +136,7 @@ static int file_removed(struct dentry *dentry, const char *file) } static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, - struct nameidata *nd) + struct nameidata *nd) { struct dentry *proc_dentry, *new, *parent; struct inode *inode; @@ -254,6 +254,8 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, int err; if (hppfs->contents != NULL) { + int rem; + if (*ppos >= hppfs->len) return 0; @@ -267,8 +269,10 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, if (off + count > hppfs->len) count = hppfs->len - off; - copy_to_user(buf, &data->contents[off], count); - *ppos += count; + rem = copy_to_user(buf, &data->contents[off], count); + *ppos += count - rem; + if (rem > 0) + return -EFAULT; } else if (hppfs->host_fd != -1) { err = os_seek_file(hppfs->host_fd, *ppos); if (err) { @@ -285,21 +289,15 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, return count; } -static ssize_t hppfs_write(struct file *file, const char __user *buf, size_t len, - loff_t *ppos) +static ssize_t hppfs_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) { struct hppfs_private *data = file->private_data; struct file *proc_file = data->proc_file; ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - int err; write = proc_file->f_path.dentry->d_inode->i_fop->write; - - proc_file->f_pos = file->f_pos; - err = (*write)(proc_file, buf, 
len, &proc_file->f_pos); - file->f_pos = proc_file->f_pos; - - return err; + return (*write)(proc_file, buf, len, ppos); } static int open_host_sock(char *host_file, int *filter_out) @@ -357,7 +355,7 @@ static struct hppfs_data *hppfs_get_data(int fd, int filter, if (filter) { while ((n = read_proc(proc_file, data->contents, - sizeof(data->contents), NULL, 0)) > 0) + sizeof(data->contents), NULL, 0)) > 0) os_write_file(fd, data->contents, n); err = os_shutdown_socket(fd, 0, 1); if (err) { @@ -429,8 +427,8 @@ static int file_mode(int fmode) static int hppfs_open(struct inode *inode, struct file *file) { struct hppfs_private *data; - struct dentry *proc_dentry; struct vfsmount *proc_mnt; + struct dentry *proc_dentry; char *host_file; int err, fd, type, filter; @@ -492,8 +490,8 @@ static int hppfs_open(struct inode *inode, struct file *file) static int hppfs_dir_open(struct inode *inode, struct file *file) { struct hppfs_private *data; - struct dentry *proc_dentry; struct vfsmount *proc_mnt; + struct dentry *proc_dentry; int err; err = -ENOMEM; @@ -620,6 +618,9 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb) void hppfs_delete_inode(struct inode *ino) { + dput(HPPFS_I(ino)->proc_dentry); + mntput(ino->i_sb->s_fs_info); + clear_inode(ino); } @@ -628,69 +629,46 @@ static void hppfs_destroy_inode(struct inode *inode) kfree(HPPFS_I(inode)); } -static void hppfs_put_super(struct super_block *sb) -{ - mntput(sb->s_fs_info); -} - static const struct super_operations hppfs_sbops = { .alloc_inode = hppfs_alloc_inode, .destroy_inode = hppfs_destroy_inode, .delete_inode = hppfs_delete_inode, .statfs = hppfs_statfs, - .put_super = hppfs_put_super, }; static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct file *proc_file; struct dentry *proc_dentry; - struct vfsmount *proc_mnt; - int ret; proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - proc_mnt = dentry->d_sb->s_fs_info; - - proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), O_RDONLY); - if (IS_ERR(proc_file)) - return PTR_ERR(proc_file); - - ret = proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, buflen); - - fput(proc_file); - - return ret; + return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, + buflen); } -static void* hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct file *proc_file; struct dentry *proc_dentry; - struct vfsmount *proc_mnt; - void *ret; proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - proc_mnt = dentry->d_sb->s_fs_info; - - proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), O_RDONLY); - if (IS_ERR(proc_file)) - return proc_file; - - ret = proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); - fput(proc_file); + return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); +} - return ret; +int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + return generic_permission(inode, mask, NULL); } static const struct inode_operations hppfs_dir_iops = { .lookup = hppfs_lookup, + .permission = hppfs_permission, }; static const struct inode_operations hppfs_link_iops = { .readlink = hppfs_readlink, .follow_link = hppfs_follow_link, + .permission = hppfs_permission, }; static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) @@ -712,7 +690,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_fop = &hppfs_file_fops; } - HPPFS_I(inode)->proc_dentry = 
dentry; + HPPFS_I(inode)->proc_dentry = dget(dentry); inode->i_uid = proc_ino->i_uid; inode->i_gid = proc_ino->i_gid; @@ -725,7 +703,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) inode->i_size = proc_ino->i_size; inode->i_blocks = proc_ino->i_blocks; - return 0; + return inode; } static int hppfs_fill_super(struct super_block *sb, void *d, int silent) -- cgit v1.2.3-70-g09d2 From 46d7b522ebf486edbd096965d534cc6465e9e309 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 12 May 2008 14:01:50 -0700 Subject: uml: move hppfs_kern.c to hppfs.c There's no reason for the _kern in hppfs_kern.c, so move it to hppfs.c. Signed-off-by: Jeff Dike Cc: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hppfs/Makefile | 6 +- fs/hppfs/hppfs.c | 771 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/hppfs/hppfs_kern.c | 771 -------------------------------------------------- 3 files changed, 774 insertions(+), 774 deletions(-) create mode 100644 fs/hppfs/hppfs.c delete mode 100644 fs/hppfs/hppfs_kern.c (limited to 'fs') diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile index 6890433f759..8a1f5034436 100644 --- a/fs/hppfs/Makefile +++ b/fs/hppfs/Makefile @@ -1,9 +1,9 @@ # -# Copyright (C) 2002, 2003 Jeff Dike (jdike@karaya.com) +# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) # Licensed under the GPL # -hppfs-objs := hppfs_kern.o +hppfs-objs := hppfs.o obj-y = -obj-$(CONFIG_HPPFS) += hppfs.o +obj-$(CONFIG_HPPFS) += $(hppfs-objs) diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c new file mode 100644 index 00000000000..65077aa90f0 --- /dev/null +++ b/fs/hppfs/hppfs.c @@ -0,0 +1,771 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "os.h" + +static struct inode *get_inode(struct super_block *, struct dentry *); + +struct hppfs_data { + struct list_head list; + char contents[PAGE_SIZE - sizeof(struct list_head)]; +}; + +struct hppfs_private { + struct file *proc_file; + int host_fd; + loff_t len; + struct hppfs_data *contents; +}; + +struct hppfs_inode_info { + struct dentry *proc_dentry; + struct inode vfs_inode; +}; + +static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) +{ + return container_of(inode, struct hppfs_inode_info, vfs_inode); +} + +#define HPPFS_SUPER_MAGIC 0xb00000ee + +static const struct super_operations hppfs_sbops; + +static int is_pid(struct dentry *dentry) +{ + struct super_block *sb; + int i; + + sb = dentry->d_sb; + if (dentry->d_parent != sb->s_root) + return 0; + + for (i = 0; i < dentry->d_name.len; i++) { + if (!isdigit(dentry->d_name.name[i])) + return 0; + } + return 1; +} + +static char *dentry_name(struct dentry *dentry, int extra) +{ + struct dentry *parent; + char *root, *name; + const char *seg_name; + int len, seg_len; + + len = 0; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) + len += strlen("pid") + 1; + else len += parent->d_name.len + 1; + parent = parent->d_parent; + } + + root = "proc"; + len += strlen(root); + name = kmalloc(len + extra + 1, GFP_KERNEL); + if (name == NULL) + return NULL; + + name[len] = '\0'; + parent = dentry; + while (parent->d_parent != parent) { + if (is_pid(parent)) { + seg_name = "pid"; + seg_len = strlen("pid"); + } + else { + seg_name = parent->d_name.name; + seg_len = parent->d_name.len; + } + + len -= 
seg_len + 1; + name[len] = '/'; + strncpy(&name[len + 1], seg_name, seg_len); + parent = parent->d_parent; + } + strncpy(name, root, strlen(root)); + return name; +} + +static int file_removed(struct dentry *dentry, const char *file) +{ + char *host_file; + int extra, fd; + + extra = 0; + if (file != NULL) + extra += strlen(file) + 1; + + host_file = dentry_name(dentry, extra + strlen("/remove")); + if (host_file == NULL) { + printk(KERN_ERR "file_removed : allocation failed\n"); + return -ENOMEM; + } + + if (file != NULL) { + strcat(host_file, "/"); + strcat(host_file, file); + } + strcat(host_file, "/remove"); + + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + kfree(host_file); + if (fd > 0) { + os_close_file(fd); + return 1; + } + return 0; +} + +static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, + struct nameidata *nd) +{ + struct dentry *proc_dentry, *new, *parent; + struct inode *inode; + int err, deleted; + + deleted = file_removed(dentry, NULL); + if (deleted < 0) + return ERR_PTR(deleted); + else if (deleted) + return ERR_PTR(-ENOENT); + + err = -ENOMEM; + parent = HPPFS_I(ino)->proc_dentry; + mutex_lock(&parent->d_inode->i_mutex); + proc_dentry = d_lookup(parent, &dentry->d_name); + if (proc_dentry == NULL) { + proc_dentry = d_alloc(parent, &dentry->d_name); + if (proc_dentry == NULL) { + mutex_unlock(&parent->d_inode->i_mutex); + goto out; + } + new = (*parent->d_inode->i_op->lookup)(parent->d_inode, + proc_dentry, NULL); + if (new) { + dput(proc_dentry); + proc_dentry = new; + } + } + mutex_unlock(&parent->d_inode->i_mutex); + + if (IS_ERR(proc_dentry)) + return proc_dentry; + + err = -ENOMEM; + inode = get_inode(ino->i_sb, proc_dentry); + if (!inode) + goto out_dput; + + d_add(dentry, inode); + return NULL; + + out_dput: + dput(proc_dentry); + out: + return ERR_PTR(err); +} + +static const struct inode_operations hppfs_file_iops = { +}; + +static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, + loff_t *ppos, int is_user) +{ + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + ssize_t n; + + read = file->f_path.dentry->d_inode->i_fop->read; + + if (!is_user) + set_fs(KERNEL_DS); + + n = (*read)(file, buf, count, &file->f_pos); + + if (!is_user) + set_fs(USER_DS); + + if (ppos) + *ppos = file->f_pos; + return n; +} + +static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) +{ + ssize_t n; + int cur, err; + char *new_buf; + + n = -ENOMEM; + new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (new_buf == NULL) { + printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); + goto out; + } + n = 0; + while (count > 0) { + cur = min_t(ssize_t, count, PAGE_SIZE); + err = os_read_file(fd, new_buf, cur); + if (err < 0) { + printk(KERN_ERR "hppfs_read : read failed, " + "errno = %d\n", err); + n = err; + goto out_free; + } else if (err == 0) + break; + + if (copy_to_user(buf, new_buf, err)) { + n = -EFAULT; + goto out_free; + } + n += err; + count -= err; + } + out_free: + kfree(new_buf); + out: + return n; +} + +static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct hppfs_private *hppfs = file->private_data; + struct hppfs_data *data; + loff_t off; + int err; + + if (hppfs->contents != NULL) { + int rem; + + if (*ppos >= hppfs->len) + return 0; + + data = hppfs->contents; + off = *ppos; + while (off >= sizeof(data->contents)) { + data = list_entry(data->list.next, struct hppfs_data, + list); + off -= sizeof(data->contents); + } + + if (off + count > 
hppfs->len) + count = hppfs->len - off; + rem = copy_to_user(buf, &data->contents[off], count); + *ppos += count - rem; + if (rem > 0) + return -EFAULT; + } else if (hppfs->host_fd != -1) { + err = os_seek_file(hppfs->host_fd, *ppos); + if (err) { + printk(KERN_ERR "hppfs_read : seek failed, " + "errno = %d\n", err); + return err; + } + count = hppfs_read_file(hppfs->host_fd, buf, count); + if (count > 0) + *ppos += count; + } + else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); + + return count; +} + +static ssize_t hppfs_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + write = proc_file->f_path.dentry->d_inode->i_fop->write; + return (*write)(proc_file, buf, len, ppos); +} + +static int open_host_sock(char *host_file, int *filter_out) +{ + char *end; + int fd; + + end = &host_file[strlen(host_file)]; + strcpy(end, "/rw"); + *filter_out = 1; + fd = os_connect_socket(host_file); + if (fd > 0) + return fd; + + strcpy(end, "/r"); + *filter_out = 0; + fd = os_connect_socket(host_file); + return fd; +} + +static void free_contents(struct hppfs_data *head) +{ + struct hppfs_data *data; + struct list_head *ele, *next; + + if (head == NULL) + return; + + list_for_each_safe(ele, next, &head->list) { + data = list_entry(ele, struct hppfs_data, list); + kfree(data); + } + kfree(head); +} + +static struct hppfs_data *hppfs_get_data(int fd, int filter, + struct file *proc_file, + struct file *hppfs_file, + loff_t *size_out) +{ + struct hppfs_data *data, *new, *head; + int n, err; + + err = -ENOMEM; + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) { + printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); + goto failed; + } + + INIT_LIST_HEAD(&data->list); + + head = data; + *size_out = 0; + + if (filter) { + while ((n = read_proc(proc_file, data->contents, + sizeof(data->contents), NULL, 0)) > 0) + os_write_file(fd, data->contents, n); + err = os_shutdown_socket(fd, 0, 1); + if (err) { + printk(KERN_ERR "hppfs_get_data : failed to shut down " + "socket\n"); + goto failed_free; + } + } + while (1) { + n = os_read_file(fd, data->contents, sizeof(data->contents)); + if (n < 0) { + err = n; + printk(KERN_ERR "hppfs_get_data : read failed, " + "errno = %d\n", err); + goto failed_free; + } else if (n == 0) + break; + + *size_out += n; + + if (n < sizeof(data->contents)) + break; + + new = kmalloc(sizeof(*data), GFP_KERNEL); + if (new == 0) { + printk(KERN_ERR "hppfs_get_data : data allocation " + "failed\n"); + err = -ENOMEM; + goto failed_free; + } + + INIT_LIST_HEAD(&new->list); + list_add(&new->list, &data->list); + data = new; + } + return head; + + failed_free: + free_contents(head); + failed: + return ERR_PTR(err); +} + +static struct hppfs_private *hppfs_data(void) +{ + struct hppfs_private *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return data; + + *data = ((struct hppfs_private ) { .host_fd = -1, + .len = -1, + .contents = NULL } ); + return data; +} + +static int file_mode(int fmode) +{ + if (fmode == (FMODE_READ | FMODE_WRITE)) + return O_RDWR; + if (fmode == FMODE_READ) + return O_RDONLY; + if (fmode == FMODE_WRITE) + return O_WRONLY; + return 0; +} + +static int hppfs_open(struct inode *inode, struct file *file) +{ + struct hppfs_private *data; + struct vfsmount *proc_mnt; + struct dentry *proc_dentry; + char *host_file; + int 
err, fd, type, filter; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + host_file = dentry_name(file->f_path.dentry, strlen("/rw")); + if (host_file == NULL) + goto out_free2; + + proc_dentry = HPPFS_I(inode)->proc_dentry; + proc_mnt = inode->i_sb->s_fs_info; + + /* XXX This isn't closed anywhere */ + data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), + file_mode(file->f_mode)); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free1; + + type = os_file_type(host_file); + if (type == OS_TYPE_FILE) { + fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); + if (fd >= 0) + data->host_fd = fd; + else + printk(KERN_ERR "hppfs_open : failed to open '%s', " + "errno = %d\n", host_file, -fd); + + data->contents = NULL; + } else if (type == OS_TYPE_DIR) { + fd = open_host_sock(host_file, &filter); + if (fd > 0) { + data->contents = hppfs_get_data(fd, filter, + data->proc_file, + file, &data->len); + if (!IS_ERR(data->contents)) + data->host_fd = fd; + } else + printk(KERN_ERR "hppfs_open : failed to open a socket " + "in '%s', errno = %d\n", host_file, -fd); + } + kfree(host_file); + + file->private_data = data; + return 0; + + out_free1: + kfree(host_file); + out_free2: + free_contents(data->contents); + kfree(data); + out: + return err; +} + +static int hppfs_dir_open(struct inode *inode, struct file *file) +{ + struct hppfs_private *data; + struct vfsmount *proc_mnt; + struct dentry *proc_dentry; + int err; + + err = -ENOMEM; + data = hppfs_data(); + if (data == NULL) + goto out; + + proc_dentry = HPPFS_I(inode)->proc_dentry; + proc_mnt = inode->i_sb->s_fs_info; + data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), + file_mode(file->f_mode)); + err = PTR_ERR(data->proc_file); + if (IS_ERR(data->proc_file)) + goto out_free; + + file->private_data = data; + return 0; + + out_free: + kfree(data); + out: + return err; +} + +static loff_t hppfs_llseek(struct file *file, loff_t off, int where) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + loff_t (*llseek)(struct file *, loff_t, int); + loff_t ret; + + llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek; + if (llseek != NULL) { + ret = (*llseek)(proc_file, off, where); + if (ret < 0) + return ret; + } + + return default_llseek(file, off, where); +} + +static const struct file_operations hppfs_file_fops = { + .owner = NULL, + .llseek = hppfs_llseek, + .read = hppfs_read, + .write = hppfs_write, + .open = hppfs_open, +}; + +struct hppfs_dirent { + void *vfs_dirent; + filldir_t filldir; + struct dentry *dentry; +}; + +static int hppfs_filldir(void *d, const char *name, int size, + loff_t offset, u64 inode, unsigned int type) +{ + struct hppfs_dirent *dirent = d; + + if (file_removed(dirent->dentry, name)) + return 0; + + return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, + inode, type); +} + +static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + int (*readdir)(struct file *, void *, filldir_t); + struct hppfs_dirent dirent = ((struct hppfs_dirent) + { .vfs_dirent = ent, + .filldir = filldir, + .dentry = file->f_path.dentry + }); + int err; + + readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir; + + proc_file->f_pos = file->f_pos; + err = (*readdir)(proc_file, &dirent, hppfs_filldir); + file->f_pos = proc_file->f_pos; + + return err; +} + +static int hppfs_fsync(struct 
file *file, struct dentry *dentry, int datasync) +{ + return 0; +} + +static const struct file_operations hppfs_dir_fops = { + .owner = NULL, + .readdir = hppfs_readdir, + .open = hppfs_dir_open, + .fsync = hppfs_fsync, +}; + +static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) +{ + sf->f_blocks = 0; + sf->f_bfree = 0; + sf->f_bavail = 0; + sf->f_files = 0; + sf->f_ffree = 0; + sf->f_type = HPPFS_SUPER_MAGIC; + return 0; +} + +static struct inode *hppfs_alloc_inode(struct super_block *sb) +{ + struct hppfs_inode_info *hi; + + hi = kmalloc(sizeof(*hi), GFP_KERNEL); + if (!hi) + return NULL; + + hi->proc_dentry = NULL; + inode_init_once(&hi->vfs_inode); + return &hi->vfs_inode; +} + +void hppfs_delete_inode(struct inode *ino) +{ + dput(HPPFS_I(ino)->proc_dentry); + mntput(ino->i_sb->s_fs_info); + + clear_inode(ino); +} + +static void hppfs_destroy_inode(struct inode *inode) +{ + kfree(HPPFS_I(inode)); +} + +static const struct super_operations hppfs_sbops = { + .alloc_inode = hppfs_alloc_inode, + .destroy_inode = hppfs_destroy_inode, + .delete_inode = hppfs_delete_inode, + .statfs = hppfs_statfs, +}; + +static int hppfs_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct dentry *proc_dentry; + + proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, + buflen); +} + +static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *proc_dentry; + + proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); +} + +int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + return generic_permission(inode, mask, NULL); +} + +static const struct inode_operations hppfs_dir_iops = { + .lookup = hppfs_lookup, + .permission = hppfs_permission, +}; + +static const struct inode_operations hppfs_link_iops = { + .readlink = hppfs_readlink, + .follow_link = hppfs_follow_link, + .permission = hppfs_permission, +}; + +static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) +{ + struct inode *proc_ino = dentry->d_inode; + struct inode *inode = new_inode(sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + if (S_ISDIR(dentry->d_inode->i_mode)) { + inode->i_op = &hppfs_dir_iops; + inode->i_fop = &hppfs_dir_fops; + } else if (S_ISLNK(dentry->d_inode->i_mode)) { + inode->i_op = &hppfs_link_iops; + inode->i_fop = &hppfs_file_fops; + } else { + inode->i_op = &hppfs_file_iops; + inode->i_fop = &hppfs_file_fops; + } + + HPPFS_I(inode)->proc_dentry = dget(dentry); + + inode->i_uid = proc_ino->i_uid; + inode->i_gid = proc_ino->i_gid; + inode->i_atime = proc_ino->i_atime; + inode->i_mtime = proc_ino->i_mtime; + inode->i_ctime = proc_ino->i_ctime; + inode->i_ino = proc_ino->i_ino; + inode->i_mode = proc_ino->i_mode; + inode->i_nlink = proc_ino->i_nlink; + inode->i_size = proc_ino->i_size; + inode->i_blocks = proc_ino->i_blocks; + + return inode; +} + +static int hppfs_fill_super(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + struct vfsmount *proc_mnt; + int err = -ENOENT; + + proc_mnt = do_kern_mount("proc", 0, "proc", NULL); + if (IS_ERR(proc_mnt)) + goto out; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HPPFS_SUPER_MAGIC; + sb->s_op = &hppfs_sbops; + sb->s_fs_info = proc_mnt; + + err = -ENOMEM; + root_inode = get_inode(sb, proc_mnt->mnt_sb->s_root); + if (!root_inode) + goto out_mntput; + + sb->s_root = 
d_alloc_root(root_inode); + if (!sb->s_root) + goto out_iput; + + return 0; + + out_iput: + iput(root_inode); + out_mntput: + mntput(proc_mnt); + out: + return(err); +} + +static int hppfs_read_super(struct file_system_type *type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); +} + +static struct file_system_type hppfs_type = { + .owner = THIS_MODULE, + .name = "hppfs", + .get_sb = hppfs_read_super, + .kill_sb = kill_anon_super, + .fs_flags = 0, +}; + +static int __init init_hppfs(void) +{ + return register_filesystem(&hppfs_type); +} + +static void __exit exit_hppfs(void) +{ + unregister_filesystem(&hppfs_type); +} + +module_init(init_hppfs) +module_exit(exit_hppfs) +MODULE_LICENSE("GPL"); diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c deleted file mode 100644 index 65077aa90f0..00000000000 --- a/fs/hppfs/hppfs_kern.c +++ /dev/null @@ -1,771 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "os.h" - -static struct inode *get_inode(struct super_block *, struct dentry *); - -struct hppfs_data { - struct list_head list; - char contents[PAGE_SIZE - sizeof(struct list_head)]; -}; - -struct hppfs_private { - struct file *proc_file; - int host_fd; - loff_t len; - struct hppfs_data *contents; -}; - -struct hppfs_inode_info { - struct dentry *proc_dentry; - struct inode vfs_inode; -}; - -static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) -{ - return container_of(inode, struct hppfs_inode_info, vfs_inode); -} - -#define HPPFS_SUPER_MAGIC 0xb00000ee - -static const struct super_operations hppfs_sbops; - -static int is_pid(struct dentry *dentry) -{ - struct super_block *sb; - int i; - - sb = dentry->d_sb; - if (dentry->d_parent != sb->s_root) - return 0; - - for (i = 0; i < dentry->d_name.len; i++) { - if (!isdigit(dentry->d_name.name[i])) - return 0; - } - return 1; -} - -static char *dentry_name(struct dentry *dentry, int extra) -{ - struct dentry *parent; - char *root, *name; - const char *seg_name; - int len, seg_len; - - len = 0; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) - len += strlen("pid") + 1; - else len += parent->d_name.len + 1; - parent = parent->d_parent; - } - - root = "proc"; - len += strlen(root); - name = kmalloc(len + extra + 1, GFP_KERNEL); - if (name == NULL) - return NULL; - - name[len] = '\0'; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) { - seg_name = "pid"; - seg_len = strlen("pid"); - } - else { - seg_name = parent->d_name.name; - seg_len = parent->d_name.len; - } - - len -= seg_len + 1; - name[len] = '/'; - strncpy(&name[len + 1], seg_name, seg_len); - parent = parent->d_parent; - } - strncpy(name, root, strlen(root)); - return name; -} - -static int file_removed(struct dentry *dentry, const char *file) -{ - char *host_file; - int extra, fd; - - extra = 0; - if (file != NULL) - extra += strlen(file) + 1; - - host_file = dentry_name(dentry, extra + strlen("/remove")); - if (host_file == NULL) { - printk(KERN_ERR "file_removed : allocation failed\n"); - return -ENOMEM; - } - - if (file != NULL) { - strcat(host_file, "/"); - strcat(host_file, file); - } - strcat(host_file, "/remove"); - - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - kfree(host_file); - if (fd > 0) { - 
os_close_file(fd); - return 1; - } - return 0; -} - -static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, - struct nameidata *nd) -{ - struct dentry *proc_dentry, *new, *parent; - struct inode *inode; - int err, deleted; - - deleted = file_removed(dentry, NULL); - if (deleted < 0) - return ERR_PTR(deleted); - else if (deleted) - return ERR_PTR(-ENOENT); - - err = -ENOMEM; - parent = HPPFS_I(ino)->proc_dentry; - mutex_lock(&parent->d_inode->i_mutex); - proc_dentry = d_lookup(parent, &dentry->d_name); - if (proc_dentry == NULL) { - proc_dentry = d_alloc(parent, &dentry->d_name); - if (proc_dentry == NULL) { - mutex_unlock(&parent->d_inode->i_mutex); - goto out; - } - new = (*parent->d_inode->i_op->lookup)(parent->d_inode, - proc_dentry, NULL); - if (new) { - dput(proc_dentry); - proc_dentry = new; - } - } - mutex_unlock(&parent->d_inode->i_mutex); - - if (IS_ERR(proc_dentry)) - return proc_dentry; - - err = -ENOMEM; - inode = get_inode(ino->i_sb, proc_dentry); - if (!inode) - goto out_dput; - - d_add(dentry, inode); - return NULL; - - out_dput: - dput(proc_dentry); - out: - return ERR_PTR(err); -} - -static const struct inode_operations hppfs_file_iops = { -}; - -static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, - loff_t *ppos, int is_user) -{ - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - ssize_t n; - - read = file->f_path.dentry->d_inode->i_fop->read; - - if (!is_user) - set_fs(KERNEL_DS); - - n = (*read)(file, buf, count, &file->f_pos); - - if (!is_user) - set_fs(USER_DS); - - if (ppos) - *ppos = file->f_pos; - return n; -} - -static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) -{ - ssize_t n; - int cur, err; - char *new_buf; - - n = -ENOMEM; - new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (new_buf == NULL) { - printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); - goto out; - } - n = 0; - while (count > 0) { - cur = min_t(ssize_t, count, PAGE_SIZE); - err = os_read_file(fd, new_buf, cur); - if (err < 0) { - printk(KERN_ERR "hppfs_read : read failed, " - "errno = %d\n", err); - n = err; - goto out_free; - } else if (err == 0) - break; - - if (copy_to_user(buf, new_buf, err)) { - n = -EFAULT; - goto out_free; - } - n += err; - count -= err; - } - out_free: - kfree(new_buf); - out: - return n; -} - -static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - struct hppfs_private *hppfs = file->private_data; - struct hppfs_data *data; - loff_t off; - int err; - - if (hppfs->contents != NULL) { - int rem; - - if (*ppos >= hppfs->len) - return 0; - - data = hppfs->contents; - off = *ppos; - while (off >= sizeof(data->contents)) { - data = list_entry(data->list.next, struct hppfs_data, - list); - off -= sizeof(data->contents); - } - - if (off + count > hppfs->len) - count = hppfs->len - off; - rem = copy_to_user(buf, &data->contents[off], count); - *ppos += count - rem; - if (rem > 0) - return -EFAULT; - } else if (hppfs->host_fd != -1) { - err = os_seek_file(hppfs->host_fd, *ppos); - if (err) { - printk(KERN_ERR "hppfs_read : seek failed, " - "errno = %d\n", err); - return err; - } - count = hppfs_read_file(hppfs->host_fd, buf, count); - if (count > 0) - *ppos += count; - } - else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); - - return count; -} - -static ssize_t hppfs_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - 
ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - write = proc_file->f_path.dentry->d_inode->i_fop->write; - return (*write)(proc_file, buf, len, ppos); -} - -static int open_host_sock(char *host_file, int *filter_out) -{ - char *end; - int fd; - - end = &host_file[strlen(host_file)]; - strcpy(end, "/rw"); - *filter_out = 1; - fd = os_connect_socket(host_file); - if (fd > 0) - return fd; - - strcpy(end, "/r"); - *filter_out = 0; - fd = os_connect_socket(host_file); - return fd; -} - -static void free_contents(struct hppfs_data *head) -{ - struct hppfs_data *data; - struct list_head *ele, *next; - - if (head == NULL) - return; - - list_for_each_safe(ele, next, &head->list) { - data = list_entry(ele, struct hppfs_data, list); - kfree(data); - } - kfree(head); -} - -static struct hppfs_data *hppfs_get_data(int fd, int filter, - struct file *proc_file, - struct file *hppfs_file, - loff_t *size_out) -{ - struct hppfs_data *data, *new, *head; - int n, err; - - err = -ENOMEM; - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) { - printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); - goto failed; - } - - INIT_LIST_HEAD(&data->list); - - head = data; - *size_out = 0; - - if (filter) { - while ((n = read_proc(proc_file, data->contents, - sizeof(data->contents), NULL, 0)) > 0) - os_write_file(fd, data->contents, n); - err = os_shutdown_socket(fd, 0, 1); - if (err) { - printk(KERN_ERR "hppfs_get_data : failed to shut down " - "socket\n"); - goto failed_free; - } - } - while (1) { - n = os_read_file(fd, data->contents, sizeof(data->contents)); - if (n < 0) { - err = n; - printk(KERN_ERR "hppfs_get_data : read failed, " - "errno = %d\n", err); - goto failed_free; - } else if (n == 0) - break; - - *size_out += n; - - if (n < sizeof(data->contents)) - break; - - new = kmalloc(sizeof(*data), GFP_KERNEL); - if (new == 0) { - printk(KERN_ERR "hppfs_get_data : data allocation " - "failed\n"); - err = -ENOMEM; - goto failed_free; - } - - INIT_LIST_HEAD(&new->list); - list_add(&new->list, &data->list); - data = new; - } - return head; - - failed_free: - free_contents(head); - failed: - return ERR_PTR(err); -} - -static struct hppfs_private *hppfs_data(void) -{ - struct hppfs_private *data; - - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) - return data; - - *data = ((struct hppfs_private ) { .host_fd = -1, - .len = -1, - .contents = NULL } ); - return data; -} - -static int file_mode(int fmode) -{ - if (fmode == (FMODE_READ | FMODE_WRITE)) - return O_RDWR; - if (fmode == FMODE_READ) - return O_RDONLY; - if (fmode == FMODE_WRITE) - return O_WRONLY; - return 0; -} - -static int hppfs_open(struct inode *inode, struct file *file) -{ - struct hppfs_private *data; - struct vfsmount *proc_mnt; - struct dentry *proc_dentry; - char *host_file; - int err, fd, type, filter; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - host_file = dentry_name(file->f_path.dentry, strlen("/rw")); - if (host_file == NULL) - goto out_free2; - - proc_dentry = HPPFS_I(inode)->proc_dentry; - proc_mnt = inode->i_sb->s_fs_info; - - /* XXX This isn't closed anywhere */ - data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), - file_mode(file->f_mode)); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free1; - - type = os_file_type(host_file); - if (type == OS_TYPE_FILE) { - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - if (fd >= 0) - data->host_fd = fd; - else - printk(KERN_ERR "hppfs_open : failed to 
open '%s', " - "errno = %d\n", host_file, -fd); - - data->contents = NULL; - } else if (type == OS_TYPE_DIR) { - fd = open_host_sock(host_file, &filter); - if (fd > 0) { - data->contents = hppfs_get_data(fd, filter, - data->proc_file, - file, &data->len); - if (!IS_ERR(data->contents)) - data->host_fd = fd; - } else - printk(KERN_ERR "hppfs_open : failed to open a socket " - "in '%s', errno = %d\n", host_file, -fd); - } - kfree(host_file); - - file->private_data = data; - return 0; - - out_free1: - kfree(host_file); - out_free2: - free_contents(data->contents); - kfree(data); - out: - return err; -} - -static int hppfs_dir_open(struct inode *inode, struct file *file) -{ - struct hppfs_private *data; - struct vfsmount *proc_mnt; - struct dentry *proc_dentry; - int err; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - proc_dentry = HPPFS_I(inode)->proc_dentry; - proc_mnt = inode->i_sb->s_fs_info; - data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), - file_mode(file->f_mode)); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free; - - file->private_data = data; - return 0; - - out_free: - kfree(data); - out: - return err; -} - -static loff_t hppfs_llseek(struct file *file, loff_t off, int where) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - loff_t (*llseek)(struct file *, loff_t, int); - loff_t ret; - - llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek; - if (llseek != NULL) { - ret = (*llseek)(proc_file, off, where); - if (ret < 0) - return ret; - } - - return default_llseek(file, off, where); -} - -static const struct file_operations hppfs_file_fops = { - .owner = NULL, - .llseek = hppfs_llseek, - .read = hppfs_read, - .write = hppfs_write, - .open = hppfs_open, -}; - -struct hppfs_dirent { - void *vfs_dirent; - filldir_t filldir; - struct dentry *dentry; -}; - -static int hppfs_filldir(void *d, const char *name, int size, - loff_t offset, u64 inode, unsigned int type) -{ - struct hppfs_dirent *dirent = d; - - if (file_removed(dirent->dentry, name)) - return 0; - - return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, - inode, type); -} - -static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - int (*readdir)(struct file *, void *, filldir_t); - struct hppfs_dirent dirent = ((struct hppfs_dirent) - { .vfs_dirent = ent, - .filldir = filldir, - .dentry = file->f_path.dentry - }); - int err; - - readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir; - - proc_file->f_pos = file->f_pos; - err = (*readdir)(proc_file, &dirent, hppfs_filldir); - file->f_pos = proc_file->f_pos; - - return err; -} - -static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) -{ - return 0; -} - -static const struct file_operations hppfs_dir_fops = { - .owner = NULL, - .readdir = hppfs_readdir, - .open = hppfs_dir_open, - .fsync = hppfs_fsync, -}; - -static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) -{ - sf->f_blocks = 0; - sf->f_bfree = 0; - sf->f_bavail = 0; - sf->f_files = 0; - sf->f_ffree = 0; - sf->f_type = HPPFS_SUPER_MAGIC; - return 0; -} - -static struct inode *hppfs_alloc_inode(struct super_block *sb) -{ - struct hppfs_inode_info *hi; - - hi = kmalloc(sizeof(*hi), GFP_KERNEL); - if (!hi) - return NULL; - - hi->proc_dentry = NULL; - inode_init_once(&hi->vfs_inode); - return &hi->vfs_inode; -} - -void 
hppfs_delete_inode(struct inode *ino) -{ - dput(HPPFS_I(ino)->proc_dentry); - mntput(ino->i_sb->s_fs_info); - - clear_inode(ino); -} - -static void hppfs_destroy_inode(struct inode *inode) -{ - kfree(HPPFS_I(inode)); -} - -static const struct super_operations hppfs_sbops = { - .alloc_inode = hppfs_alloc_inode, - .destroy_inode = hppfs_destroy_inode, - .delete_inode = hppfs_delete_inode, - .statfs = hppfs_statfs, -}; - -static int hppfs_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, - buflen); -} - -static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - - return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); -} - -int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return generic_permission(inode, mask, NULL); -} - -static const struct inode_operations hppfs_dir_iops = { - .lookup = hppfs_lookup, - .permission = hppfs_permission, -}; - -static const struct inode_operations hppfs_link_iops = { - .readlink = hppfs_readlink, - .follow_link = hppfs_follow_link, - .permission = hppfs_permission, -}; - -static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) -{ - struct inode *proc_ino = dentry->d_inode; - struct inode *inode = new_inode(sb); - - if (!inode) - return ERR_PTR(-ENOMEM); - - if (S_ISDIR(dentry->d_inode->i_mode)) { - inode->i_op = &hppfs_dir_iops; - inode->i_fop = &hppfs_dir_fops; - } else if (S_ISLNK(dentry->d_inode->i_mode)) { - inode->i_op = &hppfs_link_iops; - inode->i_fop = &hppfs_file_fops; - } else { - inode->i_op = &hppfs_file_iops; - inode->i_fop = &hppfs_file_fops; - } - - HPPFS_I(inode)->proc_dentry = dget(dentry); - - inode->i_uid = proc_ino->i_uid; - inode->i_gid = proc_ino->i_gid; - inode->i_atime = proc_ino->i_atime; - inode->i_mtime = proc_ino->i_mtime; - inode->i_ctime = proc_ino->i_ctime; - inode->i_ino = proc_ino->i_ino; - inode->i_mode = proc_ino->i_mode; - inode->i_nlink = proc_ino->i_nlink; - inode->i_size = proc_ino->i_size; - inode->i_blocks = proc_ino->i_blocks; - - return inode; -} - -static int hppfs_fill_super(struct super_block *sb, void *d, int silent) -{ - struct inode *root_inode; - struct vfsmount *proc_mnt; - int err = -ENOENT; - - proc_mnt = do_kern_mount("proc", 0, "proc", NULL); - if (IS_ERR(proc_mnt)) - goto out; - - sb->s_blocksize = 1024; - sb->s_blocksize_bits = 10; - sb->s_magic = HPPFS_SUPER_MAGIC; - sb->s_op = &hppfs_sbops; - sb->s_fs_info = proc_mnt; - - err = -ENOMEM; - root_inode = get_inode(sb, proc_mnt->mnt_sb->s_root); - if (!root_inode) - goto out_mntput; - - sb->s_root = d_alloc_root(root_inode); - if (!sb->s_root) - goto out_iput; - - return 0; - - out_iput: - iput(root_inode); - out_mntput: - mntput(proc_mnt); - out: - return(err); -} - -static int hppfs_read_super(struct file_system_type *type, - int flags, const char *dev_name, - void *data, struct vfsmount *mnt) -{ - return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); -} - -static struct file_system_type hppfs_type = { - .owner = THIS_MODULE, - .name = "hppfs", - .get_sb = hppfs_read_super, - .kill_sb = kill_anon_super, - .fs_flags = 0, -}; - -static int __init init_hppfs(void) -{ - return register_filesystem(&hppfs_type); -} - -static void __exit exit_hppfs(void) -{ - unregister_filesystem(&hppfs_type); -} - 
-module_init(init_hppfs) -module_exit(exit_hppfs) -MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From 8dc4e37362a5dc910d704d52ac6542bfd49ddc2f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 12 May 2008 14:02:04 -0700 Subject: ecryptfs: clean up (un)lock_parent dget(dentry->d_parent) --> dget_parent(dentry) unlock_parent() is racy and unnecessary. Replace single caller with unlock_dir(). There are several other suspect uses of ->d_parent in ecryptfs... Signed-off-by: Miklos Szeredi Cc: Michael Halcrow Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/inode.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 0a1397335a8..c92cc1c00aa 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -37,17 +37,11 @@ static struct dentry *lock_parent(struct dentry *dentry) { struct dentry *dir; - dir = dget(dentry->d_parent); + dir = dget_parent(dentry); mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT); return dir; } -static void unlock_parent(struct dentry *dentry) -{ - mutex_unlock(&(dentry->d_parent->d_inode->i_mutex)); - dput(dentry->d_parent); -} - static void unlock_dir(struct dentry *dir) { mutex_unlock(&dir->d_inode->i_mutex); @@ -426,8 +420,9 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) int rc = 0; struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); + struct dentry *lower_dir_dentry; - lock_parent(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); rc = vfs_unlink(lower_dir_inode, lower_dentry); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); @@ -439,7 +434,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_ctime = dir->i_ctime; d_drop(dentry); out_unlock: - unlock_parent(lower_dentry); + unlock_dir(lower_dir_dentry); return rc; } -- cgit v1.2.3-70-g09d2 From bb45d64224e5cafe8c8e0d18a20da998e5a7dc93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 May 2008 14:02:06 -0700 Subject: ufs: remove unneeded ufs_put_inode prototype Signed-off-by: Christoph Hellwig Acked-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/ufs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 244a1aaa940..11c035168ea 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -107,7 +107,6 @@ extern struct inode * ufs_new_inode (struct inode *, int); /* inode.c */ extern struct inode *ufs_iget(struct super_block *, unsigned long); -extern void ufs_put_inode (struct inode *); extern int ufs_write_inode (struct inode *, int); extern int ufs_sync_inode (struct inode *); extern void ufs_delete_inode (struct inode *); -- cgit v1.2.3-70-g09d2 From 9377abd026bf9bde7db90dac09170034bf6d1cbf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 May 2008 14:02:08 -0700 Subject: quota: don't call sync_fs() from vfs_quota_off() when there's no quota turn off Sometimes, vfs_quota_off() is called on a partially set up super block (for example when fill_super() fails for some reason). In such cases we cannot call ->sync_fs() because it can Oops because of not properly filled in super block. So in case we find there's not quota to turn off, we just skip everything and return which fixes the above problem. 
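To illustrate the shape of the fix described above, here is a minimal user-space sketch of the same early-return pattern; the struct and function names are invented for the example and are not the kernel's quota API.

#include <pthread.h>
#include <stdbool.h>

struct quota_state {
	pthread_mutex_t onoff_mutex;
	bool user_enabled;
	bool group_enabled;
	bool user_suspended;
	bool group_suspended;
};

/* stands in for ->sync_fs(); assumes a fully set up superblock */
static void sync_filesystem_state(struct quota_state *q)
{
	(void)q;
}

static int quota_off(struct quota_state *q)
{
	pthread_mutex_lock(&q->onoff_mutex);

	/* nothing enabled or suspended: skip the sync entirely and bail out */
	if (!q->user_enabled && !q->group_enabled &&
	    !q->user_suspended && !q->group_suspended) {
		pthread_mutex_unlock(&q->onoff_mutex);
		return 0;
	}

	sync_filesystem_state(q);
	q->user_enabled = q->group_enabled = false;
	pthread_mutex_unlock(&q->onoff_mutex);
	return 0;
}

int main(void)
{
	struct quota_state q = { .user_enabled = false };

	pthread_mutex_init(&q.onoff_mutex, NULL);
	return quota_off(&q);
}
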
[akpm@linux-foundation.org: fxi tpyo] Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dquot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/dquot.c b/fs/dquot.c index dfba1623ccc..5ac77da1995 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -1491,6 +1491,16 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) /* We need to serialize quota_off() for device */ mutex_lock(&dqopt->dqonoff_mutex); + + /* + * Skip everything if there's nothing to do. We have to do this because + * sometimes we are called when fill_super() failed and calling + * sync_fs() in such cases does no good. + */ + if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { + mutex_unlock(&dqopt->dqonoff_mutex); + return 0; + } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { toputinode[cnt] = NULL; if (type != -1 && cnt != type) -- cgit v1.2.3-70-g09d2 From 289f8e27ed435dcbefad132def06f4e84351e94f Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 12 May 2008 14:02:13 -0700 Subject: capabilities: add bounding set to /proc/self/status There is currently no way to query the bounding set of another task. As there appears to be no security reason not to, and as Michael Kerrisk points out the following valid reasons to do so exist: * consistency (I can see all of the other per-thread/process sets in /proc/.../status) * debugging -- I could imagine that it would make the job of debugging an application that uses capabilities a little simpler. this patch adds the bounding set to /proc/self/status right after the effective set. Signed-off-by: Serge E. Hallyn Acked-by: Michael Kerrisk Acked-by: Andrew G. Morgan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index dca997a93bf..9e3b8c33c24 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -298,6 +298,7 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) render_cap_t(m, "CapInh:\t", &p->cap_inheritable); render_cap_t(m, "CapPrm:\t", &p->cap_permitted); render_cap_t(m, "CapEff:\t", &p->cap_effective); + render_cap_t(m, "CapBnd:\t", &p->cap_bset); } static inline void task_context_switch_counts(struct seq_file *m, -- cgit v1.2.3-70-g09d2 From 706322496b3a58af3cf258db2b553d6933656eef Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Mon, 12 May 2008 14:02:21 -0700 Subject: Fix hfsplus oops on image without extents Fix an oops with a corrupted hfs+ image. See http://bugzilla.kernel.org/show_bug.cgi?id=10548 for details. Problem is that we call hfs_btree_open() from hfsplus_fill_super() to set HFSPLUS_SB(sb).[ext_tree|cat_tree] Both trees are still NULL at this moment. If hfs_btree_open() fails for any reason it calls iput() on the page, which gets to hfsplus_releasepage() which tries to access HFSPLUS_SB(sb).* which is still NULL and oopses while dereferencing it. 
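As a plain illustration of the guard being added, a small stand-alone sketch follows; the types and names are invented and only mirror the "callback runs before setup has finished" situation described above.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct btree { int node_size; };

struct sb_info {
	struct btree *ext_tree;	/* still NULL while fill_super() is running */
	struct btree *cat_tree;
};

/* a release-style callback that may be invoked before setup completes */
static bool release_page(struct sb_info *sbi, bool catalog)
{
	struct btree *tree = catalog ? sbi->cat_tree : sbi->ext_tree;

	if (!tree)
		return false;		/* nothing attached yet, refuse quietly */

	return tree->node_size > 0;	/* safe to dereference from here on */
}

int main(void)
{
	struct sb_info sbi = { NULL, NULL };

	printf("%d\n", release_page(&sbi, true));	/* prints 0, no crash */
	return 0;
}
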
[akpm@linux-foundation.org: build fix] Signed-off-by: Eric Sesterhenn Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d53b2af91c2..67e1c8b467c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -65,6 +65,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) BUG(); return 0; } + if (!tree) + return 0; if (tree->node_size >= PAGE_CACHE_SIZE) { nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); -- cgit v1.2.3-70-g09d2 From 4cd1a8fc3d3cd740416b14ece2693dbb5d065eaf Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 12 May 2008 14:02:31 -0700 Subject: memcg: fix possible panic when CONFIG_MM_OWNER=y When mm destruction happens, we should pass mm_update_next_owner() the old mm. But unfortunately new mm is passed in exec_mmap(). Thus, kernel panic is possible when a multi-threaded process uses exec(). Also, the owner member comment description is wrong. mm->owner does not necessarily point to the thread group leader. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KOSAKI Motohiro Acked-by: Balbir Singh Cc: "Paul Menage" Cc: "KAMEZAWA Hiroyuki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm_types.h | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index aeaa9791d8b..1f8a24aa1f8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -736,7 +736,7 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - mm_update_next_owner(mm); + mm_update_next_owner(old_mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index eb7c16cc955..02a27ae7853 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -226,8 +226,17 @@ struct mm_struct { rwlock_t ioctx_list_lock; /* aio lock */ struct kioctx *ioctx_list; #ifdef CONFIG_MM_OWNER - struct task_struct *owner; /* The thread group leader that */ - /* owns the mm_struct. */ + /* + * "owner" points to a task that is regarded as the canonical + * user/owner of this mm. All of the following must be true in + * order for it to be changed: + * + * current == mm->owner + * current->mm != mm + * new_owner->mm == mm + * new_owner->alloc_lock is held + */ + struct task_struct *owner; #endif #ifdef CONFIG_PROC_FS -- cgit v1.2.3-70-g09d2 From 78bb6cb9a890d3d50ca3b02fce9223d3e734ab9b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 12 May 2008 14:02:32 -0700 Subject: fuse: add flag to turn on big writes Prior to 2.6.26 fuse only supported single page write requests. In theory all fuse filesystem should be able support bigger than 4k writes, as there's nothing in the API to prevent it. Unfortunately there's a known case in NTFS-3G where big writes cause filesystem corruption. There could also be other filesystems, where the lack of testing with big write requests would result in bugs. To prevent such problems on a kernel upgrade, disable big writes by default, but let filesystems set a flag to turn it on. 
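A minimal sketch of this kind of opt-in capability flag is shown below; the constants and structure are invented for the example and only follow the INIT-style negotiation described above, they are not the FUSE wire protocol.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CAP_ASYNC_READ	(1u << 0)
#define CAP_BIG_WRITES	(1u << 5)	/* new capability, off by default */

struct conn {
	bool big_writes;
};

/* the kernel side advertises everything it can do... */
static uint32_t init_request_flags(void)
{
	return CAP_ASYNC_READ | CAP_BIG_WRITES;
}

/* ...but only enables a feature if the filesystem echoes the bit back */
static void process_init_reply(struct conn *c, uint32_t reply_flags)
{
	c->big_writes = (reply_flags & CAP_BIG_WRITES) != 0;
}

static unsigned max_pages_per_write(const struct conn *c)
{
	return c->big_writes ? 32 : 1;	/* old behaviour: single-page writes */
}

int main(void)
{
	struct conn c;

	process_init_reply(&c, init_request_flags() & ~CAP_BIG_WRITES);
	printf("%u\n", max_pages_per_write(&c));	/* 1: server did not opt in */
	return 0;
}
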
Signed-off-by: Miklos Szeredi Cc: Szabolcs Szakacsits Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 2 ++ fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 5 ++++- include/linux/fuse.h | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f28cf8b46f8..8092f0d9fd1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -804,6 +804,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (offset == PAGE_CACHE_SIZE) offset = 0; + if (!fc->big_writes) + break; } while (iov_iter_count(ii) && count < fc->max_write && req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index dadffa21a20..bae948657c4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -404,6 +404,9 @@ struct fuse_conn { /** Is bmap not implemented by fs? */ unsigned no_bmap : 1; + /** Do multi-page cached writes */ + unsigned big_writes : 1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 79b61587383..fb77e096213 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -576,6 +576,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->no_lock = 1; if (arg->flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; + if (arg->flags & FUSE_BIG_WRITES) + fc->big_writes = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -599,7 +601,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; - arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC; + arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + FUSE_BIG_WRITES; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 5c86f1196c3..d4828219769 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -109,6 +109,7 @@ struct fuse_file_lock { #define FUSE_POSIX_LOCKS (1 << 1) #define FUSE_FILE_OPS (1 << 2) #define FUSE_ATOMIC_O_TRUNC (1 << 3) +#define FUSE_BIG_WRITES (1 << 5) /** * Release flags -- cgit v1.2.3-70-g09d2 From f36f21ecca9ee688301174e5f2e0827827a7a7ff Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 12 May 2008 14:02:33 -0700 Subject: Fix misuses of bdevname() bdevname() fills the buffer that it is given as a parameter, so calling strcpy() or snprintf() on the returned value is redundant (and probably not guaranteed to work - I don't think strcpy and snprintf support overlapping buffers.) 
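The pattern being removed can be reproduced in user space; in this sketch devname() is an invented stand-in for bdevname() that likewise fills the caller's buffer and returns it, so wrapping it in strcpy()/snprintf() with the same buffer is pointless and, for strcpy(), relies on overlapping copies.

#include <stdio.h>
#include <string.h>

/* fills the supplied buffer and returns it, like bdevname() */
static char *devname(int dev, char *buf, size_t len)
{
	snprintf(buf, len, "sd%c", 'a' + (dev % 26));
	return buf;
}

int main(void)
{
	char b[32];

	/* redundant (and undefined for strcpy, since source == destination):
	 *	strcpy(b, devname(3, b, sizeof(b)));
	 * sufficient on its own: */
	devname(3, b, sizeof(b));
	puts(b);
	return 0;
}
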
Signed-off-by: Jean Delvare Cc: Stephen Tweedie Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blktrace.c | 2 +- block/compat_ioctl.c | 2 +- fs/ext4/mballoc.c | 6 ++---- fs/jbd2/journal.c | 4 ++-- 4 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/block/blktrace.c b/block/blktrace.c index 568588cd16b..b2cbb4e5d76 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -476,7 +476,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: - strcpy(b, bdevname(bdev, b)); + bdevname(bdev, b); ret = blk_trace_setup(q, b, bdev->bd_dev, arg); break; case BLKTRACESTART: diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index c70d0b6f666..c23177e4623 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -555,7 +555,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg) if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; - strcpy(b, bdevname(bdev, b)); + bdevname(bdev, b); buts = (struct blk_user_trace_setup) { .act_mask = cbuts.act_mask, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index fbec2ef9379..b128bdc0f55 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2639,8 +2639,7 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb) struct proc_dir_entry *proc; char devname[64]; - snprintf(devname, sizeof(devname) - 1, "%s", - bdevname(sb->s_bdev, devname)); + bdevname(sb->s_bdev, devname); sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); @@ -2674,8 +2673,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) if (sbi->s_mb_proc == NULL) return -EINVAL; - snprintf(devname, sizeof(devname) - 1, "%s", - bdevname(sb->s_bdev, devname)); + bdevname(sb->s_bdev, devname); remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 53632e3e845..2e24567c4a7 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -901,7 +901,7 @@ static void jbd2_stats_proc_init(journal_t *journal) { char name[BDEVNAME_SIZE]; - snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); + bdevname(journal->j_dev, name); journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); if (journal->j_proc_entry) { proc_create_data("history", S_IRUGO, journal->j_proc_entry, @@ -915,7 +915,7 @@ static void jbd2_stats_proc_exit(journal_t *journal) { char name[BDEVNAME_SIZE]; - snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); + bdevname(journal->j_dev, name); remove_proc_entry("info", journal->j_proc_entry); remove_proc_entry("history", journal->j_proc_entry); remove_proc_entry(name, proc_jbd2_stats); -- cgit v1.2.3-70-g09d2 From 43f14d856f013a4cc63da2c765617c665274338c Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 12 May 2008 14:02:40 -0700 Subject: eCryptFS: fix imbalanced mutex locking Fix imbalanced calls for mutex lock/unlock on ecryptfs_daemon_hash_mux Revealed by Ingo Molnar: http://lkml.org/lkml/2008/5/7/260 Signed-off-by: Cyrill Gorcunov Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/miscdev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 788995efd1d..6560da1a58c 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ 
-257,12 +257,14 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); printk(KERN_WARNING "%s: Attempt to read from zombified " "daemon\n", __func__); goto out_unlock_daemon; } if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { rc = 0; + mutex_unlock(&ecryptfs_daemon_hash_mux); goto out_unlock_daemon; } /* This daemon will not go away so long as this flag is set */ -- cgit v1.2.3-70-g09d2 From 77c57ec89682c73785d12d51a6d1f873b292fa42 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 13 May 2008 21:39:32 +0000 Subject: [CIFS] don't explicitly do a FindClose on rewind when directory search has ended Do the following series of operations on a CIFS share: opendir(dir) readdir(dir) unlink(file in dir) rewinddir(dir) readdir(dir) If the readdir read all entries in the directory this will make CIFS throw an error like this: CIFS VFS: Send error in FindClose = -9 CIFS requests "Close at end of search" of the server by setting this bit when issuing FindFirst or FindNext. Therefore when all search entries are returned, the server may return "end of search" and close the search implicitly when this bit is set by the client on the request. We check for this when a readdir is explicitly closed - but when the client notices that a directory has changed after the last operation, we attempt to close the directory before reopening by reissuing a second FindFirst. But, the directory may already been implicitly closed (due to end of search) because the first readdir finished. So we only want to issue a FindClose call in this case when we don't expect it to already be closed. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 34ec32100c7..713c2511019 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -670,8 +670,11 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cFYI(1, ("search backing up - close and restart search")); - cifsFile->invalidHandle = true; - CIFSFindClose(xid, pTcon, cifsFile->netfid); + if (!cifsFile->srch_inf.endOfSearch && + !cifsFile->invalidHandle) { + cifsFile->invalidHandle = true; + CIFSFindClose(xid, pTcon, cifsFile->netfid); + } kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = NULL; if (cifsFile->srch_inf.ntwrk_buf_start) { -- cgit v1.2.3-70-g09d2 From dfc5d03f12e706c19ee37734184ea96582ef931d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 19:11:51 -0400 Subject: ext4: correct mount option parsing to detect when quota options can be changed We should not allow user to change quota mount options when quota is just suspended. It would make mount options and internal quota state inconsistent. Also we should not allow user to change quota format when quota is turned on. On the other hand we can just silently ignore when some option is set to the value it already has (mount does this on remount). 
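The rule being enforced can be summarised in a few lines; the sketch below uses invented types and covers only the quota-format case: refuse a change while quota is enabled or suspended, but silently accept re-setting the value that is already in effect.

#include <stdbool.h>
#include <stdio.h>

struct fs_opts {
	bool quota_enabled;
	bool quota_suspended;
	int jquota_fmt;
};

static int set_quota_format(struct fs_opts *o, int new_fmt)
{
	bool busy = o->quota_enabled || o->quota_suspended;

	if (busy && o->jquota_fmt != new_fmt) {
		fprintf(stderr, "cannot change journaled quota format while quota is on\n");
		return -1;
	}
	o->jquota_fmt = new_fmt;	/* either a no-op or quota is fully off */
	return 0;
}

int main(void)
{
	struct fs_opts o = { true, false, 1 };

	return set_quota_format(&o, 1);	/* same value: accepted on remount */
}
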
Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 52dd0679a4e..686ebcc2e6c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -979,7 +979,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype; + int qtype, qfmt; char *qname; #endif @@ -1162,7 +1162,9 @@ static int parse_options (char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + !sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change journalled " "quota options when quota turned on.\n"); @@ -1200,7 +1202,9 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change " "journalled quota options when " "quota turned on.\n"); @@ -1213,10 +1217,20 @@ clear_qf_name: sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; - break; + qfmt = QFMT_VFS_OLD; + goto set_qf_format; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + qfmt = QFMT_VFS_V0; +set_qf_format: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_jquota_fmt != qfmt) { + printk(KERN_ERR "EXT4-fs: Cannot change " + "journaled quota options when " + "quota turned on.\n"); + return 0; + } + sbi->s_jquota_fmt = qfmt; break; case Opt_quota: case Opt_usrquota: -- cgit v1.2.3-70-g09d2 From cd59e7b9781a35716b8a3e8c4aa2d48081d7daf7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 19:11:51 -0400 Subject: ext4: Fix mount messages when quota disabled When quota is disabled, we should not print 'journaled quota not supported' when user tried to mount non-journaled quota. Also fix typo in the message. Signed-off-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 686ebcc2e6c..94a527261ca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1255,6 +1255,9 @@ set_qf_format: case Opt_quota: case Opt_usrquota: case Opt_grpquota: + printk(KERN_ERR + "EXT4-fs: quota options not supported.\n"); + break; case Opt_usrjquota: case Opt_grpjquota: case Opt_offusrjquota: @@ -1262,7 +1265,7 @@ set_qf_format: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: printk(KERN_ERR - "EXT4-fs: journalled quota options not " + "EXT4-fs: journaled quota options not " "supported.\n"); break; case Opt_noquota: -- cgit v1.2.3-70-g09d2 From 0623543b3335c8e439cacf21af99bbf45da42c5a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 19:11:51 -0400 Subject: ext4: fix synchronization of quota files in journal=data mode In journal=data mode, it is not enough to do write_inode_now as done in vfs_quota_on() to write all data to their final location (which is needed for quota_read to work correctly). Calling journal_flush() does its job. 
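A toy model of why the flush is needed is sketched below; the "journal" is just an in-memory buffer and the names are invented, but it shows the same ordering problem: a reader that bypasses the cache only sees data after it has been checkpointed to the backing store.

#include <stdio.h>
#include <string.h>

#define REC_SIZE 64

static char backing_store[REC_SIZE];	/* what a cache-bypassing read sees */

struct journal {
	char pending[REC_SIZE];
	int has_pending;
};

static void journaled_write(struct journal *j, const char *data)
{
	snprintf(j->pending, sizeof(j->pending), "%s", data);
	j->has_pending = 1;	/* data exists only in the journal so far */
}

static void journal_flush(struct journal *j)
{
	if (j->has_pending) {
		memcpy(backing_store, j->pending, sizeof(backing_store));
		j->has_pending = 0;
	}
}

static const char *raw_read(void)
{
	return backing_store;	/* bypasses the journal, like quota_read() */
}

int main(void)
{
	struct journal j = { "", 0 };

	journaled_write(&j, "quota record v2");
	journal_flush(&j);	/* without this the read below returns stale data */
	puts(raw_read());
	return 0;
}
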
Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94a527261ca..cddf7f0e0fd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3170,23 +3170,42 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota? */ - if ((!EXT4_SB(sb)->s_qf_names[USRQUOTA] && - !EXT4_SB(sb)->s_qf_names[GRPQUOTA]) || remount) + /* When remounting, no checks are needed and in fact, path is NULL */ + if (remount) return vfs_quota_on(sb, type, format_id, path, remount); + err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; + /* Quotafile not on the same filesystem? */ if (nd.path.mnt->mnt_sb != sb) { path_put(&nd.path); return -EXDEV; } - /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - printk(KERN_WARNING - "EXT4-fs: Quota file not on filesystem root. " - "Journalled quota will not work.\n"); + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT4-fs: Quota file not on filesystem root. " + "Journaled quota will not work.\n"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext4_should_journal_data(nd.path.dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + path_put(&nd.path); return vfs_quota_on(sb, type, format_id, path, remount); } -- cgit v1.2.3-70-g09d2 From 2c8be6b222f76c332d9faeb00c047996d340632c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 21:27:55 -0400 Subject: ext4: fix typos in messages and comments (journalled -> journaled) Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cddf7f0e0fd..09d9359c805 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1166,7 +1166,7 @@ set_qf_name: sb_any_quota_suspended(sb)) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR - "EXT4-fs: Cannot change journalled " + "EXT4-fs: Cannot change journaled " "quota options when quota turned on.\n"); return 0; } @@ -1206,7 +1206,7 @@ clear_qf_name: sb_any_quota_suspended(sb)) && sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change " - "journalled quota options when " + "journaled quota options when " "quota turned on.\n"); return 0; } @@ -1350,14 +1350,14 @@ set_qf_format: } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journalled quota format " + printk(KERN_ERR "EXT4-fs: journaled quota format " "not specified.\n"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journalled quota format " - "specified with no journalling " + printk(KERN_ERR "EXT4-fs: journaled quota format " + "specified with no journaling " "enabled.\n"); return 0; } @@ -1598,7 +1598,7 @@ static void ext4_orphan_cleanup (struct 
super_block * sb, int ret = ext4_quota_on_mount(sb, i); if (ret < 0) printk(KERN_ERR - "EXT4-fs: Cannot turn on journalled " + "EXT4-fs: Cannot turn on journaled " "quota: error %d\n", ret); } } @@ -3123,7 +3123,7 @@ static int ext4_release_dquot(struct dquot *dquot) static int ext4_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); -- cgit v1.2.3-70-g09d2 From 1930479c4b6bbcb6f164a5b3498e0d98329967f4 Mon Sep 17 00:00:00 2001 From: Valerie Clement Date: Tue, 13 May 2008 19:31:14 -0400 Subject: ext4: mballoc fix mb_normalize_request algorithm for 1KB block size filesystems In case of inode preallocation, the number of blocks to allocate depends on the file size and it is calculated in ext4_mb_normalize_request(). Each group in the filesystem is then checked to find one that can be used for allocation; this is done in ext4_mb_good_group(). When a file bigger than 4MB is created, the requested number of blocks to preallocate, calculated by ext4_mb_normalize_request is 4096. However for a filesystem with 1KB block size, the maximum size of the block buddies used by the multiblock allocator is 2048, so none of groups in the filesystem satisfies the search criteria in ext4_mb_good_group(). Scanning all the filesystem groups impacts performance. This was demonstrated by using a freshly created, 70GB, 1k block filesystem, with caches dropped write before the test via /proc/sys/vm/drop_caches, and with the filesystem mounted with nodelalloc and nodealloc,nomballoc. The time to write an 8 megabyte file using "dd if=/dev/zero of=/mnt/test/fo bs=8k count=1k conv=fsync" took 35.5091 seconds (236kB/s) with nodellaloc, and 0.233754 seconds (35.9 MB/s) with the nodelloc,nomballoc options. With a 1TB partition, it took several minutes to write 8MB! This patch modifies the algorithm in ext4_mb_normalize_group_request to calculate the number of blocks to allocate by taking into account the maximum size of free blocks chunks handled by the multiblock allocator. It has also been tested for filesystems with 2KB and 4KB block sizes to ensure that those cases don't regress. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Valerie Clement Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b128bdc0f55..1d7fde99452 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2880,12 +2880,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); - /* max available blocks in a free group */ - max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 - - EXT4_SB(ac->ac_sb)->s_itb_per_group; + /* max size of free chunks */ + max = 2 << bsbits; -#define NRL_CHECK_SIZE(req, size, max,bits) \ - (req <= (size) || max <= ((size) >> bits)) +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? 
*/ @@ -2904,16 +2903,16 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> - (20 - bsbits)) << 20; - size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, - (8<<20)>>bsbits, max, bsbits)) { + (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; -- cgit v1.2.3-70-g09d2 From e10f7b551d2a79b113d5ce66b5dc9f3657035445 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 14 May 2008 10:21:33 -0700 Subject: clarify return value of cifs_convert_flags() cifs_convert_flags returns 0x20197 in the default case. It's not immediately evident where that number comes from, so change it to be an or'ed set of flags. The compiler will boil it down anyway. (Thanks to Guenter Kukkukk for clarifying the flags). Signed-off-by: Steve French --- fs/cifs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 31a0a33b9d9..8636cec2642 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -75,7 +75,11 @@ static inline int cifs_convert_flags(unsigned int flags) return (GENERIC_READ | GENERIC_WRITE); } - return 0x20197; + return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES | + FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA | + FILE_READ_DATA); + + } static inline int cifs_get_disposition(unsigned int flags) -- cgit v1.2.3-70-g09d2 From 35fc37d5175091c36d034a28c057da0f9594ee7e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 14 May 2008 10:22:03 -0700 Subject: add function to convert access flags to legacy open mode SMBLegacyOpen always opens a file as r/w. This could be problematic for files with ATTR_READONLY set. Have it interpret the access_mode into a sane open mode. 
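The mapping can be demonstrated with a self-contained sketch; the flag values below are invented for the example rather than the SMB constants, but the masking logic mirrors the helper added by the patch.

#include <stdio.h>

#define ACC_READ	0x1
#define ACC_WRITE	0x2

enum open_mode { MODE_READ, MODE_WRITE, MODE_RDWR };

static enum open_mode access_to_open_mode(unsigned access_flags)
{
	unsigned masked = access_flags & (ACC_READ | ACC_WRITE);

	if (masked == ACC_READ)
		return MODE_READ;
	if (masked == ACC_WRITE)
		return MODE_WRITE;
	return MODE_RDWR;	/* both bits, or neither: just go for read/write */
}

int main(void)
{
	printf("%d %d %d\n",
	       access_to_open_mode(ACC_READ),
	       access_to_open_mode(ACC_WRITE),
	       access_to_open_mode(ACC_READ | ACC_WRITE));
	return 0;
}
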
Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 22 +++++++++++++++------- fs/cifs/inode.c | 3 +-- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3c05c2de50e..9c04ad40455 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1166,6 +1166,20 @@ static __u16 convert_disposition(int disposition) return ofun; } +static int +access_flags_to_smbopen_mode(const int access_flags) +{ + int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE); + + if (masked_flags == GENERIC_READ) + return SMBOPEN_READ; + else if (masked_flags == GENERIC_WRITE) + return SMBOPEN_WRITE; + + /* just go for read/write */ + return SMBOPEN_READWRITE; +} + int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon, const char *fileName, const int openDisposition, @@ -1207,13 +1221,7 @@ OldOpenRetry: pSMB->OpenFlags = cpu_to_le16(REQ_BATCHOPLOCK); pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO); - /* BB fixme add conversion for access_flags to bits 0 - 2 of mode */ - /* 0 = read - 1 = write - 2 = rw - 3 = execute - */ - pSMB->Mode = cpu_to_le16(2); + pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags)); pSMB->Mode |= cpu_to_le16(0x40); /* deny none */ /* set file as system file if special file such as fifo and server expecting SFU style and diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fcbdbb6ad7b..2d53b436d51 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1502,8 +1502,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) int oplock = 0; rc = SMBLegacyOpen(xid, pTcon, full_path, - FILE_OPEN, - SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + FILE_OPEN, GENERIC_WRITE, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & -- cgit v1.2.3-70-g09d2 From b32a09db4fb9a87246ba4e7726a979ac4709ad97 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Tue, 26 Feb 2008 09:57:11 -0600 Subject: add match_strlcpy() us it to make v9fs make uname and remotename parsing more robust match_strcpy() is a somewhat creepy function: the caller needs to make sure that the destination buffer is big enough, and when he screws up or forgets, match_strcpy() happily overruns the buffer. There's exactly one customer: v9fs_parse_options(). I believe it currently can't overflow its buffer, but that's not exactly obvious. The source string is a substing of the mount options. The kernel silently truncates those to PAGE_SIZE bytes, including the terminating zero. See compat_sys_mount() and do_mount(). The destination buffer is obtained from __getname(), which allocates from name_cachep, which is initialized by vfs_caches_init() for size PATH_MAX. We're safe as long as PATH_MAX <= PAGE_SIZE. PATH_MAX is 4096. As far as I know, the smallest PAGE_SIZE is also 4096. Here's a patch that makes the code a bit more obviously correct. It doesn't depend on PATH_MAX <= PAGE_SIZE. 
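The semantics introduced here can be tried out stand-alone; copy_span() below is an invented user-space equivalent that copies at most size - 1 bytes plus a terminating NUL and returns the full source length so callers can detect truncation.

#include <stdio.h>
#include <string.h>

/* copy [from, to) into dest, bounded by size; return the source length */
static size_t copy_span(char *dest, const char *from, const char *to, size_t size)
{
	size_t ret = (size_t)(to - from);

	if (size) {
		size_t len = ret >= size ? size - 1 : ret;

		memcpy(dest, from, len);
		dest[len] = '\0';
	}
	return ret;	/* ret >= size means the result was truncated */
}

int main(void)
{
	const char *opts = "uname=alice,aname=/export";
	char user[4];
	size_t n = copy_span(user, opts + 6, opts + 11, sizeof(user));

	printf("%s truncated=%d\n", user, n >= sizeof(user));	/* "ali truncated=1" */
	return 0;
}
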
Signed-off-by: Markus Armbruster Cc: Latchesar Ionkov Cc: Jim Meyering Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 4 ++-- include/linux/parser.h | 2 +- lib/parser.c | 32 ++++++++++++++++++++------------ 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 9b0f0222e8b..e307fbd34fa 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -125,10 +125,10 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses) v9ses->afid = option; break; case Opt_uname: - match_strcpy(v9ses->uname, &args[0]); + match_strlcpy(v9ses->uname, &args[0], PATH_MAX); break; case Opt_remotename: - match_strcpy(v9ses->aname, &args[0]); + match_strlcpy(v9ses->aname, &args[0], PATH_MAX); break; case Opt_nodevmap: v9ses->nodev = 1; diff --git a/include/linux/parser.h b/include/linux/parser.h index 26b2bdfcaf0..7dcd0507575 100644 --- a/include/linux/parser.h +++ b/include/linux/parser.h @@ -29,5 +29,5 @@ int match_token(char *, match_table_t table, substring_t args[]); int match_int(substring_t *, int *result); int match_octal(substring_t *, int *result); int match_hex(substring_t *, int *result); -void match_strcpy(char *, const substring_t *); +size_t match_strlcpy(char *, const substring_t *, size_t); char *match_strdup(const substring_t *); diff --git a/lib/parser.c b/lib/parser.c index 703c8c13b34..4f0cbc03e0e 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -182,18 +182,25 @@ int match_hex(substring_t *s, int *result) } /** - * match_strcpy: - copies the characters from a substring_t to a string - * @to: string to copy characters to. - * @s: &substring_t to copy + * match_strlcpy: - Copy the characters from a substring_t to a sized buffer + * @dest: where to copy to + * @src: &substring_t to copy + * @size: size of destination buffer * - * Description: Copies the set of characters represented by the given - * &substring_t @s to the c-style string @to. Caller guarantees that @to is - * large enough to hold the characters of @s. + * Description: Copy the characters in &substring_t @src to the + * c-style string @dest. Copy no more than @size - 1 characters, plus + * the terminating NUL. Return length of @src. */ -void match_strcpy(char *to, const substring_t *s) +size_t match_strlcpy(char *dest, const substring_t *src, size_t size) { - memcpy(to, s->from, s->to - s->from); - to[s->to - s->from] = '\0'; + size_t ret = src->to - src->from; + + if (size) { + size_t len = ret >= size ? size - 1 : ret; + memcpy(dest, src->from, len); + dest[len] = '\0'; + } + return ret; } /** @@ -206,9 +213,10 @@ void match_strcpy(char *to, const substring_t *s) */ char *match_strdup(const substring_t *s) { - char *p = kmalloc(s->to - s->from + 1, GFP_KERNEL); + size_t sz = s->to - s->from + 1; + char *p = kmalloc(sz, GFP_KERNEL); if (p) - match_strcpy(p, s); + match_strlcpy(p, s, sz); return p; } @@ -216,5 +224,5 @@ EXPORT_SYMBOL(match_token); EXPORT_SYMBOL(match_int); EXPORT_SYMBOL(match_octal); EXPORT_SYMBOL(match_hex); -EXPORT_SYMBOL(match_strcpy); +EXPORT_SYMBOL(match_strlcpy); EXPORT_SYMBOL(match_strdup); -- cgit v1.2.3-70-g09d2 From ee443996a35c1e04f210cafd43d5a98d41e46085 Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Wed, 5 Mar 2008 07:08:09 -0600 Subject: 9p: Documentation updates The kernel-doc comments of much of the 9p system have been in disarray since reorganization. This patch fixes those problems, adds additional documentation and a template book which collects the 9p information. 
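For readers unfamiliar with the comment style being adopted, the general kernel-doc shape looks roughly like the following; the function and parameters here are placeholders, not part of the patch.

/**
 * sample_helper - one-line summary of what the function does
 * @arg1: meaning of the first parameter
 * @arg2: meaning of the second parameter
 *
 * A longer description can follow the parameter list. Structures are
 * documented the same way, with one @field: line per member.
 *
 * Return: what the caller gets back.
 */
static int sample_helper(int arg1, int arg2)
{
	return arg1 + arg2;
}
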
Signed-off-by: Eric Van Hensbergen --- fs/9p/fid.h | 15 +++ fs/9p/v9fs.c | 8 +- fs/9p/v9fs.h | 85 +++++++++++----- fs/9p/vfs_addr.c | 2 +- fs/9p/vfs_dir.c | 2 +- fs/9p/vfs_file.c | 11 ++- fs/9p/vfs_inode.c | 50 +++++++--- fs/9p/vfs_super.c | 1 + include/net/9p/9p.h | 239 +++++++++++++++++++++++++++++++++++++++------ include/net/9p/client.h | 35 +++++++ include/net/9p/transport.h | 43 ++++++++ net/9p/conv.c | 128 +++++++++++++++++++++++- net/9p/error.c | 11 ++- net/9p/fcprint.c | 8 ++ net/9p/mod.c | 7 +- net/9p/trans_fd.c | 146 ++++++++++++++++++++++----- net/9p/trans_virtio.c | 155 +++++++++++++++++++++++++++-- net/9p/util.c | 32 ++++-- 18 files changed, 854 insertions(+), 124 deletions(-) (limited to 'fs') diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 26e07df783b..c3bbd6af996 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -22,6 +22,21 @@ #include +/** + * struct v9fs_dentry - 9p private data stored in dentry d_fsdata + * @lock: protects the fidlist + * @fidlist: list of FIDs currently associated with this dentry + * + * This structure defines the 9p private data associated with + * a particular dentry. In particular, this private data is used + * to lookup which 9P FID handle should be used for a particular VFS + * operation. FID handles are associated with dentries instead of + * inodes in order to more closely map functionality to the Plan 9 + * expected behavior for FID reclaimation and tracking. + * + * See Also: Mapping FIDs to Linux VFS model in + * Design and Implementation of the Linux 9P File System documentation + */ struct v9fs_dentry { spinlock_t lock; /* protect fidlist */ struct list_head fidlist; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e307fbd34fa..79d310c0018 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -71,7 +71,6 @@ static match_table_t tokens = { /** * v9fs_parse_options - parse mount options into session structure - * @options: options string passed from mount * @v9ses: existing v9fs session information * */ @@ -256,9 +255,12 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) } /** - * v9fs_session_cancel - mark transport as disconnected - * and cancel all pending requests. + * v9fs_session_cancel - terminate a session + * @v9ses: session to terminate + * + * mark transport as disconnected and cancel all pending requests. 
*/ + void v9fs_session_cancel(struct v9fs_session_info *v9ses) { P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 7d3a1018db5..a7d56719299 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -21,18 +21,69 @@ * */ -/* - * Session structure provides information for an opened session - * - */ +/** + * enum p9_session_flags - option flags for each 9P session + * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions + * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy + * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) + * @V9FS_ACCESS_ANY: use a single attach for all users + * @V9FS_ACCESS_MASK: bit mask of different ACCESS options + * + * Session flags reflect options selected by users at mount time + */ +enum p9_session_flags { + V9FS_EXTENDED = 0x01, + V9FS_ACCESS_SINGLE = 0x02, + V9FS_ACCESS_USER = 0x04, + V9FS_ACCESS_ANY = 0x06, + V9FS_ACCESS_MASK = 0x06, +}; + +/* possible values of ->cache */ +/** + * enum p9_cache_modes - user specified cache preferences + * @CACHE_NONE: do not cache data, dentries, or directory contents (default) + * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency + * + * eventually support loose, tight, time, session, default always none + */ + +enum p9_cache_modes { + CACHE_NONE, + CACHE_LOOSE, +}; + +/** + * struct v9fs_session_info - per-instance session information + * @flags: session options of type &p9_session_flags + * @nodev: set to 1 to disable device mapping + * @debug: debug level + * @afid: authentication handle + * @cache: cache mode of type &p9_cache_modes + * @options: copy of options string given by user + * @uname: string user name to mount hierarchy as + * @aname: mount specifier for remote hierarchy + * @maxdata: maximum data to be sent/recvd per protocol message + * @dfltuid: default numeric userid to mount hierarchy as + * @dfltgid: default numeric groupid to mount hierarchy as + * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy + * @clnt: reference to 9P network client instantiated for this session + * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug + * + * This structure holds state for each session instance established during + * a sys_mount() . + * + * Bugs: there seems to be a lot of state which could be condensed and/or + * removed. 
+ */ struct v9fs_session_info { /* options */ - unsigned char flags; /* session flags */ - unsigned char nodev; /* set to 1 if no disable device mapping */ - unsigned short debug; /* debug level */ - unsigned int afid; /* authentication fid */ - unsigned int cache; /* cache mode */ + unsigned char flags; + unsigned char nodev; + unsigned short debug; + unsigned int afid; + unsigned int cache; char *options; /* copy of mount options */ char *uname; /* user name to mount as */ @@ -45,22 +96,6 @@ struct v9fs_session_info { struct dentry *debugfs_dir; }; -/* session flags */ -enum { - V9FS_EXTENDED = 0x01, /* 9P2000.u */ - V9FS_ACCESS_MASK = 0x06, /* access mask */ - V9FS_ACCESS_SINGLE = 0x02, /* only one user can access the files */ - V9FS_ACCESS_USER = 0x04, /* attache per user */ - V9FS_ACCESS_ANY = 0x06, /* use the same attach for all users */ -}; - -/* possible values of ->cache */ -/* eventually support loose, tight, time, session, default always none */ -enum { - CACHE_NONE, /* default */ - CACHE_LOOSE, /* no consistency */ -}; - extern struct dentry *v9fs_debugfs_root; struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6248f0e727a..97d3aed5798 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -43,7 +43,7 @@ /** * v9fs_vfs_readpage - read an entire page in from 9P * - * @file: file being read + * @filp: file being read * @page: structure to page * */ diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 0924d4477da..88e3787c6ea 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -60,7 +60,7 @@ static inline int dt_type(struct p9_stat *mistat) /** * v9fs_dir_readdir - read a directory - * @filep: opened file structure + * @filp: opened file structure * @dirent: directory structure ??? * @filldir: function to populate directory structure ??? 
* diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a616fff8906..0d55affe37d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -90,10 +90,11 @@ int v9fs_file_open(struct inode *inode, struct file *file) /** * v9fs_file_lock - lock a file (or directory) - * @inode: inode to be opened - * @file: file being opened + * @filp: file to be locked + * @cmd: lock command + * @fl: file lock structure * - * XXX - this looks like a local only lock, we should extend into 9P + * Bugs: this looks like a local only lock, we should extend into 9P * by using open exclusive */ @@ -118,7 +119,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) /** * v9fs_file_read - read from a file - * @filep: file pointer to read + * @filp: file pointer to read * @data: data buffer to read data into * @count: size of buffer * @offset: offset at which to read data @@ -142,7 +143,7 @@ v9fs_file_read(struct file *filp, char __user * data, size_t count, /** * v9fs_file_write - write to a file - * @filep: file pointer to write + * @filp: file pointer to write * @data: data buffer to write data from * @count: size of buffer * @offset: offset at which to write data diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 6a28842052e..40fa807bd92 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -129,6 +129,12 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) return res; } +/** + * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits + * @uflags: flags to convert + * + */ + int v9fs_uflags2omode(int uflags) { int ret; @@ -312,6 +318,14 @@ error: } */ +/** + * v9fs_inode_from_fid - populate an inode by issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ + static struct inode * v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) @@ -384,9 +398,12 @@ v9fs_open_created(struct inode *inode, struct file *file) /** * v9fs_create - Create a file + * @v9ses: session information + * @dir: directory that dentry is being created in * @dentry: dentry that is being created * @perm: create permissions * @mode: open mode + * @extension: 9p2000.u extension string to support devices, etc. 
* */ static struct p9_fid * @@ -461,7 +478,7 @@ error: /** * v9fs_vfs_create - VFS hook to create files - * @inode: directory inode that is being created + * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @mode: create permissions * @nd: path information @@ -519,7 +536,7 @@ error: /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory - * @inode: inode that is being unlinked + * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory * @@ -703,9 +720,9 @@ done: /** * v9fs_vfs_getattr - retrieve file metadata - * @mnt - mount information - * @dentry - file to get attributes on - * @stat - metadata structure to populate + * @mnt: mount information + * @dentry: file to get attributes on + * @stat: metadata structure to populate * */ @@ -928,7 +945,7 @@ done: /** * v9fs_vfs_readlink - read a symlink's location * @dentry: dentry for symlink - * @buf: buffer to load symlink location into + * @buffer: buffer to load symlink location into * @buflen: length of buffer * */ @@ -996,10 +1013,12 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) * v9fs_vfs_put_link - release a symlink path * @dentry: dentry for symlink * @nd: nameidata + * @p: unused * */ -static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +static void +v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); @@ -1008,6 +1027,15 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void __putname(s); } +/** + * v9fs_vfs_mkspecial - create a special file + * @dir: inode to create special file in + * @dentry: dentry to create + * @mode: mode to create special file + * @extension: 9p2000.u format extension string representing special file + * + */ + static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, int mode, const char *extension) { @@ -1037,7 +1065,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, * @dentry: dentry for symlink * @symname: symlink data * - * See 9P2000.u RFC for more information + * See Also: 9P2000.u RFC for more information * */ @@ -1058,10 +1086,6 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * */ -/* XXX - lots of code dup'd from symlink and creates, - * figure out a better reuse strategy - */ - static int v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) @@ -1098,7 +1122,7 @@ clunk_fid: * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation - * @dev_t: device associated with special file + * @rdev: device associated with special file * */ diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index a452ac67fc9..ba10f172626 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -75,6 +75,7 @@ static int v9fs_set_super(struct super_block *s, void *data) * v9fs_fill_super - populate superblock with info * @sb: superblock * @v9ses: session information + * @flags: flags propagated from v9fs_get_sb() * */ diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index 585eb449699..7bfb2f2e423 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -29,14 +29,31 @@ #ifdef CONFIG_NET_9P_DEBUG -#define P9_DEBUG_ERROR (1<<0) -#define P9_DEBUG_9P (1<<2) -#define P9_DEBUG_VFS (1<<3) -#define P9_DEBUG_CONV (1<<4) -#define P9_DEBUG_MUX (1<<5) -#define P9_DEBUG_TRANS (1<<6) -#define P9_DEBUG_SLABS (1<<7) -#define P9_DEBUG_FCALL (1<<8) +/** + * enum 
p9_debug_flags - bits for mount time debug parameter + * @P9_DEBUG_ERROR: more verbose error messages including original error string + * @P9_DEBUG_9P: 9P protocol tracing + * @P9_DEBUG_VFS: VFS API tracing + * @P9_DEBUG_CONV: protocol conversion tracing + * @P9_DEBUG_MUX: trace management of concurrent transactions + * @P9_DEBUG_TRANS: transport tracing + * @P9_DEBUG_SLABS: memory management tracing + * @P9_DEBUG_FCALL: verbose dump of protocol messages + * + * These flags are passed at mount time to turn on various levels of + * verbosity and tracing which will be output to the system logs. + */ + +enum p9_debug_flags { + P9_DEBUG_ERROR = (1<<0), + P9_DEBUG_9P = (1<<2), + P9_DEBUG_VFS = (1<<3), + P9_DEBUG_CONV = (1<<4), + P9_DEBUG_MUX = (1<<5), + P9_DEBUG_TRANS = (1<<6), + P9_DEBUG_SLABS = (1<<7), + P9_DEBUG_FCALL = (1<<8), +}; extern unsigned int p9_debug_level; @@ -62,9 +79,47 @@ do { \ format , __FUNCTION__, task_pid_nr(current), ## arg); \ } while (0) +/** + * enum p9_msg_t - 9P message types + * @P9_TVERSION: version handshake request + * @P9_RVERSION: version handshake response + * @P9_TAUTH: request to establish authentication channel + * @P9_RAUTH: response with authentication information + * @P9_TATTACH: establish user access to file service + * @P9_RATTACH: response with top level handle to file hierarchy + * @P9_TERROR: not used + * @P9_RERROR: response for any failed request + * @P9_TFLUSH: request to abort a previous request + * @P9_RFLUSH: response when previous request has been cancelled + * @P9_TWALK: descend a directory hierarchy + * @P9_RWALK: response with new handle for position within hierarchy + * @P9_TOPEN: prepare a handle for I/O on an existing file + * @P9_ROPEN: response with file access information + * @P9_TCREATE: prepare a handle for I/O on a new file + * @P9_RCREATE: response with file access information + * @P9_TREAD: request to transfer data from a file or directory + * @P9_RREAD: response with data requested + * @P9_TWRITE: reuqest to transfer data to a file + * @P9_RWRITE: response with out much data was transfered to file + * @P9_TCLUNK: forget about a handle to an entity within the file system + * @P9_RCLUNK: response when server has forgotten about the handle + * @P9_TREMOVE: request to remove an entity from the hierarchy + * @P9_RREMOVE: response when server has removed the entity + * @P9_TSTAT: request file entity attributes + * @P9_RSTAT: response with file entity attributes + * @P9_TWSTAT: request to update file entity attributes + * @P9_RWSTAT: response when file entity attributes are updated + * + * There are 14 basic operations in 9P2000, paired as + * requests and responses. The one special case is ERROR + * as there is no @P9_TERROR request for clients to transmit to + * the server, but the server may respond to any other request + * with an @P9_RERROR. 
+ * + * See Also: http://plan9.bell-labs.com/sys/man/5/INDEX.html + */ -/* Message Types */ -enum { +enum p9_msg_t { P9_TVERSION = 100, P9_RVERSION, P9_TAUTH = 102, @@ -95,30 +150,71 @@ enum { P9_RWSTAT, }; -/* open modes */ -enum { +/** + * enum p9_open_mode_t - 9P open modes + * @P9_OREAD: open file for reading only + * @P9_OWRITE: open file for writing only + * @P9_ORDWR: open file for reading or writing + * @P9_OEXEC: open file for execution + * @P9_OTRUNC: truncate file to zero-length before opening it + * @P9_OREXEC: close the file when an exec(2) system call is made + * @P9_ORCLOSE: remove the file when the file is closed + * @P9_OAPPEND: open the file and seek to the end + * @P9_OEXCL: only create a file, do not open it + * + * 9P open modes differ slightly from Posix standard modes. + * In particular, there are extra modes which specify different + * semantic behaviors than may be available on standard Posix + * systems. For example, @P9_OREXEC and @P9_ORCLOSE are modes that + * most likely will not be issued from the Linux VFS client, but may + * be supported by servers. + * + * See Also: http://plan9.bell-labs.com/magic/man2html/2/open + */ + +enum p9_open_mode_t { P9_OREAD = 0x00, P9_OWRITE = 0x01, P9_ORDWR = 0x02, P9_OEXEC = 0x03, - P9_OEXCL = 0x04, P9_OTRUNC = 0x10, P9_OREXEC = 0x20, P9_ORCLOSE = 0x40, P9_OAPPEND = 0x80, -}; - -/* permissions */ -enum { + P9_OEXCL = 0x1000, +}; + +/** + * enum p9_perm_t - 9P permissions + * @P9_DMDIR: mode bite for directories + * @P9_DMAPPEND: mode bit for is append-only + * @P9_DMEXCL: mode bit for excluse use (only one open handle allowed) + * @P9_DMMOUNT: mode bite for mount points + * @P9_DMAUTH: mode bit for authentication file + * @P9_DMTMP: mode bit for non-backed-up files + * @P9_DMSYMLINK: mode bit for symbolic links (9P2000.u) + * @P9_DMLINK: mode bit for hard-link (9P2000.u) + * @P9_DMDEVICE: mode bit for device files (9P2000.u) + * @P9_DMNAMEDPIPE: mode bit for named pipe (9P2000.u) + * @P9_DMSOCKET: mode bit for socket (9P2000.u) + * @P9_DMSETUID: mode bit for setuid (9P2000.u) + * @P9_DMSETGID: mode bit for setgid (9P2000.u) + * @P9_DMSETVTX: mode bit for sticky bit (9P2000.u) + * + * 9P permissions differ slightly from Posix standard modes. + * + * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat + */ +enum p9_perm_t { P9_DMDIR = 0x80000000, P9_DMAPPEND = 0x40000000, P9_DMEXCL = 0x20000000, P9_DMMOUNT = 0x10000000, P9_DMAUTH = 0x08000000, P9_DMTMP = 0x04000000, +/* 9P2000.u extensions */ P9_DMSYMLINK = 0x02000000, P9_DMLINK = 0x01000000, - /* 9P2000.u extensions */ P9_DMDEVICE = 0x00800000, P9_DMNAMEDPIPE = 0x00200000, P9_DMSOCKET = 0x00100000, @@ -127,8 +223,26 @@ enum { P9_DMSETVTX = 0x00010000, }; -/* qid.types */ -enum { +/** + * enum p9_qid_t - QID types + * @P9_QTDIR: directory + * @P9_QTAPPEND: append-only + * @P9_QTEXCL: excluse use (only one open handle allowed) + * @P9_QTMOUNT: mount points + * @P9_QTAUTH: authentication file + * @P9_QTTMP: non-backed-up files + * @P9_QTSYMLINK: symbolic links (9P2000.u) + * @P9_QTLINK: hard-link (9P2000.u) + * @P9_QTFILE: normal files + * + * QID types are a subset of permissions - they are primarily + * used to differentiate semantics for a file system entity via + * a jump-table. 
Their value is also the most signifigant 16 bits + * of the permission_t + * + * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat + */ +enum p9_qid_t { P9_QTDIR = 0x80, P9_QTAPPEND = 0x40, P9_QTEXCL = 0x20, @@ -140,6 +254,7 @@ enum { P9_QTFILE = 0x00, }; +/* 9P Magic Numbers */ #define P9_NOTAG (u16)(~0) #define P9_NOFID (u32)(~0) #define P9_MAXWELEM 16 @@ -147,19 +262,69 @@ enum { /* ample room for Twrite/Rread header */ #define P9_IOHDRSZ 24 +/** + * struct p9_str - length prefixed string type + * @len: length of the string + * @str: the string + * + * The protocol uses length prefixed strings for all + * string data, so we replicate that for our internal + * string members. + */ + struct p9_str { u16 len; char *str; }; -/* qids are the unique ID for a file (like an inode */ +/** + * struct p9_qid - file system entity information + * @type: 8-bit type &p9_qid_t + * @version: 16-bit monotonically incrementing version number + * @path: 64-bit per-server-unique ID for a file system element + * + * qids are identifiers used by 9P servers to track file system + * entities. The type is used to differentiate semantics for operations + * on the entity (ie. read means something different on a directory than + * on a file). The path provides a server unique index for an entity + * (roughly analogous to an inode number), while the version is updated + * every time a file is modified and can be used to maintain cache + * coherency between clients and serves. + * Servers will often differentiate purely synthetic entities by setting + * their version to 0, signaling that they should never be cached and + * should be accessed synchronously. + * + * See Also://plan9.bell-labs.com/magic/man2html/2/stat + */ + struct p9_qid { u8 type; u32 version; u64 path; }; -/* Plan 9 file metadata (stat) structure */ +/** + * struct p9_stat - file system metadata information + * @size: length prefix for this stat structure instance + * @type: the type of the server (equivilent to a major number) + * @dev: the sub-type of the server (equivilent to a minor number) + * @qid: unique id from the server of type &p9_qid + * @mode: Plan 9 format permissions of type &p9_perm_t + * @atime: Last access/read time + * @mtime: Last modify/write time + * @length: file length + * @name: last element of path (aka filename) in type &p9_str + * @uid: owner name in type &p9_str + * @gid: group owner in type &p9_str + * @muid: last modifier in type &p9_str + * @extension: area used to encode extended UNIX support in type &p9_str + * @n_uid: numeric user id of owner (part of 9p2000.u extension) + * @n_gid: numeric group id (part of 9p2000.u extension) + * @n_muid: numeric user id of laster modifier (part of 9p2000.u extension) + * + * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat + */ + struct p9_stat { u16 size; u16 type; @@ -179,10 +344,14 @@ struct p9_stat { u32 n_muid; /* 9p2000.u extensions */ }; -/* file metadata (stat) structure used to create Twstat message - The is similar to p9_stat, but the strings don't point to - the same memory block and should be freed separately -*/ +/* + * file metadata (stat) structure used to create Twstat message + * The is identical to &p9_stat, but the strings don't point to + * the same memory block and should be freed separately + * + * See Also: http://plan9.bell-labs.com/magic/man2html/2/stat + */ + struct p9_wstat { u16 size; u16 type; @@ -335,10 +504,20 @@ struct p9_twstat { struct p9_rwstat { }; -/* - * fcall is the primary packet structure - * - */ +/** + * struct 
p9_fcall - primary packet structure + * @size: prefixed length of the structure + * @id: protocol operating identifier of type &p9_msg_t + * @tag: transaction id of the request + * @sdata: payload + * @params: per-operation parameters + * + * &p9_fcall represents the structure for all 9P RPC + * transactions. Requests are packaged into fcalls, and reponses + * must be extracted from them. + * + * See Also: http://plan9.bell-labs.com/magic/man2html/2/fcall + */ struct p9_fcall { u32 size; diff --git a/include/net/9p/client.h b/include/net/9p/client.h index e52f93d9ac5..c936dd14de4 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -26,6 +26,23 @@ #ifndef NET_9P_CLIENT_H #define NET_9P_CLIENT_H +/** + * struct p9_client - per client instance state + * @lock: protect @fidlist + * @msize: maximum data size negotiated by protocol + * @dotu: extension flags negotiated by protocol + * @trans_mod: module API instantiated with this client + * @trans: tranport instance state and API + * @conn: connection state information used by trans_fd + * @fidpool: fid handle accounting for session + * @fidlist: List of active fid handles + * + * The client structure is used to keep track of various per-client + * state that has been instantiated. + * + * Bugs: duplicated data and potentially unnecessary elements. + */ + struct p9_client { spinlock_t lock; /* protect client structure */ int msize; @@ -38,6 +55,24 @@ struct p9_client { struct list_head fidlist; }; +/** + * struct p9_fid - file system entity handle + * @clnt: back pointer to instantiating &p9_client + * @fid: numeric identifier for this handle + * @mode: current mode of this fid (enum?) + * @qid: the &p9_qid server identifier this handle points to + * @iounit: the server reported maximum transaction size for this file + * @uid: the numeric uid of the local user who owns this handle + * @aux: transport specific information (unused?) + * @rdir_fpos: tracks offset of file position when reading directory contents + * @rdir_pos: (unused?) + * @rdir_fcall: holds response of last directory read request + * @flist: per-client-instance fid tracking + * @dlist: per-dentry fid tracking + * + * TODO: This needs lots of explanation. + */ + struct p9_fid { struct p9_client *clnt; u32 fid; diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index d2209ae9d18..240e0de888c 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -26,12 +26,40 @@ #ifndef NET_9P_TRANSPORT_H #define NET_9P_TRANSPORT_H +/** + * enum p9_trans_status - different states of underlying transports + * @Connected: transport is connected and healthy + * @Disconnected: transport has been disconnected + * @Hung: transport is connected by wedged + * + * This enumeration details the various states a transport + * instatiation can be in. + */ + enum p9_trans_status { Connected, Disconnected, Hung, }; +/** + * struct p9_trans - per-transport state and API + * @status: transport &p9_trans_status + * @msize: negotiated maximum packet size (duplicate from client) + * @extended: negotiated protocol extensions (duplicate from client) + * @priv: transport private data + * @close: member function to disconnect and close the transport + * @rpc: member function to issue a request to the transport + * + * This is the basic API for a transport instance. It is used as + * a handle by the client to issue requests. This interface is currently + * in flux during reorganization. 
+ * + * Bugs: there is lots of duplicated data here and its not clear that + * the member functions need to be per-instance versus per transport + * module. + */ + struct p9_trans { enum p9_trans_status status; int msize; @@ -42,6 +70,21 @@ struct p9_trans { struct p9_fcall **rc); }; +/** + * struct p9_trans_module - transport module interface + * @list: used to maintain a list of currently available transports + * @name: the human-readable name of the transport + * @maxsize: transport provided maximum packet size + * @def: set if this transport should be considered the default + * @create: member function to create a new connection on this transport + * + * This is the basic API for a transport module which is registered by the + * transport module with the 9P core network module and used by the client + * to instantiate a new connection on a transport. + * + * Bugs: the transport module list isn't protected. + */ + struct p9_trans_module { struct list_head list; char *name; /* name of transport */ diff --git a/net/9p/conv.c b/net/9p/conv.c index 3fe35d532c8..44547201f5b 100644 --- a/net/9p/conv.c +++ b/net/9p/conv.c @@ -197,7 +197,7 @@ static void buf_get_qid(struct cbuf *bufp, struct p9_qid *qid) /** * p9_size_wstat - calculate the size of a variable length stat struct - * @stat: metadata (stat) structure + * @wstat: metadata (stat) structure * @dotu: non-zero if 9P2000.u * */ @@ -511,6 +511,12 @@ p9_create_common(struct cbuf *bufp, u32 size, u8 id) return fc; } +/** + * p9_set_tag - set the tag field of an &p9_fcall structure + * @fc: fcall structure to set tag within + * @tag: tag id to set + */ + void p9_set_tag(struct p9_fcall *fc, u16 tag) { fc->tag = tag; @@ -518,6 +524,12 @@ void p9_set_tag(struct p9_fcall *fc, u16 tag) } EXPORT_SYMBOL(p9_set_tag); +/** + * p9_create_tversion - allocates and creates a T_VERSION request + * @msize: requested maximum data size + * @version: version string to negotiate + * + */ struct p9_fcall *p9_create_tversion(u32 msize, char *version) { int size; @@ -542,6 +554,16 @@ error: } EXPORT_SYMBOL(p9_create_tversion); +/** + * p9_create_tauth - allocates and creates a T_AUTH request + * @afid: handle to use for authentication protocol + * @uname: user name attempting to authenticate + * @aname: mount specifier for remote server + * @n_uname: numeric id for user attempting to authneticate + * @dotu: 9P2000.u extension flag + * + */ + struct p9_fcall *p9_create_tauth(u32 afid, char *uname, char *aname, u32 n_uname, int dotu) { @@ -580,6 +602,18 @@ error: } EXPORT_SYMBOL(p9_create_tauth); +/** + * p9_create_tattach - allocates and creates a T_ATTACH request + * @fid: handle to use for the new mount point + * @afid: handle to use for authentication protocol + * @uname: user name attempting to attach + * @aname: mount specifier for remote server + * @n_uname: numeric id for user attempting to attach + * @n_uname: numeric id for user attempting to attach + * @dotu: 9P2000.u extension flag + * + */ + struct p9_fcall * p9_create_tattach(u32 fid, u32 afid, char *uname, char *aname, u32 n_uname, int dotu) @@ -616,6 +650,12 @@ error: } EXPORT_SYMBOL(p9_create_tattach); +/** + * p9_create_tflush - allocates and creates a T_FLUSH request + * @oldtag: tag id for the transaction we are attempting to cancel + * + */ + struct p9_fcall *p9_create_tflush(u16 oldtag) { int size; @@ -639,6 +679,15 @@ error: } EXPORT_SYMBOL(p9_create_tflush); +/** + * p9_create_twalk - allocates and creates a T_FLUSH request + * @fid: handle we are traversing from + * @newfid: a new handle 
for this transaction + * @nwname: number of path elements to traverse + * @wnames: array of path elements + * + */ + struct p9_fcall *p9_create_twalk(u32 fid, u32 newfid, u16 nwname, char **wnames) { @@ -677,6 +726,13 @@ error: } EXPORT_SYMBOL(p9_create_twalk); +/** + * p9_create_topen - allocates and creates a T_OPEN request + * @fid: handle we are trying to open + * @mode: what mode we are trying to open the file in + * + */ + struct p9_fcall *p9_create_topen(u32 fid, u8 mode) { int size; @@ -701,6 +757,19 @@ error: } EXPORT_SYMBOL(p9_create_topen); +/** + * p9_create_tcreate - allocates and creates a T_CREATE request + * @fid: handle of directory we are trying to create in + * @name: name of the file we are trying to create + * @perm: permissions for the file we are trying to create + * @mode: what mode we are trying to open the file in + * @extension: 9p2000.u extension string (for special files) + * @dotu: 9p2000.u enabled flag + * + * Note: Plan 9 create semantics include opening the resulting file + * which is why mode is included. + */ + struct p9_fcall *p9_create_tcreate(u32 fid, char *name, u32 perm, u8 mode, char *extension, int dotu) { @@ -736,6 +805,13 @@ error: } EXPORT_SYMBOL(p9_create_tcreate); +/** + * p9_create_tread - allocates and creates a T_READ request + * @fid: handle of the file we are trying to read + * @offset: offset to start reading from + * @count: how many bytes to read + */ + struct p9_fcall *p9_create_tread(u32 fid, u64 offset, u32 count) { int size; @@ -761,6 +837,17 @@ error: } EXPORT_SYMBOL(p9_create_tread); +/** + * p9_create_twrite - allocates and creates a T_WRITE request from the kernel + * @fid: handle of the file we are trying to write + * @offset: offset to start writing at + * @count: how many bytes to write + * @data: data to write + * + * This function will create a requst with data buffers from the kernel + * such as the page cache. + */ + struct p9_fcall *p9_create_twrite(u32 fid, u64 offset, u32 count, const char *data) { @@ -794,6 +881,16 @@ error: } EXPORT_SYMBOL(p9_create_twrite); +/** + * p9_create_twrite_u - allocates and creates a T_WRITE request from userspace + * @fid: handle of the file we are trying to write + * @offset: offset to start writing at + * @count: how many bytes to write + * @data: data to write + * + * This function will create a request with data buffers from userspace + */ + struct p9_fcall *p9_create_twrite_u(u32 fid, u64 offset, u32 count, const char __user *data) { @@ -827,6 +924,14 @@ error: } EXPORT_SYMBOL(p9_create_twrite_u); +/** + * p9_create_tclunk - allocate a request to forget about a file handle + * @fid: handle of the file we closing or forgetting about + * + * clunk is used both to close open files and to discard transient handles + * which may be created during meta-data operations and hierarchy traversal. 
+ */ + struct p9_fcall *p9_create_tclunk(u32 fid) { int size; @@ -850,6 +955,12 @@ error: } EXPORT_SYMBOL(p9_create_tclunk); +/** + * p9_create_tremove - allocate and create a request to remove a file + * @fid: handle of the file or directory we are removing + * + */ + struct p9_fcall *p9_create_tremove(u32 fid) { int size; @@ -873,6 +984,12 @@ error: } EXPORT_SYMBOL(p9_create_tremove); +/** + * p9_create_tstat - allocate and populate a request for attributes + * @fid: handle of the file or directory we are trying to get the attributes of + * + */ + struct p9_fcall *p9_create_tstat(u32 fid) { int size; @@ -896,6 +1013,14 @@ error: } EXPORT_SYMBOL(p9_create_tstat); +/** + * p9_create_tstat - allocate and populate a request to change attributes + * @fid: handle of the file or directory we are trying to change + * @wstat: &p9_stat structure with attributes we wish to set + * @dotu: 9p2000.u enabled flag + * + */ + struct p9_fcall *p9_create_twstat(u32 fid, struct p9_wstat *wstat, int dotu) { @@ -922,3 +1047,4 @@ error: return fc; } EXPORT_SYMBOL(p9_create_twstat); + diff --git a/net/9p/error.c b/net/9p/error.c index 64104b9cb42..388770c3631 100644 --- a/net/9p/error.c +++ b/net/9p/error.c @@ -33,6 +33,13 @@ #include #include +/** + * struct errormap - map string errors from Plan 9 to Linux numeric ids + * @name: string sent over 9P + * @val: numeric id most closely representing @name + * @namelen: length of string + * @list: hash-table list for string lookup + */ struct errormap { char *name; int val; @@ -177,8 +184,7 @@ static struct errormap errmap[] = { }; /** - * p9_error_init - preload - * @errstr: error string + * p9_error_init - preload mappings into hash list * */ @@ -206,6 +212,7 @@ EXPORT_SYMBOL(p9_error_init); /** * errstr2errno - convert error string to error number * @errstr: error string + * @len: length of error string * */ diff --git a/net/9p/fcprint.c b/net/9p/fcprint.c index 40244fbd9b0..53dd8e28dd8 100644 --- a/net/9p/fcprint.c +++ b/net/9p/fcprint.c @@ -142,6 +142,14 @@ p9_printdata(char *buf, int buflen, u8 *data, int datalen) return p9_dumpdata(buf, buflen, data, datalen < 16?datalen:16); } +/** + * p9_printfcall - decode and print a protocol structure into a buffer + * @buf: buffer to deposit decoded structure into + * @buflen: available space in buffer + * @fc: protocol rpc structure of type &p9_fcall + * @extended: whether or not session is operating with extended protocol + */ + int p9_printfcall(char *buf, int buflen, struct p9_fcall *fc, int extended) { diff --git a/net/9p/mod.c b/net/9p/mod.c index c285aab2af0..c6d9695949e 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -39,9 +39,6 @@ module_param_named(debug, p9_debug_level, uint, 0); MODULE_PARM_DESC(debug, "9P debugging level"); #endif -extern int p9_mux_global_init(void); -extern void p9_mux_global_exit(void); - /* * Dynamic Transport Registration Routines * @@ -52,7 +49,7 @@ static struct p9_trans_module *v9fs_default_transport; /** * v9fs_register_trans - register a new transport with 9p - * @m - structure describing the transport module and entry points + * @m: structure describing the transport module and entry points * */ void v9fs_register_trans(struct p9_trans_module *m) @@ -65,7 +62,7 @@ EXPORT_SYMBOL(v9fs_register_trans); /** * v9fs_match_trans - match transport versus registered transports - * @arg: string identifying transport + * @name: string identifying transport * */ struct p9_trans_module *v9fs_match_trans(const substring_t *name) diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 
f624dff7685..c6eda999fa7 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -47,12 +47,29 @@ #define SCHED_TIMEOUT 10 #define MAXPOLLWADDR 2 +/** + * struct p9_fd_opts - per-transport options + * @rfd: file descriptor for reading (trans=fd) + * @wfd: file descriptor for writing (trans=fd) + * @port: port to connect to (trans=tcp) + * + */ + struct p9_fd_opts { int rfd; int wfd; u16 port; }; + +/** + * struct p9_trans_fd - transport state + * @rd: reference to file to read from + * @wr: reference of file to write to + * @conn: connection state reference + * + */ + struct p9_trans_fd { struct file *rd; struct file *wr; @@ -90,10 +107,24 @@ enum { }; struct p9_req; - typedef void (*p9_conn_req_callback)(struct p9_req *req, void *a); + +/** + * struct p9_req - fd mux encoding of an rpc transaction + * @lock: protects req_list + * @tag: numeric tag for rpc transaction + * @tcall: request &p9_fcall structure + * @rcall: response &p9_fcall structure + * @err: error state + * @cb: callback for when response is received + * @cba: argument to pass to callback + * @flush: flag to indicate RPC has been flushed + * @req_list: list link for higher level objects to chain requests + * + */ + struct p9_req { - spinlock_t lock; /* protect request structure */ + spinlock_t lock; int tag; struct p9_fcall *tcall; struct p9_fcall *rcall; @@ -104,7 +135,39 @@ struct p9_req { struct list_head req_list; }; -struct p9_mux_poll_task; +struct p9_mux_poll_task { + struct task_struct *task; + struct list_head mux_list; + int muxnum; +}; + +/** + * struct p9_conn - fd mux connection state information + * @lock: protects mux_list (?) + * @mux_list: list link for mux to manage multiple connections (?) + * @poll_task: task polling on this connection + * @msize: maximum size for connection (dup) + * @extended: 9p2000.u flag (dup) + * @trans: reference to transport instance for this connection + * @tagpool: id accounting for transactions + * @err: error state + * @equeue: event wait_q (?) + * @req_list: accounting for requests which have been sent + * @unsent_req_list: accounting for requests that haven't been sent + * @rcall: current response &p9_fcall structure + * @rpos: read position in current frame + * @rbuf: current read buffer + * @wpos: write position for current frame + * @wsize: amount of data to write for current frame + * @wbuf: current write buffer + * @poll_wait: array of wait_q's for various worker threads + * @poll_waddr: ???? + * @pt: poll state + * @rq: current read work + * @wq: current write work + * @wsched: ???? + * + */ struct p9_conn { spinlock_t lock; /* protect lock structure */ @@ -132,11 +195,16 @@ struct p9_conn { unsigned long wsched; }; -struct p9_mux_poll_task { - struct task_struct *task; - struct list_head mux_list; - int muxnum; -}; +/** + * struct p9_mux_rpc - fd mux rpc accounting structure + * @m: connection this request was issued on + * @err: error state + * @tcall: request &p9_fcall + * @rcall: response &p9_fcall + * @wqueue: wait queue that client is blocked on for this rpc + * + * Bug: isn't this information duplicated elsewhere like &p9_req + */ struct p9_mux_rpc { struct p9_conn *m; @@ -207,10 +275,12 @@ static void p9_mux_put_tag(struct p9_conn *m, u16 tag) /** * p9_mux_calc_poll_procs - calculates the number of polling procs - * based on the number of mounted v9fs filesystems. + * @muxnum: number of mounts * + * Calculation is based on the number of mounted v9fs filesystems. * The current implementation returns sqrt of the number of mounts. 
*/ + static int p9_mux_calc_poll_procs(int muxnum) { int n; @@ -331,12 +401,11 @@ static void p9_mux_poll_stop(struct p9_conn *m) /** * p9_conn_create - allocate and initialize the per-session mux data - * Creates the polling task if this is the first session. + * @trans: transport structure * - * @trans - transport structure - * @msize - maximum message size - * @extended - extended flag + * Note: Creates the polling task if this is the first session. */ + static struct p9_conn *p9_conn_create(struct p9_trans *trans) { int i, n; @@ -406,7 +475,10 @@ static struct p9_conn *p9_conn_create(struct p9_trans *trans) /** * p9_mux_destroy - cancels all pending requests and frees mux resources + * @m: mux to destroy + * */ + static void p9_conn_destroy(struct p9_conn *m) { P9_DPRINTK(P9_DEBUG_MUX, "mux %p prev %p next %p\n", m, @@ -429,9 +501,14 @@ static void p9_conn_destroy(struct p9_conn *m) } /** - * p9_pollwait - called by files poll operation to add v9fs-poll task - * to files wait queue + * p9_pollwait - add poll task to the wait queue + * @filp: file pointer being polled + * @wait_address: wait_q to block on + * @p: poll state + * + * called by files poll operation to add v9fs-poll task to files wait queue */ + static void p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { @@ -462,7 +539,10 @@ p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) /** * p9_poll_mux - polls a mux and schedules read or write works if necessary + * @m: connection to poll + * */ + static void p9_poll_mux(struct p9_conn *m) { int n; @@ -499,9 +579,14 @@ static void p9_poll_mux(struct p9_conn *m) } /** - * p9_poll_proc - polls all v9fs transports for new events and queues - * the appropriate work to the work queue + * p9_poll_proc - poll worker thread + * @a: thread state and arguments + * + * polls all v9fs transports for new events and queues the appropriate + * work to the work queue + * */ + static int p9_poll_proc(void *a) { struct p9_conn *m, *mtmp; @@ -527,7 +612,10 @@ static int p9_poll_proc(void *a) /** * p9_write_work - called when a transport can send some data + * @work: container for work to be done + * */ + static void p9_write_work(struct work_struct *work) { int n, err; @@ -638,7 +726,10 @@ static void process_request(struct p9_conn *m, struct p9_req *req) /** * p9_read_work - called when there is some data to be read from a transport + * @work: container of work to be done + * */ + static void p9_read_work(struct work_struct *work) { int n, err; @@ -793,7 +884,9 @@ error: * @tc: request to be sent * @cb: callback function to call when response is received * @cba: parameter to pass to the callback function + * */ + static struct p9_req *p9_send_request(struct p9_conn *m, struct p9_fcall *tc, p9_conn_req_callback cb, void *cba) @@ -961,10 +1054,12 @@ p9_conn_rpc_cb(struct p9_req *req, void *a) /** * p9_fd_rpc- sends 9P request and waits until a response is available. * The function can be interrupted. 
- * @m: mux data + * @t: transport data * @tc: request to be sent * @rc: pointer where a pointer to the response is stored + * */ + int p9_fd_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc) { @@ -1041,8 +1136,10 @@ p9_fd_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc) * @m: mux data * @tc: request to be sent * @cb: callback function to be called when response arrives - * @cba: value to pass to the callback function + * @a: value to pass to the callback function + * */ + int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc, p9_conn_req_callback cb, void *a) { @@ -1065,7 +1162,9 @@ int p9_conn_rpcnb(struct p9_conn *m, struct p9_fcall *tc, * p9_conn_cancel - cancel all pending requests with error * @m: mux data * @err: error code + * */ + void p9_conn_cancel(struct p9_conn *m, int err) { struct p9_req *req, *rtmp; @@ -1099,7 +1198,7 @@ void p9_conn_cancel(struct p9_conn *m, int err) /** * v9fs_parse_options - parse mount options into session structure * @options: options string passed from mount - * @v9ses: existing v9fs session information + * @opts: transport-specific structure to parse options into * */ @@ -1193,11 +1292,12 @@ static int p9_socket_open(struct p9_trans *trans, struct socket *csocket) /** * p9_fd_read- read from a fd - * @v9ses: session information + * @trans: transport instance state * @v: buffer to receive data into * @len: size of receive buffer * */ + static int p9_fd_read(struct p9_trans *trans, void *v, int len) { int ret; @@ -1220,11 +1320,12 @@ static int p9_fd_read(struct p9_trans *trans, void *v, int len) /** * p9_fd_write - write to a socket - * @v9ses: session information + * @trans: transport instance state * @v: buffer to send data from * @len: size of send buffer * */ + static int p9_fd_write(struct p9_trans *trans, void *v, int len) { int ret; @@ -1296,6 +1397,7 @@ end: * @trans: private socket structure * */ + static void p9_fd_close(struct p9_trans *trans) { struct p9_trans_fd *ts; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index de7a9f532ed..0bab1f23590 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -55,23 +55,69 @@ static int chan_index; #define P9_INIT_MAXTAG 16 -#define REQ_STATUS_IDLE 0 -#define REQ_STATUS_SENT 1 -#define REQ_STATUS_RCVD 2 -#define REQ_STATUS_FLSH 3 + +/** + * enum p9_req_status_t - virtio request status + * @REQ_STATUS_IDLE: request slot unused + * @REQ_STATUS_SENT: request sent to server + * @REQ_STATUS_RCVD: response received from server + * @REQ_STATUS_FLSH: request has been flushed + * + * The @REQ_STATUS_IDLE state is used to mark a request slot as unused + * but use is actually tracked by the idpool structure which handles tag + * id allocation. + * + */ + +enum p9_req_status_t { + REQ_STATUS_IDLE, + REQ_STATUS_SENT, + REQ_STATUS_RCVD, + REQ_STATUS_FLSH, +}; + +/** + * struct p9_req_t - virtio request slots + * @status: status of this request slot + * @wq: wait_queue for the client to block on for this request + * + * The virtio transport uses an array to track outstanding requests + * instead of a list. While this may incurr overhead during initial + * allocation or expansion, it makes request lookup much easier as the + * tag id is a index into an array. (We use tag+1 so that we can accomodate + * the -1 tag for the T_VERSION request). + * This also has the nice effect of only having to allocate wait_queues + * once, instead of constantly allocating and freeing them. Its possible + * other resources could benefit from this scheme as well. 
+ * + */ struct p9_req_t { int status; wait_queue_head_t *wq; }; -/* We keep all per-channel information in a structure. +/** + * struct virtio_chan - per-instance transport information + * @initialized: whether the channel is initialized + * @inuse: whether the channel is in use + * @lock: protects multiple elements within this structure + * @vdev: virtio dev associated with this channel + * @vq: virtio queue associated with this channel + * @tagpool: accounting for tag ids (and request slots) + * @reqs: array of request slots + * @max_tag: current number of request_slots allocated + * @sg: scatter gather list which is used to pack a request (protected?) + * + * We keep all per-channel information in a structure. * This structure is allocated within the devices dev->mem space. * A pointer to the structure will get put in the transport private. + * */ + static struct virtio_chan { - bool initialized; /* channel is initialized */ - bool inuse; /* channel is in use */ + bool initialized; + bool inuse; spinlock_t lock; @@ -86,7 +132,19 @@ static struct virtio_chan { struct scatterlist sg[VIRTQUEUE_NUM]; } channels[MAX_9P_CHAN]; -/* Lookup requests by tag */ +/** + * p9_lookup_tag - Lookup requests by tag + * @c: virtio channel to lookup tag within + * @tag: numeric id for transaction + * + * this is a simple array lookup, but will grow the + * request_slots as necessary to accomodate transaction + * ids which did not previously have a slot. + * + * Bugs: there is currently no upper limit on request slots set + * here, but that should be constrained by the id accounting. + */ + static struct p9_req_t *p9_lookup_tag(struct virtio_chan *c, u16 tag) { /* This looks up the original request by tag so we know which @@ -130,6 +188,15 @@ static unsigned int rest_of_page(void *data) return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); } +/** + * p9_virtio_close - reclaim resources of a channel + * @trans: transport state + * + * This reclaims a channel by freeing its resources and + * reseting its inuse flag. + * + */ + static void p9_virtio_close(struct p9_trans *trans) { struct virtio_chan *chan = trans->priv; @@ -151,6 +218,19 @@ static void p9_virtio_close(struct p9_trans *trans) kfree(trans); } +/** + * req_done - callback which signals activity from the server + * @vq: virtio queue activity was received on + * + * This notifies us that the server has triggered some activity + * on the virtio channel - most likely a response to request we + * sent. Figure out which requests now have responses and wake up + * those threads. + * + * Bugs: could do with some additional sanity checking, but appears to work. + * + */ + static void req_done(struct virtqueue *vq) { struct virtio_chan *chan = vq->vdev->priv; @@ -169,6 +249,20 @@ static void req_done(struct virtqueue *vq) spin_unlock_irqrestore(&chan->lock, flags); } +/** + * pack_sg_list - pack a scatter gather list from a linear buffer + * @sg: scatter/gather list to pack into + * @start: which segment of the sg_list to start at + * @limit: maximum segment to pack data to + * @data: data to pack into scatter/gather list + * @count: amount of data to pack into the scatter/gather list + * + * sg_lists have multiple segments of various sizes. This will pack + * arbitrary data into an existing scatter gather list, segmenting the + * data as necessary within constraints. 
+ * + */ + static int pack_sg_list(struct scatterlist *sg, int start, int limit, char *data, int count) @@ -189,6 +283,14 @@ pack_sg_list(struct scatterlist *sg, int start, int limit, char *data, return index-start; } +/** + * p9_virtio_rpc - issue a request and wait for a response + * @t: transport state + * @tc: &p9_fcall request to transmit + * @rc: &p9_fcall to put reponse into + * + */ + static int p9_virtio_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc) { @@ -263,6 +365,16 @@ p9_virtio_rpc(struct p9_trans *t, struct p9_fcall *tc, struct p9_fcall **rc) return 0; } +/** + * p9_virtio_probe - probe for existence of 9P virtio channels + * @vdev: virtio device to probe + * + * This probes for existing virtio channels. At present only + * a single channel is in use, so in the future more work may need + * to be done here. + * + */ + static int p9_virtio_probe(struct virtio_device *vdev) { int err; @@ -307,11 +419,28 @@ fail: return err; } -/* This sets up a transport channel for 9p communication. Right now + +/** + * p9_virtio_create - allocate a new virtio channel + * @devname: string identifying the channel to connect to (unused) + * @args: args passed from sys_mount() for per-transport options (unused) + * @msize: requested maximum packet size + * @extended: 9p2000.u enabled flag + * + * This sets up a transport channel for 9p communication. Right now * we only match the first available channel, but eventually we couldlook up * alternate channels by matching devname versus a virtio_config entry. * We use a simple reference count mechanism to ensure that only a single - * mount has a channel open at a time. */ + * mount has a channel open at a time. + * + * Bugs: doesn't allow identification of a specific channel + * to allocate, channels are allocated sequentially. This was + * a pragmatic decision to get things rolling, but ideally some + * way of identifying the channel to attach to would be nice + * if we are going to support multiple channels. 
+ * + */ + static struct p9_trans * p9_virtio_create(const char *devname, char *args, int msize, unsigned char extended) @@ -360,6 +489,12 @@ p9_virtio_create(const char *devname, char *args, int msize, return trans; } +/** + * p9_virtio_remove - clean up resources associated with a virtio device + * @vdev: virtio device to remove + * + */ + static void p9_virtio_remove(struct virtio_device *vdev) { struct virtio_chan *chan = vdev->priv; diff --git a/net/9p/util.c b/net/9p/util.c index ef7215565d8..4d564604533 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -32,11 +32,23 @@ #include #include +/** + * struct p9_idpool - per-connection accounting for tag idpool + * @lock: protects the pool + * @pool: idr to allocate tag id from + * + */ + struct p9_idpool { spinlock_t lock; struct idr pool; }; +/** + * p9_idpool_create - create a new per-connection id pool + * + */ + struct p9_idpool *p9_idpool_create(void) { struct p9_idpool *p; @@ -52,6 +64,11 @@ struct p9_idpool *p9_idpool_create(void) } EXPORT_SYMBOL(p9_idpool_create); +/** + * p9_idpool_destroy - create a new per-connection id pool + * @p: idpool to destory + */ + void p9_idpool_destroy(struct p9_idpool *p) { idr_destroy(&p->pool); @@ -61,9 +78,9 @@ EXPORT_SYMBOL(p9_idpool_destroy); /** * p9_idpool_get - allocate numeric id from pool - * @p - pool to allocate from + * @p: pool to allocate from * - * XXX - This seems to be an awful generic function, should it be in idr.c with + * Bugs: This seems to be an awful generic function, should it be in idr.c with * the lock included in struct idr? */ @@ -94,9 +111,10 @@ EXPORT_SYMBOL(p9_idpool_get); /** * p9_idpool_put - release numeric id from pool - * @p - pool to allocate from + * @id: numeric id which is being released + * @p: pool to release id into * - * XXX - This seems to be an awful generic function, should it be in idr.c with + * Bugs: This seems to be an awful generic function, should it be in idr.c with * the lock included in struct idr? */ @@ -111,11 +129,13 @@ EXPORT_SYMBOL(p9_idpool_put); /** * p9_idpool_check - check if the specified id is available - * @id - id to check - * @p - pool + * @id: id to check + * @p: pool to check */ + int p9_idpool_check(int id, struct p9_idpool *p) { return idr_find(&p->pool, id) != NULL; } EXPORT_SYMBOL(p9_idpool_check); + -- cgit v1.2.3-70-g09d2 From ab31267dfeddf80b2e483f077c8b03905993722b Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Thu, 6 Mar 2008 17:10:28 -0600 Subject: fs/9p/v9fs.c (v9fs_parse_options): Handle kstrdup and match_strdup failure. Now that this function can fail, return an int, diagnose other option-parsing failures, and adjust the sole caller: (v9fs_session_init): Handle kstrdup failure. Propagate any new v9fs_parse_options failure "up". Signed-off-by: Jim Meyering Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Acked-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 79d310c0018..5c1ccaf0416 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -73,16 +73,17 @@ static match_table_t tokens = { * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information * + * Return 0 upon success, -ERRNO upon failure. 
*/ -static void v9fs_parse_options(struct v9fs_session_info *v9ses) +static int v9fs_parse_options(struct v9fs_session_info *v9ses) { char *options; substring_t args[MAX_OPT_ARGS]; char *p; int option = 0; char *s, *e; - int ret; + int ret = 0; /* setup defaults */ v9ses->afid = ~0; @@ -90,19 +91,26 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses) v9ses->cache = 0; if (!v9ses->options) - return; + return 0; options = kstrdup(v9ses->options, GFP_KERNEL); + if (!options) { + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + return -ENOMEM; + } + while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, tokens, args); if (token < Opt_uname) { - ret = match_int(&args[0], &option); - if (ret < 0) { + int r = match_int(&args[0], &option); + if (r < 0) { P9_DPRINTK(P9_DEBUG_ERROR, "integer field, but no integer?\n"); + ret = r; continue; } } @@ -138,6 +146,13 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses) case Opt_access: s = match_strdup(&args[0]); + if (!s) { + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy" + " of option argument\n"); + ret = -ENOMEM; + break; + } v9ses->flags &= ~V9FS_ACCESS_MASK; if (strcmp(s, "user") == 0) v9ses->flags |= V9FS_ACCESS_USER; @@ -157,6 +172,7 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses) } } kfree(options); + return ret; } /** @@ -172,6 +188,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, { int retval = -EINVAL; struct p9_fid *fid; + int rc; v9ses->uname = __getname(); if (!v9ses->uname) @@ -190,7 +207,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; v9ses->options = kstrdup(data, GFP_KERNEL); - v9fs_parse_options(v9ses); + if (!v9ses->options) { + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + retval = -ENOMEM; + goto error; + } + + rc = v9fs_parse_options(v9ses); + if (rc < 0) { + retval = rc; + goto error; + } v9ses->clnt = p9_client_create(dev_name, v9ses->options); -- cgit v1.2.3-70-g09d2 From 887b3ece65be7b643dfdae0d433c91a26a3f437d Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Thu, 8 May 2008 20:26:37 -0500 Subject: 9p: fix error path during early mount There was some cleanup issues during early mount which would trigger a kernel bug for certain types of failure. This patch reorganizes the cleanup to get rid of the bad behavior. This also merges the 9pnet and 9pnet_fd modules for the purpose of configuration and initialization. Keeping the fd transport separate from the core 9pnet code seemed like a good idea at the time, but in practice has caused more harm and confusion than good. 
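The reorganized unwind in v9fs_get_sb() follows the usual kernel error-path idiom: acquire resources in order, and on failure jump to a label that releases only what has already been set up, falling through the remaining release labels. A generic sketch of the pattern (hypothetical helper names, not the patch code itself):

    static int do_setup(void)
    {
            int err;

            err = acquire_first();          /* e.g. the 9p session */
            if (err)
                    goto out;

            err = acquire_second();         /* e.g. root fid and stat */
            if (err)
                    goto undo_first;

            err = acquire_third();          /* e.g. superblock and root dentry */
            if (err)
                    goto undo_second;

            return 0;                       /* success: keep everything */

    undo_second:
            release_second();
    undo_first:
            release_first();
    out:
            return err;
    }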
Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 13 +++++++------ fs/9p/vfs_super.c | 34 ++++++++++++++++------------------ include/net/9p/9p.h | 1 + include/net/9p/transport.h | 1 - net/9p/Kconfig | 10 ---------- net/9p/Makefile | 3 --- net/9p/mod.c | 1 + net/9p/trans_fd.c | 29 ++++++++++++++++++++++------- 8 files changed, 47 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 5c1ccaf0416..047c791427a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -206,12 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; - v9ses->options = kstrdup(data, GFP_KERNEL); - if (!v9ses->options) { - P9_DPRINTK(P9_DEBUG_ERROR, + if (data) { + v9ses->options = kstrdup(data, GFP_KERNEL); + if (!v9ses->options) { + P9_DPRINTK(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); - retval = -ENOMEM; - goto error; + retval = -ENOMEM; + goto error; + } } rc = v9fs_parse_options(v9ses); @@ -260,7 +262,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return fid; error: - v9fs_session_close(v9ses); return ERR_PTR(retval); } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index ba10f172626..bf59c396049 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -128,29 +128,26 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); - fid = NULL; - kfree(v9ses); - v9ses = NULL; - goto error; + goto close_session; } st = p9_client_stat(fid); if (IS_ERR(st)) { retval = PTR_ERR(st); - goto error; + goto clunk_fid; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); - goto error; + goto free_stat; } v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { retval = PTR_ERR(inode); - goto error; + goto release_sb; } inode->i_uid = uid; @@ -159,7 +156,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, root = d_alloc_root(inode); if (!root) { retval = -ENOMEM; - goto error; + goto release_sb; } sb->s_root = root; @@ -170,21 +167,22 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, return simple_set_mnt(mnt, sb); -error: - kfree(st); - if (fid) - p9_client_clunk(fid); - - if (v9ses) { - v9fs_session_close(v9ses); - kfree(v9ses); - } - +release_sb: if (sb) { up_write(&sb->s_umount); deactivate_super(sb); } +free_stat: + kfree(st); + +clunk_fid: + p9_client_clunk(fid); + +close_session: + v9fs_session_close(v9ses); + kfree(v9ses); + return retval; } diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index 7bfb2f2e423..b3d3e27c629 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -595,4 +595,5 @@ int p9_idpool_check(int id, struct p9_idpool *p); int p9_error_init(void); int p9_errstr2errno(char *, int); +int p9_trans_fd_init(void); #endif /* NET_9P_H */ diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 240e0de888c..0db3a4038dc 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -96,5 +96,4 @@ struct p9_trans_module { void v9fs_register_trans(struct p9_trans_module *m); struct p9_trans_module *v9fs_match_trans(const substring_t *name); struct p9_trans_module *v9fs_default_trans(void); - #endif /* NET_9P_TRANSPORT_H */ diff --git a/net/9p/Kconfig b/net/9p/Kconfig index bafc50c9e6f..ff34c5acc13 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -13,16 +13,6 @@ 
menuconfig NET_9P If unsure, say N. -config NET_9P_FD - depends on NET_9P - default y if NET_9P - tristate "9P File Descriptor Transports (Experimental)" - help - This builds support for file descriptor transports for 9p - which includes support for TCP/IP, named pipes, or passed - file descriptors. TCP/IP is the default transport for 9p, - so if you are going to use 9p, you'll likely want this. - config NET_9P_VIRTIO depends on NET_9P && EXPERIMENTAL && VIRTIO tristate "9P Virtio Transport (Experimental)" diff --git a/net/9p/Makefile b/net/9p/Makefile index 8a105110189..519219480db 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -1,5 +1,4 @@ obj-$(CONFIG_NET_9P) := 9pnet.o -obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o 9pnet-objs := \ @@ -9,8 +8,6 @@ obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o error.o \ fcprint.o \ util.o \ - -9pnet_fd-objs := \ trans_fd.o \ 9pnet_virtio-objs := \ diff --git a/net/9p/mod.c b/net/9p/mod.c index c6d9695949e..bdee1fb7cc6 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -107,6 +107,7 @@ static int __init init_p9(void) p9_error_init(); printk(KERN_INFO "Installing 9P2000 support\n"); + p9_trans_fd_init(); return ret; } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 97b103b7049..4507f744f44 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -1433,6 +1433,23 @@ static void p9_fd_close(struct p9_trans *trans) kfree(ts); } +/* + * stolen from NFS - maybe should be made a generic function? + */ +static inline int valid_ipaddr4(const char *buf) +{ + int rc, count, in[4]; + + rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); + if (rc != 4) + return -EINVAL; + for (count = 0; count < 4; count++) { + if (in[count] > 255) + return -EINVAL; + } + return 0; +} + static struct p9_trans * p9_trans_create_tcp(const char *addr, char *args, int msize, unsigned char dotu) { @@ -1447,6 +1464,9 @@ p9_trans_create_tcp(const char *addr, char *args, int msize, unsigned char dotu) if (err < 0) return ERR_PTR(err); + if (valid_ipaddr4(addr) < 0) + return ERR_PTR(-EINVAL); + csocket = NULL; trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); if (!trans) @@ -1625,7 +1645,7 @@ static struct p9_trans_module p9_fd_trans = { .create = p9_trans_create_fd, }; -static int __init p9_trans_fd_init(void) +int p9_trans_fd_init(void) { int ret = p9_mux_global_init(); if (ret) { @@ -1639,9 +1659,4 @@ static int __init p9_trans_fd_init(void) return 0; } - -module_init(p9_trans_fd_init); - -MODULE_AUTHOR("Latchesar Ionkov "); -MODULE_AUTHOR("Eric Van Hensbergen "); -MODULE_LICENSE("GPL"); +EXPORT_SYMBOL(p9_trans_fd_init); -- cgit v1.2.3-70-g09d2 From 646dd539878a194bc14b104621c0b2b33587e40f Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 May 2008 01:50:56 +0000 Subject: [CIFS] Fix paths when share is in DFS to include proper prefix Some versions of Samba (3.2-pre e.g.) are stricter about checking to make sure that paths in DFS name spaces are sent in the form \\server\share\dir\subdir ... 
instead of \dir\subdir Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 3 --- fs/cifs/connect.c | 26 +++------------------ fs/cifs/dir.c | 28 ++++++++++++++++++----- fs/cifs/inode.c | 65 +++++++---------------------------------------------- fs/cifs/link.c | 42 +++------------------------------- 5 files changed, 37 insertions(+), 127 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 08248e85b78..845b18e1abe 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -150,9 +150,6 @@ extern int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, unsigned int *number_of_UNC_in_array, const struct nls_table *nls_codepage, int remap); -extern int connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo, - const char *old_path, - const struct nls_table *nls_codepage, int remap); extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, const struct nls_table *nls_codepage, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7c2e5ea0330..d5747e30f1c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1418,27 +1418,6 @@ find_unc(__be32 new_target_ip_addr, char *uncName, char *userName) return NULL; } -int -connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo, - const char *old_path, const struct nls_table *nls_codepage, - int remap) -{ - struct dfs_info3_param *referrals = NULL; - unsigned int num_referrals; - int rc = 0; - - rc = get_dfs_path(xid, pSesInfo, old_path, nls_codepage, - &num_referrals, &referrals, remap); - - /* BB Add in code to: if valid refrl, if not ip address contact - the helper that resolves tcp names, mount to it, try to - tcon to it unmount it if fail */ - - kfree(referrals); - - return rc; -} - int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, const struct nls_table *nls_codepage, unsigned int *pnum_referrals, @@ -2161,10 +2140,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, if ((strchr(volume_info.UNC + 3, '\\') == NULL) && (strchr(volume_info.UNC + 3, '/') == NULL)) { - rc = connect_to_dfs_path(xid, pSesInfo, +/* rc = connect_to_dfs_path(xid, pSesInfo, "", cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + CIFS_MOUNT_MAP_SPECIAL_CHR);*/ + cFYI(1, ("DFS root not supported")); rc = -ENODEV; goto out; } else { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e4e0078a052..05afe33ea64 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -49,18 +49,25 @@ build_path_from_dentry(struct dentry *direntry) struct dentry *temp; int namelen; int pplen; + int dfsplen; char *full_path; char dirsep; + struct cifs_sb_info *cifs_sb; if (direntry == NULL) return NULL; /* not much we can do if dentry is freed and we need to reopen the file after it was closed implicitly when the server crashed */ - dirsep = CIFS_DIR_SEP(CIFS_SB(direntry->d_sb)); - pplen = CIFS_SB(direntry->d_sb)->prepathlen; + cifs_sb = CIFS_SB(direntry->d_sb); + dirsep = CIFS_DIR_SEP(cifs_sb); + pplen = cifs_sb->prepathlen; + if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS)) + dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1); + else + dfsplen = 0; cifs_bp_rename_retry: - namelen = pplen; + namelen = pplen + dfsplen; for (temp = direntry; !IS_ROOT(temp);) { namelen += (1 + temp->d_name.len); temp = temp->d_parent; @@ -91,7 +98,7 @@ cifs_bp_rename_retry: return NULL; } } - if (namelen != pplen) { + if (namelen != pplen + dfsplen) { cERROR(1, ("did not end path lookup where expected namelen is %d", 
namelen)); @@ -107,7 +114,18 @@ cifs_bp_rename_retry: since the '\' is a valid posix character so we can not switch those safely to '/' if any are found in the middle of the prepath */ /* BB test paths to Windows with '/' in the midst of prepath */ - strncpy(full_path, CIFS_SB(direntry->d_sb)->prepath, pplen); + + if (dfsplen) { + strncpy(full_path, cifs_sb->tcon->treeName, dfsplen); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { + int i; + for (i = 0; i < dfsplen; i++) { + if (full_path[i] == '\\') + full_path[i] = '/'; + } + } + } + strncpy(full_path + dfsplen, CIFS_SB(direntry->d_sb)->prepath, pplen); return full_path; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2d53b436d51..9d9b56a9c08 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -161,52 +161,18 @@ static void cifs_unix_info_to_inode(struct inode *inode, spin_unlock(&inode->i_lock); } -static const unsigned char *cifs_get_search_path(struct cifs_sb_info *cifs_sb, - const char *search_path) -{ - int tree_len; - int path_len; - int i; - char *tmp_path; - struct cifsTconInfo *pTcon = cifs_sb->tcon; - - if (!(pTcon->Flags & SMB_SHARE_IS_IN_DFS)) - return search_path; - - /* use full path name for working with DFS */ - tree_len = strnlen(pTcon->treeName, MAX_TREE_SIZE + 1); - path_len = strnlen(search_path, MAX_PATHCONF); - - tmp_path = kmalloc(tree_len+path_len+1, GFP_KERNEL); - if (tmp_path == NULL) - return search_path; - - strncpy(tmp_path, pTcon->treeName, tree_len); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) - for (i = 0; i < tree_len; i++) { - if (tmp_path[i] == '\\') - tmp_path[i] = '/'; - } - strncpy(tmp_path+tree_len, search_path, path_len); - tmp_path[tree_len+path_len] = 0; - return tmp_path; -} - int cifs_get_inode_info_unix(struct inode **pinode, - const unsigned char *search_path, struct super_block *sb, int xid) + const unsigned char *full_path, struct super_block *sb, int xid) { int rc = 0; FILE_UNIX_BASIC_INFO findData; struct cifsTconInfo *pTcon; struct inode *inode; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - const unsigned char *full_path; bool is_dfs_referral = false; pTcon = cifs_sb->tcon; - cFYI(1, ("Getting info on %s", search_path)); - - full_path = cifs_get_search_path(cifs_sb, search_path); + cFYI(1, ("Getting info on %s", full_path)); try_again_CIFSSMBUnixQPathInfo: /* could have done a find first instead but this returns more info */ @@ -218,10 +184,6 @@ try_again_CIFSSMBUnixQPathInfo: if (rc) { if (rc == -EREMOTE && !is_dfs_referral) { is_dfs_referral = true; - if (full_path != search_path) { - kfree(full_path); - full_path = search_path; - } goto try_again_CIFSSMBUnixQPathInfo; } goto cgiiu_exit; @@ -271,8 +233,6 @@ try_again_CIFSSMBUnixQPathInfo: cifs_set_ops(inode, is_dfs_referral); } cgiiu_exit: - if (full_path != search_path) - kfree(full_path); return rc; } @@ -380,20 +340,19 @@ static int get_sfu_mode(struct inode *inode, } int cifs_get_inode_info(struct inode **pinode, - const unsigned char *search_path, FILE_ALL_INFO *pfindData, + const unsigned char *full_path, FILE_ALL_INFO *pfindData, struct super_block *sb, int xid, const __u16 *pfid) { int rc = 0; struct cifsTconInfo *pTcon; struct inode *inode; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - const unsigned char *full_path = NULL; char *buf = NULL; bool adjustTZ = false; bool is_dfs_referral = false; pTcon = cifs_sb->tcon; - cFYI(1, ("Getting info on %s", search_path)); + cFYI(1, ("Getting info on %s", full_path)); if ((pfindData == NULL) && (*pinode != NULL)) { if (CIFS_I(*pinode)->clientCanCacheRead) 
{ @@ -409,8 +368,6 @@ int cifs_get_inode_info(struct inode **pinode, return -ENOMEM; pfindData = (FILE_ALL_INFO *)buf; - full_path = cifs_get_search_path(cifs_sb, search_path); - try_again_CIFSSMBQPathInfo: /* could do find first instead but this returns more info */ rc = CIFSSMBQPathInfo(xid, pTcon, full_path, pfindData, @@ -432,10 +389,6 @@ try_again_CIFSSMBQPathInfo: if (rc) { if (rc == -EREMOTE && !is_dfs_referral) { is_dfs_referral = true; - if (full_path != search_path) { - kfree(full_path); - full_path = search_path; - } goto try_again_CIFSSMBQPathInfo; } goto cgii_exit; @@ -470,7 +423,7 @@ try_again_CIFSSMBQPathInfo: __u64 inode_num; rc1 = CIFSGetSrvInodeNumber(xid, pTcon, - search_path, &inode_num, + full_path, &inode_num, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); @@ -539,7 +492,7 @@ try_again_CIFSSMBQPathInfo: (cifsInfo->cifsAttrs & ATTR_SYSTEM)) { if (decode_sfu_inode(inode, le64_to_cpu(pfindData->EndOfFile), - search_path, + full_path, cifs_sb, xid)) cFYI(1, ("Unrecognized sfu inode type")); @@ -582,12 +535,12 @@ try_again_CIFSSMBQPathInfo: /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { cFYI(1, ("Getting mode bits from ACL")); - acl_to_uid_mode(inode, search_path, pfid); + acl_to_uid_mode(inode, full_path, pfid); } #endif if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { /* fill in remaining high mode bits e.g. SUID, VTX */ - get_sfu_mode(inode, search_path, cifs_sb, xid); + get_sfu_mode(inode, full_path, cifs_sb, xid); } else if (atomic_read(&cifsInfo->inUse) == 0) { inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; @@ -599,8 +552,6 @@ try_again_CIFSSMBQPathInfo: cifs_set_ops(inode, is_dfs_referral); } cgii_exit: - if (full_path != search_path) - kfree(full_path); kfree(buf); return rc; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 1c2c3ce5020..316f9830ce3 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -295,45 +295,9 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen) cFYI(1, ("Error closing junction point " "(open for ioctl)")); } - /* BB unwind this long, nested function, or remove BB */ - if (rc == -EIO) { - /* Query if DFS Junction */ - unsigned int num_referrals = 0; - struct dfs_info3_param *refs = NULL; - tmp_path = - kmalloc(MAX_TREE_SIZE + MAX_PATHCONF + 1, - GFP_KERNEL); - if (tmp_path) { - strncpy(tmp_path, pTcon->treeName, - MAX_TREE_SIZE); - strncat(tmp_path, full_path, - MAX_PATHCONF); - rc = get_dfs_path(xid, pTcon->ses, - tmp_path, - cifs_sb->local_nls, - &num_referrals, &refs, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, ("Get DFS for %s rc = %d ", - tmp_path, rc)); - if ((num_referrals == 0) && (rc == 0)) - rc = -EACCES; - else { - cFYI(1, ("num referral: %d", - num_referrals)); - if (refs && refs->path_name) { - strncpy(tmpbuffer, - refs->path_name, - len-1); - } - } - kfree(refs); - kfree(tmp_path); -} - /* BB add code like else decode referrals - then memcpy to tmpbuffer and free referrals - string array BB */ - } + /* If it is a DFS junction earlier we would have gotten + PATH_NOT_COVERED returned from server so we do + not need to request the DFS info here */ } } /* BB Anything else to do to handle recursive links? 
*/ -- cgit v1.2.3-70-g09d2 From 772279c5f1dceb58d451dca94b557fd89b1ce890 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Wed, 14 May 2008 16:05:41 -0700 Subject: jbd: need to hold j_state_lock to updates to transaction t_state to T_COMMIT Updating the current transaction's t_state is protected by j_state_lock. We need to do the same when updating the t_state to T_COMMIT. Signed-off-by: Mingming Cao Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index cd931ef1f00..5a8ca61498c 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -470,7 +470,9 @@ void journal_commit_transaction(journal_t *journal) * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ + spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; + spin_unlock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); -- cgit v1.2.3-70-g09d2 From 7e01c8e5420b6c7f9d85d34c15d8c7a15c9fc720 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Wed, 14 May 2008 16:05:47 -0700 Subject: ext3/4: fix uninitialized bs in ext3/4_xattr_set_handle() This fix the uninitialized bs when we try to replace a xattr entry in ibody with the new value which require more than free space. This situation only happens we format ext3/4 with inode size more than 128 and we have put xattr entries both in ibody and block. The consequences about this bug is we will lost the xattr block which pointed by i_file_acl with all xattr entires in it. We will alloc a new xattr block and put that large value entry in it. The old xattr block will become orphan block. Signed-off-by: Tiger Yang Cc: Cc: Andreas Gruenbacher Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/xattr.c | 5 +++++ fs/ext4/xattr.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index d4a4f0e9ff6..175414ac221 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -1000,6 +1000,11 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, i.value = NULL; error = ext3_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { + if (EXT3_I(inode)->i_file_acl && !bs.s.base) { + error = ext3_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } error = ext3_xattr_block_set(handle, inode, &i, &bs); if (error) goto cleanup; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3fbc2c6c3d0..ff08633f398 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1009,6 +1009,11 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (error) goto cleanup; -- cgit v1.2.3-70-g09d2 From 0599ad53fee2d084f9ba26247d7452f06a40d298 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 14 May 2008 22:34:16 -0700 Subject: sysfs: remove error messages for -EEXIST case It is possible that the entry in sysfs already exists, one case of this is when a network device is renamed to bonding_masters. 
Anyway, in this case the proper error path is for device_rename to return an error code, not to generate bogus backtrace and errors. Also, to avoid possible races, the create link should be done before the remove link. This makes a device rename atomic operation like other renames. Signed-off-by: Stephen Hemminger Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/base/core.c | 8 +++----- fs/sysfs/dir.c | 6 +----- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/drivers/base/core.c b/drivers/base/core.c index be288b5e418..3eeac5a7858 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1218,13 +1218,11 @@ int device_rename(struct device *dev, char *new_name) } #else if (dev->class) { - sysfs_remove_link(&dev->class->subsys.kobj, old_device_name); error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, dev->bus_id); - if (error) { - dev_err(dev, "%s: sysfs_create_symlink failed (%d)\n", - __func__, error); - } + if (error) + goto out; + sysfs_remove_link(&dev->class->subsys.kobj, old_device_name); } #endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index a1c3a1fab7f..8c0e4b92574 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -419,12 +419,8 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, */ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { - if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) { - printk(KERN_WARNING "sysfs: duplicate filename '%s' " - "can not be created\n", sd->s_name); - WARN_ON(1); + if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) return -EEXIST; - } sd->s_parent = sysfs_get(acxt->parent_sd); -- cgit v1.2.3-70-g09d2 From c32916374b2b4f4d2b7ccdb357fe7989f3b407a6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 May 2008 05:41:54 +0000 Subject: [CIFS] suppress duplicate warning fs/cifs/dir.c: In function 'cifs_ci_compare': fs/cifs/dir.c:582: warning: passing argument 1 of 'memcpy' discards qualifiers from pointer target type Signed-off-by: Andrew Morton Signed-off-by: Steve French --- fs/cifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 05afe33ea64..f0b5b5f3dd2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -608,7 +608,7 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, * case take precedence. If a is not a negative dentry, this * should have no side effects */ - memcpy(a->name, b->name, a->len); + memcpy((void *)a->name, b->name, a->len); return 0; } return 1; -- cgit v1.2.3-70-g09d2 From f9ddcca4cf7d95238beb295484d1de7c0bf490dd Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 May 2008 05:51:55 +0000 Subject: [CIFS] BKL-removal: convert CIFS over to unlocked_ioctl cifs_ioctl doesn't seem to need the BKL for anything, so convert it over to use unlocked_ioctl. 
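For reference, the general shape of such a conversion is sketched below; this is not the CIFS code (that is in the hunks that follow) and the foo_* names are placeholders. The unlocked variant returns long, receives only the struct file, and is no longer serialized by the BKL, so the handler looks up the inode itself and takes only whatever locking it genuinely needs.

/*
 * Illustrative .ioctl -> .unlocked_ioctl shape only; foo_* names are
 * placeholders, not CIFS symbols.
 */
#include <linux/fs.h>

static long foo_unlocked_ioctl(struct file *filep, unsigned int cmd,
			       unsigned long arg)
{
	/* the inode is no longer passed in; derive it from the file */
	struct inode *inode = filep->f_dentry->d_inode;
	long rc = -ENOTTY;	/* same default as the old locked hook */

	/* no BKL here: take only the locking the handler really needs */
	mutex_lock(&inode->i_mutex);
	switch (cmd) {
	default:
		break;
	}
	mutex_unlock(&inode->i_mutex);
	return rc;
}

static const struct file_operations foo_fops = {
	.unlocked_ioctl	= foo_unlocked_ioctl,
};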
Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 10 +++++----- fs/cifs/cifsfs.h | 3 +-- fs/cifs/ioctl.c | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 427a7c69589..b6436b888cf 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -657,7 +657,7 @@ const struct file_operations cifs_file_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, #ifdef CONFIG_CIFS_POSIX - .ioctl = cifs_ioctl, + .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ #ifdef CONFIG_CIFS_EXPERIMENTAL @@ -677,7 +677,7 @@ const struct file_operations cifs_file_direct_ops = { .flush = cifs_flush, .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX - .ioctl = cifs_ioctl, + .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL @@ -697,7 +697,7 @@ const struct file_operations cifs_file_nobrl_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, #ifdef CONFIG_CIFS_POSIX - .ioctl = cifs_ioctl, + .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ #ifdef CONFIG_CIFS_EXPERIMENTAL @@ -716,7 +716,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .flush = cifs_flush, .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX - .ioctl = cifs_ioctl, + .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL @@ -731,7 +731,7 @@ const struct file_operations cifs_dir_ops = { #ifdef CONFIG_CIFS_EXPERIMENTAL .dir_notify = cifs_dir_notify, #endif /* CONFIG_CIFS_EXPERIMENTAL */ - .ioctl = cifs_ioctl, + .unlocked_ioctl = cifs_ioctl, }; static void diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index cd1301a09b3..25a6cbd1552 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -95,8 +95,7 @@ extern int cifs_setxattr(struct dentry *, const char *, const void *, size_t, int); extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); -extern int cifs_ioctl(struct inode *inode, struct file *filep, - unsigned int command, unsigned long arg); +extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); #ifdef CONFIG_CIFS_EXPERIMENTAL extern const struct export_operations cifs_export_ops; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 5c792df13d6..0088a5b5256 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -30,9 +30,9 @@ #define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2) -int cifs_ioctl(struct inode *inode, struct file *filep, - unsigned int command, unsigned long arg) +long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { + struct inode *inode = filep->f_dentry->d_inode; int rc = -ENOTTY; /* strange error - but the precedent */ int xid; struct cifs_sb_info *cifs_sb; -- cgit v1.2.3-70-g09d2 From c2cf07d591ef7bc25c220249822d9bdf0f44c75c Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 May 2008 06:20:02 +0000 Subject: [CIFS] Finishup DFS code Fixup GetDFSRefer to prepare for cleanup of SMB response processing Fix build warning in link.c Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 4 +- fs/cifs/cifssmb.c | 169 ++++++++++++++++++++++++---------------------------- fs/cifs/connect.c | 3 +- fs/cifs/link.c | 1 - 4 files changed, 80 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 
845b18e1abe..b9f5e935f82 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -146,8 +146,8 @@ extern int CIFSSMBUnixQPathInfo(const int xid, extern int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, const unsigned char *searchName, - unsigned char **targetUNCs, - unsigned int *number_of_UNC_in_array, + struct dfs_info3_param **target_nodes, + unsigned int *number_of_nodes_in_array, const struct nls_table *nls_codepage, int remap); extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 9c04ad40455..fc297383cb0 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3870,8 +3870,8 @@ GetInodeNumOut: int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, const unsigned char *searchName, - unsigned char **targetUNCs, - unsigned int *number_of_UNC_in_array, + struct dfs_info3_param **target_nodes, + unsigned int *num_of_nodes, const struct nls_table *nls_codepage, int remap) { /* TRANS2_GET_DFS_REFERRAL */ @@ -3884,8 +3884,8 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, unsigned int i; char *temp; __u16 params, byte_count; - *number_of_UNC_in_array = 0; - *targetUNCs = NULL; + *num_of_nodes = 0; + *target_nodes = NULL; cFYI(1, ("In GetDFSRefer the path %s", searchName)); if (ses == NULL) @@ -3955,99 +3955,84 @@ getDFSRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { cFYI(1, ("Send error in GetDFSRefer = %d", rc)); - } else { /* decode response */ -/* BB Add logic to parse referrals here */ - rc = validate_t2((struct smb_t2_rsp *)pSMBr); - - /* BB Also check if enough total bytes returned? */ - if (rc || (pSMBr->ByteCount < 17)) - rc = -EIO; /* bad smb */ - else { - __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - __u16 data_count = le16_to_cpu(pSMBr->t2.DataCount); - - cFYI(1, - ("Decoding GetDFSRefer response BCC: %d Offset %d", - pSMBr->ByteCount, data_offset)); - referrals = - (struct dfs_referral_level_3 *) - (8 /* sizeof start of data block */ + - data_offset + - (char *) &pSMBr->hdr.Protocol); - cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... 
\n" - "for referral one refer size: 0x%x srv " - "type: 0x%x refer flags: 0x%x ttl: 0x%x", - le16_to_cpu(pSMBr->NumberOfReferrals), - le16_to_cpu(pSMBr->DFSFlags), - le16_to_cpu(referrals->ReferralSize), - le16_to_cpu(referrals->ServerType), - le16_to_cpu(referrals->ReferralFlags), - le16_to_cpu(referrals->TimeToLive))); - /* BB This field is actually two bytes in from start of - data block so we could do safety check that DataBlock - begins at address of pSMBr->NumberOfReferrals */ - *number_of_UNC_in_array = - le16_to_cpu(pSMBr->NumberOfReferrals); - - /* BB Fix below so can return more than one referral */ - if (*number_of_UNC_in_array > 1) - *number_of_UNC_in_array = 1; - - /* get the length of the strings describing refs */ - name_len = 0; - for (i = 0; i < *number_of_UNC_in_array; i++) { - /* make sure that DfsPathOffset not past end */ - __u16 offset = - le16_to_cpu(referrals->DfsPathOffset); - if (offset > data_count) { - /* if invalid referral, stop here and do - not try to copy any more */ - *number_of_UNC_in_array = i; - break; - } - temp = ((char *)referrals) + offset; + goto GetDFSRefExit; + } + rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len += UniStrnlen((wchar_t *)temp, - data_count); - } else { - name_len += strnlen(temp, data_count); - } - referrals++; - /* BB add check that referral pointer does - not fall off end PDU */ - } - /* BB add check for name_len bigger than bcc */ - *targetUNCs = - kmalloc(name_len+1+(*number_of_UNC_in_array), - GFP_KERNEL); - if (*targetUNCs == NULL) { - rc = -ENOMEM; - goto GetDFSRefExit; + /* BB Also check if enough total bytes returned? */ + if (rc || (pSMBr->ByteCount < 17)) + rc = -EIO; /* bad smb */ + else { + __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + __u16 data_count = le16_to_cpu(pSMBr->t2.DataCount); + + cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d", + pSMBr->ByteCount, data_offset)); + referrals = + (struct dfs_referral_level_3 *) + (8 /* sizeof start of data block */ + + data_offset + + (char *) &pSMBr->hdr.Protocol); + cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... 
\n" + "for referral one refer size: 0x%x srv " + "type: 0x%x refer flags: 0x%x ttl: 0x%x", + le16_to_cpu(pSMBr->NumberOfReferrals), + le16_to_cpu(pSMBr->DFSFlags), + le16_to_cpu(referrals->ReferralSize), + le16_to_cpu(referrals->ServerType), + le16_to_cpu(referrals->ReferralFlags), + le16_to_cpu(referrals->TimeToLive))); + /* BB This field is actually two bytes in from start of + data block so we could do safety check that DataBlock + begins at address of pSMBr->NumberOfReferrals */ + *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); + + /* BB Fix below so can return more than one referral */ + if (*num_of_nodes > 1) + *num_of_nodes = 1; + + /* get the length of the strings describing refs */ + name_len = 0; + for (i = 0; i < *num_of_nodes; i++) { + /* make sure that DfsPathOffset not past end */ + __u16 offset = le16_to_cpu(referrals->DfsPathOffset); + if (offset > data_count) { + /* if invalid referral, stop here and do + not try to copy any more */ + *num_of_nodes = i; + break; } - /* copy the ref strings */ - referrals = (struct dfs_referral_level_3 *) - (8 /* sizeof data hdr */ + data_offset + - (char *) &pSMBr->hdr.Protocol); - - for (i = 0; i < *number_of_UNC_in_array; i++) { - temp = ((char *)referrals) + - le16_to_cpu(referrals->DfsPathOffset); - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { - cifs_strfromUCS_le(*targetUNCs, - (__le16 *) temp, - name_len, - nls_codepage); - } else { - strncpy(*targetUNCs, temp, name_len); - } - /* BB update target_uncs pointers */ - referrals++; + temp = ((char *)referrals) + offset; + + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len += UniStrnlen((wchar_t *)temp, + data_count); + } else { + name_len += strnlen(temp, data_count); } - temp = *targetUNCs; - temp[name_len] = 0; + referrals++; + /* BB add check that referral pointer does + not fall off end PDU */ + } + /* BB add check for name_len bigger than bcc */ + *target_nodes = + kmalloc(name_len+1+(*num_of_nodes), + GFP_KERNEL); + if (*target_nodes == NULL) { + rc = -ENOMEM; + goto GetDFSRefExit; } + referrals = (struct dfs_referral_level_3 *) + (8 /* sizeof data hdr */ + data_offset + + (char *) &pSMBr->hdr.Protocol); + + for (i = 0; i < *num_of_nodes; i++) { + temp = ((char *)referrals) + + le16_to_cpu(referrals->DfsPathOffset); + /* BB update target_uncs pointers */ + referrals++; + } } GetDFSRefExit: if (pSMB) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d5747e30f1c..c397fcfd9f1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1425,7 +1425,6 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, { char *temp_unc; int rc = 0; - unsigned char *targetUNCs; *pnum_referrals = 0; *preferrals = NULL; @@ -1448,7 +1447,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, kfree(temp_unc); } if (rc == 0) - rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, &targetUNCs, + rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals, pnum_referrals, nls_codepage, remap); /* BB map targetUNCs to dfs_info3 structures, here or in CIFSGetDFSRefer BB */ diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 316f9830ce3..63f644000ce 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -234,7 +234,6 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen) struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; char *full_path = NULL; - char *tmp_path = NULL; char *tmpbuffer; int len; __u16 fid; -- cgit v1.2.3-70-g09d2 From 95b1cb90b79896c4bf5ea484bee2b41d7d293f43 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 May 
2008 16:44:38 +0000 Subject: [CIFS] enable parsing for transport encryption mount parm Samba now supports transport encryption on particular exports (mounted tree ids can be encrypted for servers which support the unix extensions). This adds parsing support to cifs mount option parsing for this. Signed-off-by: Steve French --- fs/cifs/README | 5 +++++ fs/cifs/cifsglob.h | 1 + fs/cifs/connect.c | 25 +++++++++++++++++-------- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/README b/fs/cifs/README index 621aa1a8597..2bd6fe556f8 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -483,6 +483,11 @@ A partial list of the supported mount options follows: sign Must use packet signing (helps avoid unwanted data modification by intermediate systems in the route). Note that signing does not work with lanman or plaintext authentication. + seal Must seal (encrypt) all data on this mounted share before + sending on the network. Requires support for Unix Extensions. + Note that this differs from the sign mount option in that it + causes encryption of data sent over this mounted share but other + shares mounted to the same server are unaffected. sec Security mode. Allowed values are: none attempt to connection as a null user (no name) krb5 Use Kerberos version 5 authentication diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b7d9f698e63..08914053242 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -281,6 +281,7 @@ struct cifsTconInfo { bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ bool retry:1; bool nocase:1; + bool seal:1; /* transport encryption for this mounted share */ bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol for this mount even if server would support */ /* BB add field for back pointer to sb struct(s)? */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c397fcfd9f1..023434f72c1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -60,7 +60,7 @@ struct smb_vol { char *domainname; char *UNC; char *UNCip; - char *in6_addr; /* ipv6 address as human readable form of in6_addr */ + char *in6_addr; /* ipv6 address as human readable form of in6_addr */ char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[16]; /* netbios name of client */ char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ @@ -82,13 +82,14 @@ struct smb_vol { bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ bool server_ino:1; /* use inode numbers from server ie UniqueId */ bool direct_io:1; - bool remap:1; /* set to remap seven reserved chars in filenames */ - bool posix_paths:1; /* unset to not ask for posix pathnames. */ + bool remap:1; /* set to remap seven reserved chars in filenames */ + bool posix_paths:1; /* unset to not ask for posix pathnames. 
*/ bool no_linux_ext:1; bool sfu_emul:1; - bool nullauth:1; /* attempt to authenticate with null user */ - unsigned nocase; /* request case insensitive filenames */ - unsigned nobrl; /* disable sending byte range locks to srv */ + bool nullauth:1; /* attempt to authenticate with null user */ + bool nocase:1; /* request case insensitive filenames */ + bool nobrl:1; /* disable sending byte range locks to srv */ + bool seal:1; /* request transport encryption on share */ unsigned int rsize; unsigned int wsize; unsigned int sockopt; @@ -1273,8 +1274,12 @@ cifs_parse_mount_options(char *options, const char *devname, vol->no_psx_acl = 1; } else if (strnicmp(data, "sign", 4) == 0) { vol->secFlg |= CIFSSEC_MUST_SIGN; -/* } else if (strnicmp(data, "seal",4) == 0) { - vol->secFlg |= CIFSSEC_MUST_SEAL; */ + } else if (strnicmp(data, "seal", 4) == 0) { + /* we do not do the following in secFlags because seal + is a per tree connection (mount) not a per socket + or per-smb connection option in the protocol */ + /* vol->secFlg |= CIFSSEC_MUST_SEAL; */ + vol->seal = 1; } else if (strnicmp(data, "direct", 6) == 0) { vol->direct_io = 1; } else if (strnicmp(data, "forcedirectio", 13) == 0) { @@ -2126,6 +2131,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, for the retry flag is used */ tcon->retry = volume_info.retry; tcon->nocase = volume_info.nocase; + if (tcon->seal != volume_info.seal) + cERROR(1, ("transport encryption setting " + "conflicts with existing tid")); } else { tcon = tconInfoAlloc(); if (tcon == NULL) @@ -2159,6 +2167,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, atomic_inc(&pSesInfo->inUse); tcon->retry = volume_info.retry; tcon->nocase = volume_info.nocase; + tcon->seal = volume_info.seal; } } } -- cgit v1.2.3-70-g09d2 From 519deca0496a4df07d15acf3181ca5d573bffdec Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 May 2008 14:43:20 -0400 Subject: ext4: Retry block allocation if new blocks are allocated from system zone. If the block allocator gets blocks out of system zone ext4 calls ext4_error. But if the file system is mounted with errors=continue retry block allocation. We need to mark the system zone blocks as in use to make sure retry don't pick them again System zone is the block range mapping block bitmap, inode bitmap and inode table. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 17 +++++++++++------ fs/ext4/mballoc.c | 47 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index da994374ec3..30494c5da84 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -287,11 +287,11 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) (int)block_group, (unsigned long long)bitmap_blk); return NULL; } - if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) { - put_bh(bh); - return NULL; - } - + ext4_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, + * continue with corrupt bitmap + */ return bh; } /* @@ -1770,7 +1770,12 @@ allocated: "Allocating block in system zone - " "blocks from %llu, length %lu", ret_block, num); - goto out; + /* + * claim_block marked the blocks we allocated + * as in use. 
So we may want to selectively + * mark some of the blocks as free + */ + goto retry_alloc; } performed_allocation = 1; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1d7fde99452..873ad9b3418 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2736,7 +2736,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; - int err; + int err, len; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); @@ -2770,14 +2770,27 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + ac->ac_b_ex.fe_start + le32_to_cpu(es->s_first_data_block); - if (block == ext4_block_bitmap(sb, gdp) || - block == ext4_inode_bitmap(sb, gdp) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { - + len = ac->ac_b_ex.fe_len; + if (in_range(ext4_block_bitmap(sb, gdp), block, len) || + in_range(ext4_inode_bitmap(sb, gdp), block, len) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + len - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, __func__, "Allocating block in system zone - block = %llu", block); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. + */ + mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), + bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + err = ext4_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; } #ifdef AGGRESSIVE_CHECK { @@ -4032,7 +4045,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); - repeat: /* allocate space in core */ ext4_mb_regular_allocator(ac); @@ -4046,10 +4058,21 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - ext4_mb_mark_diskspace_used(ac, handle); - *errp = 0; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ar->len = ac->ac_b_ex.fe_len; + *errp = ext4_mb_mark_diskspace_used(ac, handle); + if (*errp == -EAGAIN) { + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) { + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } } else { freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) @@ -4236,6 +4259,8 @@ do_more: ext4_error(sb, __func__, "Freeing blocks in system zone - " "Block = %lu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; } BUFFER_TRACE(bitmap_bh, "getting write access"); -- cgit v1.2.3-70-g09d2 From 02c471cb17203c748e9bc87003052c1f46e5df69 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Thu, 15 May 2008 14:46:17 -0400 Subject: jbd2: update transaction t_state to T_COMMIT fix Updating the current transaction's t_state is protected by j_state_lock. We need to do the same when updating the t_state to T_COMMIT. Acked-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e0139786f71..4d99685fdce 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -560,7 +560,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) * transaction! 
Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ + spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; + spin_unlock(&journal->j_state_lock); stats.u.run.rs_logging = jiffies; stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, -- cgit v1.2.3-70-g09d2 From de2db8d790b058fcd75d603780b913bd824972b3 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 16 May 2008 13:10:32 +0400 Subject: Fixed DFS code to work with new 'build_path_from_dentry', that returns full path if share in the dfs, now. Signed-off-by: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 49 +------------------------------------------------ 1 file changed, 1 insertion(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index f6fdecf6598..d82374c9e32 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -219,53 +219,6 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent, } -static char *build_full_dfs_path_from_dentry(struct dentry *dentry) -{ - char *full_path = NULL; - char *search_path; - char *tmp_path; - size_t l_max_len; - struct cifs_sb_info *cifs_sb; - - if (dentry->d_inode == NULL) - return NULL; - - cifs_sb = CIFS_SB(dentry->d_inode->i_sb); - - if (cifs_sb->tcon == NULL) - return NULL; - - search_path = build_path_from_dentry(dentry); - if (search_path == NULL) - return NULL; - - if (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS) { - int i; - /* we should use full path name for correct working with DFS */ - l_max_len = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE+1) + - strnlen(search_path, MAX_PATHCONF) + 1; - tmp_path = kmalloc(l_max_len, GFP_KERNEL); - if (tmp_path == NULL) { - kfree(search_path); - return NULL; - } - strncpy(tmp_path, cifs_sb->tcon->treeName, l_max_len); - tmp_path[l_max_len-1] = 0; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) - for (i = 0; i < l_max_len; i++) { - if (tmp_path[i] == '\\') - tmp_path[i] = '/'; - } - strncat(tmp_path, search_path, l_max_len - strlen(tmp_path)); - - full_path = tmp_path; - kfree(search_path); - } else { - full_path = search_path; - } - return full_path; -} - static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, struct list_head *mntlist) { @@ -333,7 +286,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) goto out_err; } - full_path = build_full_dfs_path_from_dentry(dentry); + full_path = build_path_from_dentry(dentry); if (full_path == NULL) { rc = -ENOMEM; goto out_err; -- cgit v1.2.3-70-g09d2 From b0b539739fe9b7d75002412a787cfdf4efddbc33 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 May 2008 11:45:41 -0400 Subject: NFS: Ensure that 'noac' and/or 'actimeo=0' turn off attribute caching Both the 'noac' and 'actimeo=0' mount options should ensure that attributes are not cached, however a bug in nfs_attribute_timeout() means that currently, the attributes may in fact get cached for up to one jiffy. This has been seen to cause corruption in some applications. The reason for the bug is that the time_in_range() test returns 'true' as long as the current time lies between nfsi->read_cache_jiffies and nfsi->read_cache_jiffies + nfsi->attrtimeo. In other words, if jiffies equals nfsi->read_cache_jiffies, then we still cache the attribute data. 
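The one-jiffy window falls directly out of the inclusive range test; the short stand-alone program below re-implements time_in_range() with the same inclusive semantics as the kernel macro (plain integers stand in for jiffies and the nfsi fields) and shows the check still treating the cache as valid when attrtimeo is 0 and no time has passed.

/*
 * Userspace illustration of the cached-for-one-jiffy window described
 * above.  time_after_eq()/time_in_range() are re-implemented with the
 * same inclusive semantics as the kernel macros; the values are plain
 * integers standing in for jiffies.
 */
#include <stdbool.h>
#include <stdio.h>

static bool time_after_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;
}

/* true when b <= a <= c (inclusive at both ends, wrap-safe) */
static bool time_in_range(unsigned long a, unsigned long b, unsigned long c)
{
	return time_after_eq(a, b) && time_after_eq(c, a);
}

int main(void)
{
	unsigned long jiffies = 1000;		  /* "now" */
	unsigned long read_cache_jiffies = 1000;  /* attrs fetched this jiffy */
	unsigned long attrtimeo = 0;		  /* noac / actimeo=0 */

	bool still_cached = time_in_range(jiffies, read_cache_jiffies,
					  read_cache_jiffies + attrtimeo);

	/* prints "yes": the inclusive check keeps trusting the cache */
	printf("cache treated as valid: %s\n", still_cached ? "yes" : "no");
	return 0;
}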
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5cb3345eb69..2501b864f7c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -707,6 +707,13 @@ int nfs_attribute_timeout(struct inode *inode) if (nfs_have_delegation(inode, FMODE_READ)) return 0; + /* + * Special case: if the attribute timeout is set to 0, then always + * treat the cache as having expired (unless holding + * a delegation). + */ + if (nfsi->attrtimeo == 0) + return 1; return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } -- cgit v1.2.3-70-g09d2 From 38def50fabc479dc96ea6bd2cb2526e0dfc36fa4 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 1 May 2008 20:03:22 +0300 Subject: nfs: fix race in nfs_dirty_request When called from nfs_flush_incompatible, the req is not locked, so req->wb_page might be set to NULL before it is used by PageWriteback. Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1ade11d1ba0..6d8ace3e325 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -415,7 +415,7 @@ nfs_dirty_request(struct nfs_page *req) if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags)) return 0; - return !PageWriteback(req->wb_page); + return !PageWriteback(page); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -- cgit v1.2.3-70-g09d2 From 3a6258e1fb5ff717dcefa04afc35f81aaae3f3e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 May 2008 13:32:40 -0400 Subject: NFSv4: Check the return value of decode_compound_hdr_arg() If decode_compound_hdr_arg() returns a resource error, then we cannot proceed to process the callback. Return a 'GARBAGE_ARGS' rpc-level error to the caller instead. If, however, the minor version field is incorrect, then we need to propagate the resulting NFS4ERR_MINOR_VERS_MISMATCH error back as the compound status field (setting the nops field to 0). Finally, if encode_compound_hdr_res() returns an error, we need to return an RPC_SYSTEM_ERR to the caller. 
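Put differently, the three failure modes are reported at different layers: an XDR decode resource failure as the RPC accept status, a minor version mismatch inside the compound reply itself (with zero ops processed), and a header encode failure as a system error. The fragment below is only an illustrative summary of that routing; the helper name is hypothetical and the real logic is in the nfs4_callback_compound() hunk that follows.

/*
 * Illustrative summary of the error routing described above.  The helper
 * name is hypothetical; the actual code lives in nfs4_callback_compound().
 */
#include <linux/kernel.h>
#include <linux/nfs4.h>
#include <linux/sunrpc/msg_prot.h>

static __be32 cb_hdr_errors_to_reply(__be32 decode_status, int encode_err,
				     unsigned int *nops)
{
	if (decode_status == htonl(NFS4ERR_RESOURCE))
		return rpc_garbage_args;   /* arguments could not be parsed */

	if (encode_err != 0)
		return rpc_system_err;     /* could not encode the reply header */

	if (decode_status == htonl(NFS4ERR_MINOR_VERS_MISMATCH))
		*nops = 0;                 /* error travels in the compound status */

	return rpc_success;                /* compound status reports anything else */
}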
Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 13619d24f02..646d4d85072 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -401,12 +401,12 @@ static __be32 process_op(struct svc_rqst *rqstp, */ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) { - struct cb_compound_hdr_arg hdr_arg; - struct cb_compound_hdr_res hdr_res; + struct cb_compound_hdr_arg hdr_arg = { 0 }; + struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; __be32 *p; __be32 status; - unsigned int nops = 1; + unsigned int nops = 0; dprintk("%s: start\n", __FUNCTION__); @@ -415,20 +415,20 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); xdr_init_encode(&xdr_out, &rqstp->rq_res, p); - decode_compound_hdr_arg(&xdr_in, &hdr_arg); + status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + if (status == __constant_htonl(NFS4ERR_RESOURCE)) + return rpc_garbage_args; + hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; - hdr_res.nops = NULL; - encode_compound_hdr_res(&xdr_out, &hdr_res); + if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + return rpc_system_err; - for (;;) { + while (status == 0 && nops != hdr_arg.nops) { status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp); - if (status != 0) - break; - if (nops == hdr_arg.nops) - break; nops++; } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); dprintk("%s: done, status = %u\n", __FUNCTION__, ntohl(status)); -- cgit v1.2.3-70-g09d2 From 46c8ac74250a396aca855e494f49a960797a6b5e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 2 May 2008 13:42:42 -0700 Subject: nfs/lsm: make NFSv4 set LSM mount options NFSv3 get_sb operations call into the LSM layer to set security options passed from userspace. NFSv4 hooks were not originally added since it was reasonably late in the merge window and NFSv3 was the only thing that had regressed (v4 has never supported any LSM options) This patch makes NFSv4 call into the LSM to set security options rather than just blindly dropping them with no notice to the user as happens today. This patch was tested in a simple NFSv4 environment with the context= option and appeared to work as expected. Signed-off-by: Eric Paris Cc: Trond Myklebust Cc: "J. 
Bruce Fields" Cc: Stephen Smalley Acked-by: James Morris Cc: Casey Schaufler Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 7226a506f3c..5ed86ac0fd9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2015,6 +2015,10 @@ static int nfs4_get_sb(struct file_system_type *fs_type, goto error_splat_super; } + error = security_sb_set_mnt_opts(s, &data.lsm_opts); + if (error) + goto error_splat_root; + s->s_flags |= MS_ACTIVE; mnt->mnt_sb = s; mnt->mnt_root = mntroot; @@ -2031,6 +2035,8 @@ out_free: nfs_free_server(server); goto out; +error_splat_root: + dput(mntroot); error_splat_super: up_write(&s->s_umount); deactivate_super(s); @@ -2114,6 +2120,8 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, mnt->mnt_sb = s; mnt->mnt_root = mntroot; + security_sb_clone_mnt_opts(data->sb, s); + dprintk("<-- nfs4_xdev_get_sb() = 0\n"); return 0; @@ -2197,6 +2205,8 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, mnt->mnt_sb = s; mnt->mnt_root = mntroot; + security_sb_clone_mnt_opts(data->sb, s); + dprintk("<-- nfs4_referral_get_sb() = 0\n"); return 0; -- cgit v1.2.3-70-g09d2 From 3110ff8048fb757b36112b044b384aea9c44d6e4 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 2 May 2008 13:42:44 -0700 Subject: nfs: replace remaining __FUNCTION__ occurrences __FUNCTION__ is gcc-specific, use __func__ Signed-off-by: Harvey Harrison Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 2 +- fs/nfs/callback_proc.c | 4 +- fs/nfs/callback_xdr.c | 18 ++++----- fs/nfs/client.c | 8 ++-- fs/nfs/delegation.c | 4 +- fs/nfs/dir.c | 18 ++++----- fs/nfs/file.c | 2 +- fs/nfs/inode.c | 4 +- fs/nfs/namespace.c | 8 ++-- fs/nfs/nfs3proc.c | 6 +-- fs/nfs/nfs4namespace.c | 12 +++--- fs/nfs/nfs4proc.c | 32 ++++++++-------- fs/nfs/nfs4renewd.c | 10 ++--- fs/nfs/nfs4state.c | 6 +-- fs/nfs/nfs4xdr.c | 100 ++++++++++++++++++++++++------------------------- fs/nfs/proc.c | 8 ++-- fs/nfs/read.c | 2 +- fs/nfs/super.c | 2 +- 18 files changed, 123 insertions(+), 123 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 5606ae3d72d..c1e7c830062 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -182,7 +182,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) if (clp == NULL) return SVC_DROP; - dprintk("%s: %s NFSv4 callback!\n", __FUNCTION__, + dprintk("%s: %s NFSv4 callback!\n", __func__, svc_print_addr(rqstp, buf, sizeof(buf))); nfs_put_client(clp); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 15f7785048d..f7e83e23cf9 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -57,7 +57,7 @@ out_iput: out_putclient: nfs_put_client(clp); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; } @@ -98,6 +98,6 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) nfs_put_client(prev); } while (clp != NULL); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); return res; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 646d4d85072..dd0ef34b584 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -141,7 +141,7 @@ static __be32 
decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound /* We do not like overly long tags! */ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", - __FUNCTION__, hdr->taglen); + __func__, hdr->taglen); return htonl(NFS4ERR_RESOURCE); } p = read_buf(xdr, 12); @@ -151,7 +151,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound /* Check minor version is zero. */ if (minor_version != 0) { printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n", - __FUNCTION__, minor_version); + __func__, minor_version); return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } hdr->callback_ident = ntohl(*p++); @@ -179,7 +179,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr args->addr = svc_addr(rqstp); status = decode_bitmap(xdr, args->bitmap); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -200,7 +200,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, args->truncate = ntohl(*p); status = decode_fh(xdr, &args->fh); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -349,7 +349,7 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -363,7 +363,7 @@ static __be32 process_op(struct svc_rqst *rqstp, long maxlen; __be32 res; - dprintk("%s: start\n", __FUNCTION__); + dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); if (likely(status == 0)) { switch (op_nr) { @@ -392,7 +392,7 @@ static __be32 process_op(struct svc_rqst *rqstp, status = res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); - dprintk("%s: done, status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: done, status = %d\n", __func__, ntohl(status)); return status; } @@ -408,7 +408,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r __be32 status; unsigned int nops = 0; - dprintk("%s: start\n", __FUNCTION__); + dprintk("%s: start\n", __func__); xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); @@ -431,7 +431,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); - dprintk("%s: done, status = %u\n", __FUNCTION__, ntohl(status)); + dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 89ac5bb0401..f2a092ca69b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -488,7 +488,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("%s: cannot create RPC client. 
Error = %ld\n", - __FUNCTION__, PTR_ERR(clnt)); + __func__, PTR_ERR(clnt)); return PTR_ERR(clnt); } @@ -576,7 +576,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, server->client = rpc_clone_client(clp->cl_rpcclient); if (IS_ERR(server->client)) { - dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__); + dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); } @@ -590,7 +590,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, auth = rpcauth_create(pseudoflavour, server->client); if (IS_ERR(auth)) { - dprintk("%s: couldn't create credcache!\n", __FUNCTION__); + dprintk("%s: couldn't create credcache!\n", __func__); return PTR_ERR(auth); } } @@ -985,7 +985,7 @@ static int nfs4_init_client(struct nfs_client *clp, error = nfs_idmap_new(clp); if (error < 0) { dprintk("%s: failed to create idmapper. Error = %d\n", - __FUNCTION__, error); + __func__, error); goto error; } __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 00a5e4405e1..cc563cfa694 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -60,7 +60,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ switch (status) { default: printk(KERN_ERR "%s: unhandled error %d.\n", - __FUNCTION__, status); + __func__, status); case -NFS4ERR_EXPIRED: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ case -NFS4ERR_STALE_CLIENTID: @@ -186,7 +186,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct */ dfprintk(FILE, "%s: server %s handed out " "a duplicate delegation!\n", - __FUNCTION__, clp->cl_hostname); + __func__, clp->cl_hostname); if (delegation->type <= nfsi->delegation->type) { freeme = delegation; delegation = NULL; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f288b3ecab4..58d43daec08 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -180,7 +180,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) int error; dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", - __FUNCTION__, (long long)desc->entry->cookie, + __func__, (long long)desc->entry->cookie, page->index); again: @@ -256,7 +256,7 @@ int find_dirent(nfs_readdir_descriptor_t *desc) while((status = dir_decode(desc)) == 0) { dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", - __FUNCTION__, (unsigned long long)entry->cookie); + __func__, (unsigned long long)entry->cookie); if (entry->prev_cookie == *desc->dir_cookie) break; if (loop_count++ > 200) { @@ -315,7 +315,7 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc) int status; dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", - __FUNCTION__, desc->page_index, + __func__, desc->page_index, (long long) *desc->dir_cookie); /* If we find the page in the page_cache, we cannot be sure @@ -339,7 +339,7 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc) if (status < 0) dir_page_release(desc); out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } @@ -380,7 +380,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) } } - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, res); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res); return res; } @@ -506,7 +506,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->entry->eof = 0; out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", - __FUNCTION__, status); + __func__, status); return status; 
out_release: dir_page_release(desc); @@ -780,7 +780,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) if (is_bad_inode(inode)) { dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", - __FUNCTION__, dentry->d_parent->d_name.name, + __func__, dentry->d_parent->d_name.name, dentry->d_name.name); goto out_bad; } @@ -808,7 +808,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", - __FUNCTION__, dentry->d_parent->d_name.name, + __func__, dentry->d_parent->d_name.name, dentry->d_name.name); return 1; out_zap_parent: @@ -827,7 +827,7 @@ out_zap_parent: unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", - __FUNCTION__, dentry->d_parent->d_name.name, + __func__, dentry->d_parent->d_name.name, dentry->d_name.name); return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3536b01164f..d84a3d8f32a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -526,7 +526,7 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) if (res < 0) dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager" " - error %d!\n", - __FUNCTION__, res); + __func__, res); return res; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2501b864f7c..421d338c698 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1002,7 +1002,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", - __FUNCTION__, inode->i_sb->s_id, inode->i_ino, + __func__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); if (nfsi->fileid != fattr->fileid) @@ -1126,7 +1126,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * Big trouble! The inode has become a different object. 
*/ printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", - __FUNCTION__, inode->i_ino, inode->i_mode, fattr->mode); + __func__, inode->i_ino, inode->i_mode, fattr->mode); out_err: /* * No need to worry about unhashing the dentry, as the diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index af4d0f1e402..fca518006a5 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -106,7 +106,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) dprintk("--> nfs_follow_mountpoint()\n"); BUG_ON(IS_ROOT(dentry)); - dprintk("%s: enter\n", __FUNCTION__); + dprintk("%s: enter\n", __func__); dput(nd->path.dentry); nd->path.dentry = dget(dentry); @@ -143,7 +143,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) nd->path.dentry = dget(mnt->mnt_root); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); out: - dprintk("%s: done, returned %d\n", __FUNCTION__, err); + dprintk("%s: done, returned %d\n", __func__, err); dprintk("<-- nfs_follow_mountpoint() = %d\n", err); return ERR_PTR(err); @@ -230,7 +230,7 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, dprintk("--> nfs_do_submount()\n"); - dprintk("%s: submounting on %s/%s\n", __FUNCTION__, + dprintk("%s: submounting on %s/%s\n", __func__, dentry->d_parent->d_name.name, dentry->d_name.name); if (page == NULL) @@ -243,7 +243,7 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, free_page: free_page((unsigned long)page); out: - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); dprintk("<-- nfs_do_submount() = %p\n", mnt); return mnt; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 549dbce714a..c3523ad03ed 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -63,15 +63,15 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, }; int status; - dprintk("%s: call fsinfo\n", __FUNCTION__); + dprintk("%s: call fsinfo\n", __func__); nfs_fattr_init(info->fattr); status = rpc_call_sync(client, &msg, 0); - dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); + dprintk("%s: reply fsinfo: %d\n", __func__, status); if (!(info->fattr->valid & NFS_ATTR_FATTR)) { msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; msg.rpc_resp = info->fattr; status = rpc_call_sync(client, &msg, 0); - dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + dprintk("%s: reply getattr: %d\n", __func__, status); } return status; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 5f9ba41ed5b..b112857301f 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -86,7 +86,7 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, if (strncmp(path, fs_path, strlen(fs_path)) != 0) { dprintk("%s: path %s does not begin with fsroot %s\n", - __FUNCTION__, path, fs_path); + __func__, path, fs_path); return -ENOENT; } @@ -134,7 +134,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, if (locations == NULL || locations->nlocations <= 0) goto out; - dprintk("%s: referral at %s/%s\n", __FUNCTION__, + dprintk("%s: referral at %s/%s\n", __func__, dentry->d_parent->d_name.name, dentry->d_name.name); page = (char *) __get_free_page(GFP_USER); @@ -204,7 +204,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, out: free_page((unsigned long) page); free_page((unsigned long) page2); - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); return mnt; } @@ -223,7 +223,7 @@ struct 
vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr int err; /* BUG_ON(IS_ROOT(dentry)); */ - dprintk("%s: enter\n", __FUNCTION__); + dprintk("%s: enter\n", __func__); page = alloc_page(GFP_KERNEL); if (page == NULL) @@ -238,7 +238,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr parent = dget_parent(dentry); dprintk("%s: getting locations for %s/%s\n", - __FUNCTION__, parent->d_name.name, dentry->d_name.name); + __func__, parent->d_name.name, dentry->d_name.name); err = nfs4_proc_fs_locations(parent->d_inode, &dentry->d_name, fs_locations, page); dput(parent); @@ -252,6 +252,6 @@ out_free: __free_page(page); kfree(fs_locations); out: - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); return mnt; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dbc09271af0..f533318b005 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -73,7 +73,7 @@ int nfs4_map_errors(int err) { if (err < -1000) { dprintk("%s could not handle NFSv4 error %d\n", - __FUNCTION__, -err); + __func__, -err); return -EIO; } return err; @@ -1578,7 +1578,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct goto out; /* Make sure server returned a different fsid for the referral */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { - dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name); + dprintk("%s: server did not return a different fsid for a referral at %s\n", __func__, name->name); status = -EIO; goto out; } @@ -2211,7 +2211,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, }; int status; - dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __FUNCTION__, + dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __func__, dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long long)cookie); @@ -2223,7 +2223,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs_invalidate_atime(dir); - dprintk("%s: returns %d\n", __FUNCTION__, status); + dprintk("%s: returns %d\n", __func__, status); return status; } @@ -3342,7 +3342,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) struct nfs4_lockdata *data = calldata; struct nfs4_state *state = data->lsp->ls_state; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) return; /* Do we need to do an open_to_lock_owner? 
*/ @@ -3356,14 +3356,14 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) data->arg.new_lock_owner = 0; data->timestamp = jiffies; rpc_call_start(task); - dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); data->rpc_status = task->tk_status; if (RPC_ASSASSINATED(task)) @@ -3381,14 +3381,14 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); } out: - dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status); + dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); } static void nfs4_lock_release(void *calldata) { struct nfs4_lockdata *data = calldata; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); nfs_free_seqid(data->arg.open_seqid); if (data->cancelled != 0) { struct rpc_task *task; @@ -3396,13 +3396,13 @@ static void nfs4_lock_release(void *calldata) data->arg.lock_seqid); if (!IS_ERR(task)) rpc_put_task(task); - dprintk("%s: cancelling lock!\n", __FUNCTION__); + dprintk("%s: cancelling lock!\n", __func__); } else nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); kfree(data); - dprintk("%s: done!\n", __FUNCTION__); + dprintk("%s: done!\n", __func__); } static const struct rpc_call_ops nfs4_lock_ops = { @@ -3428,7 +3428,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f }; int ret; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), fl->fl_u.nfs4_fl.owner); if (data == NULL) @@ -3451,7 +3451,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f } else data->cancelled = 1; rpc_put_task(task); - dprintk("%s: done, ret = %d!\n", __FUNCTION__, ret); + dprintk("%s: done, ret = %d!\n", __func__, ret); return ret; } @@ -3527,7 +3527,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Note: we always want to sleep here! */ request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); out_unlock: up_read(&clp->cl_sem); out: @@ -3665,12 +3665,12 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, }; int status; - dprintk("%s: start\n", __FUNCTION__); + dprintk("%s: start\n", __func__); nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = rpc_call_sync(server->client, &msg, 0); - dprintk("%s: returned status = %d\n", __FUNCTION__, status); + dprintk("%s: returned status = %d\n", __func__, status); return status; } diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 5e2e4af1a0e..3305acbbe2a 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -66,7 +66,7 @@ nfs4_renew_state(struct work_struct *work) unsigned long last, now; down_read(&clp->cl_sem); - dprintk("%s: start\n", __FUNCTION__); + dprintk("%s: start\n", __func__); /* Are there any active superblocks? 
*/ if (list_empty(&clp->cl_superblocks)) goto out; @@ -92,17 +92,17 @@ nfs4_renew_state(struct work_struct *work) spin_lock(&clp->cl_lock); } else dprintk("%s: failed to call renewd. Reason: lease not expired \n", - __FUNCTION__); + __func__); if (timeout < 5 * HZ) /* safeguard */ timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", - __FUNCTION__, (timeout + HZ - 1) / HZ); + __func__, (timeout + HZ - 1) / HZ); cancel_delayed_work(&clp->cl_renewd); schedule_delayed_work(&clp->cl_renewd, timeout); spin_unlock(&clp->cl_lock); out: up_read(&clp->cl_sem); - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); } /* Must be called with clp->cl_sem locked for writes */ @@ -117,7 +117,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) if (timeout < 5 * HZ) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", - __FUNCTION__, (timeout + HZ - 1) / HZ); + __func__, (timeout + HZ - 1) / HZ); cancel_delayed_work(&clp->cl_renewd); schedule_delayed_work(&clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 46eb624e4f1..5a1e02c8b75 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -828,7 +828,7 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s switch (status) { default: printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", - __FUNCTION__, status); + __func__, status); case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: case -NFS4ERR_RECLAIM_BAD: @@ -869,14 +869,14 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) printk("%s: Lock reclaim failed!\n", - __FUNCTION__); + __func__); } continue; } switch (status) { default: printk(KERN_ERR "%s: unhandled error %d. 
Zeroing state\n", - __FUNCTION__, status); + __func__, status); case -ENOENT: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5a2d64927b3..b916297d233 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1831,7 +1831,7 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->count); dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", - __FUNCTION__, replen, args->pages, + __func__, replen, args->pages, args->pgbase, args->count); out: @@ -2192,9 +2192,9 @@ out: p = xdr_inline_decode(xdr, nbytes); \ if (unlikely(!p)) { \ dprintk("nfs: %s: prematurely hit end of receive" \ - " buffer\n", __FUNCTION__); \ + " buffer\n", __func__); \ dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ - __FUNCTION__, xdr->p, nbytes, xdr->end); \ + __func__, xdr->p, nbytes, xdr->end); \ return -EIO; \ } \ } while (0) @@ -2306,12 +2306,12 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * READ_BUF(4); READ32(*type); if (*type < NF4REG || *type > NF4NAMEDATTR) { - dprintk("%s: bad type %d\n", __FUNCTION__, *type); + dprintk("%s: bad type %d\n", __func__, *type); return -EIO; } bitmap[0] &= ~FATTR4_WORD0_TYPE; } - dprintk("%s: type=0%o\n", __FUNCTION__, nfs_type2fmt[*type].nfs2type); + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type); return 0; } @@ -2327,7 +2327,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ64(*change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; } - dprintk("%s: change attribute=%Lu\n", __FUNCTION__, + dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return 0; } @@ -2344,7 +2344,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * READ64(*size); bitmap[0] &= ~FATTR4_WORD0_SIZE; } - dprintk("%s: file size=%Lu\n", __FUNCTION__, (unsigned long long)*size); + dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return 0; } @@ -2360,7 +2360,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui READ32(*res); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } - dprintk("%s: link support=%s\n", __FUNCTION__, *res == 0 ? "false" : "true"); + dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; } @@ -2376,7 +2376,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, READ32(*res); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } - dprintk("%s: symlink support=%s\n", __FUNCTION__, *res == 0 ? "false" : "true"); + dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? 
"false" : "true"); return 0; } @@ -2394,7 +2394,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs READ64(fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; } - dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __FUNCTION__, + dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return 0; @@ -2412,7 +2412,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint READ32(*res); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } - dprintk("%s: file size=%u\n", __FUNCTION__, (unsigned int)*res); + dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; } @@ -2428,7 +2428,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint READ32(*res); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } - dprintk("%s: ACLs supported=%u\n", __FUNCTION__, (unsigned int)*res); + dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; } @@ -2444,7 +2444,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t READ64(*fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; } - dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid); + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return 0; } @@ -2460,7 +2460,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma READ64(*fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; } - dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid); + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return 0; } @@ -2477,7 +2477,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin READ64(*res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } - dprintk("%s: files avail=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; } @@ -2494,7 +2494,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint READ64(*res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } - dprintk("%s: files free=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; } @@ -2511,7 +2511,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin READ64(*res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } - dprintk("%s: files total=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; } @@ -2569,7 +2569,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = 0; if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) goto out; - dprintk("%s: fsroot ", __FUNCTION__); + dprintk("%s: fsroot ", __func__); status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; @@ -2586,7 +2586,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st READ32(m); loc->nservers = 0; - dprintk("%s: servers ", __FUNCTION__); + dprintk("%s: servers ", __func__); while (loc->nservers < m) { struct nfs4_string *server = &loc->servers[loc->nservers]; status = decode_opaque_inline(xdr, &server->len, &server->data); @@ -2599,7 +2599,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st unsigned int i; dprintk("%s: using first %u of %u servers " "returned for location %u\n", - __FUNCTION__, + __func__, NFS4_FS_LOCATION_MAXSERVERS, m, 
res->nlocations); for (i = loc->nservers; i < m; i++) { @@ -2618,7 +2618,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st res->nlocations++; } out: - dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status); + dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; out_eio: status = -EIO; @@ -2638,7 +2638,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin READ64(*res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } - dprintk("%s: maxfilesize=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; } @@ -2655,7 +2655,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ READ32(*maxlink); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } - dprintk("%s: maxlink=%u\n", __FUNCTION__, *maxlink); + dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; } @@ -2672,7 +2672,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ READ32(*maxname); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } - dprintk("%s: maxname=%u\n", __FUNCTION__, *maxname); + dprintk("%s: maxname=%u\n", __func__, *maxname); return status; } @@ -2693,7 +2693,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ *res = (uint32_t)maxread; bitmap[0] &= ~FATTR4_WORD0_MAXREAD; } - dprintk("%s: maxread=%lu\n", __FUNCTION__, (unsigned long)*res); + dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; } @@ -2714,7 +2714,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 *res = (uint32_t)maxwrite; bitmap[0] &= ~FATTR4_WORD0_MAXWRITE; } - dprintk("%s: maxwrite=%lu\n", __FUNCTION__, (unsigned long)*res); + dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; } @@ -2731,7 +2731,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * *mode &= ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; } - dprintk("%s: file mode=0%o\n", __FUNCTION__, (unsigned int)*mode); + dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return 0; } @@ -2747,7 +2747,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t READ32(*nlink); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; } - dprintk("%s: nlink=%u\n", __FUNCTION__, (unsigned int)*nlink); + dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return 0; } @@ -2766,13 +2766,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) dprintk("%s: nfs_map_name_to_uid failed!\n", - __FUNCTION__); + __func__); } else dprintk("%s: name too long (%u)!\n", - __FUNCTION__, len); + __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER; } - dprintk("%s: uid=%d\n", __FUNCTION__, (int)*uid); + dprintk("%s: uid=%d\n", __func__, (int)*uid); return 0; } @@ -2791,13 +2791,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) dprintk("%s: nfs_map_group_to_gid failed!\n", - __FUNCTION__); + __func__); } else dprintk("%s: name too long (%u)!\n", - __FUNCTION__, len); + __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } - dprintk("%s: gid=%d\n", __FUNCTION__, (int)*gid); + dprintk("%s: gid=%d\n", __func__, (int)*gid); return 0; } @@ -2820,7 +2820,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, 
uint32_t *bitmap, dev_t *rde *rdev = tmp; bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; } - dprintk("%s: rdev=(0x%x:0x%x)\n", __FUNCTION__, major, minor); + dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return 0; } @@ -2837,7 +2837,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin READ64(*res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } - dprintk("%s: space avail=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; } @@ -2854,7 +2854,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint READ64(*res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } - dprintk("%s: space free=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; } @@ -2871,7 +2871,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin READ64(*res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } - dprintk("%s: space total=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; } @@ -2887,7 +2887,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint READ64(*used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; } - dprintk("%s: space used=%Lu\n", __FUNCTION__, + dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return 0; } @@ -2918,7 +2918,7 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str status = decode_attr_time(xdr, time); bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } - dprintk("%s: atime=%ld\n", __FUNCTION__, (long)time->tv_sec); + dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); return status; } @@ -2934,7 +2934,7 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s status = decode_attr_time(xdr, time); bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } - dprintk("%s: ctime=%ld\n", __FUNCTION__, (long)time->tv_sec); + dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); return status; } @@ -2950,7 +2950,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str status = decode_attr_time(xdr, time); bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } - dprintk("%s: mtime=%ld\n", __FUNCTION__, (long)time->tv_sec); + dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); return status; } @@ -2962,7 +2962,7 @@ static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrl if (unlikely(attrwords != nwords)) { dprintk("%s: server returned incorrect attribute length: " "%u %c %u\n", - __FUNCTION__, + __func__, attrwords << 2, (attrwords < nwords) ? 
'<' : '>', nwords << 2); @@ -3067,7 +3067,7 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } @@ -3100,7 +3100,7 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } @@ -3125,7 +3125,7 @@ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } @@ -3193,7 +3193,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; xdr_error: - dprintk("%s: xdr returned %d\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d\n", __func__, -status); return status; } @@ -3226,7 +3226,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } @@ -3418,7 +3418,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) return decode_delegation(xdr, res); xdr_error: - dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen); + dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; } @@ -3575,7 +3575,7 @@ short_pkt: * the call was successful, but incomplete. The caller can retry the * readdir starting at the last cookie. 
*/ - dprintk("%s: short packet at entry %d\n", __FUNCTION__, nr); + dprintk("%s: short packet at entry %d\n", __func__, nr); entry[0] = entry[1] = 0; if (nr) goto out; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 5ccf7faee19..03599bfe81c 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -63,17 +63,17 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, }; int status; - dprintk("%s: call getattr\n", __FUNCTION__); + dprintk("%s: call getattr\n", __func__); nfs_fattr_init(fattr); status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); - dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + dprintk("%s: reply getattr: %d\n", __func__, status); if (status) return status; - dprintk("%s: call statfs\n", __FUNCTION__); + dprintk("%s: call statfs\n", __func__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); - dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); + dprintk("%s: reply statfs: %d\n", __func__, status); if (status) return status; info->rtmax = NFS_MAXDATA; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 16f57e0af99..40d17987d0e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -329,7 +329,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) { int status; - dprintk("NFS: %s: %5u, (status %d)\n", __FUNCTION__, task->tk_pid, + dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, task->tk_status); status = NFS_PROTO(data->inode)->read_done(task, data); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5ed86ac0fd9..2a4a024a4e7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -405,7 +405,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; out_err: - dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); + dprintk("%s: statfs error = %d\n", __func__, -error); unlock_kernel(); return error; } -- cgit v1.2.3-70-g09d2 From 31f31db1a15671513df9cd9fbe56ef45ee1e9a2a Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Fri, 2 May 2008 13:42:45 -0700 Subject: nfs: path_{get,put}() cleanups Here are some more places where path_{get,put}() can be used instead of dput()/mntput() pair. Signed-off-by: Jan Blunck Cc: Trond Myklebust Cc: "J. 
Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 +-- fs/nfs/namespace.c | 3 +-- fs/nfs/nfs4proc.c | 6 ++---- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 421d338c698..596c5d8e86f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -541,8 +541,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) } if (ctx->cred != NULL) put_rpccred(ctx->cred); - dput(ctx->path.dentry); - mntput(ctx->path.mnt); + path_put(&ctx->path); kfree(ctx); } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index fca518006a5..2f285ef7639 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -137,8 +137,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) goto out_follow; goto out_err; } - mntput(nd->path.mnt); - dput(nd->path.dentry); + path_put(&nd->path); nd->path.mnt = mnt; nd->path.dentry = dget(mnt->mnt_root); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f533318b005..1293e0acd82 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -306,8 +306,7 @@ static void nfs4_opendata_free(struct kref *kref) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); dput(p->dir); - dput(p->path.dentry); - mntput(p->path.mnt); + path_put(&p->path); kfree(p); } @@ -1210,8 +1209,7 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - dput(calldata->path.dentry); - mntput(calldata->path.mnt); + path_put(&calldata->path); kfree(calldata); } -- cgit v1.2.3-70-g09d2 From 1d2e88e73ee0af52b0ed63b5fb8f42a919a4d9de Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 2 May 2008 13:42:45 -0700 Subject: nfs: make nfs4_drop_state_owner() static nfs4_drop_state_owner() can now become static. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 - fs/nfs/nfs4state.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index bd1b9d663fb..ea790645fda 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -206,7 +206,6 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); extern void nfs4_put_state_owner(struct nfs4_state_owner *); -extern void nfs4_drop_state_owner(struct nfs4_state_owner *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5a1e02c8b75..856a8934f61 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -282,7 +282,7 @@ nfs4_alloc_state_owner(void) return sp; } -void +static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { if (!RB_EMPTY_NODE(&sp->so_client_node)) { -- cgit v1.2.3-70-g09d2 From fec4585fd71cc5ec35d134e8c3854f6e8c4503f0 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 16 May 2008 13:06:30 +0400 Subject: CIFSGetDFSRefer cleanup + dfs_referral_level_3 fixed to conform REFERRAL_V3 the MS-DFSC spec. 
Signed-off-by: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 16 ++--- fs/cifs/cifssmb.c | 205 +++++++++++++++++++++++++++++++++++------------------- 2 files changed, 139 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index c43bf4b7a55..93d5ee02a25 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1906,17 +1906,15 @@ typedef struct smb_com_transaction2_get_dfs_refer_req { typedef struct dfs_referral_level_3 { __le16 VersionNumber; - __le16 ReferralSize; - __le16 ServerType; /* 0x0001 = CIFS server */ - __le16 ReferralFlags; /* or proximity - not clear which since it is - always set to zero - SNIA spec says 0x01 - means strip off PathConsumed chars before - submitting RequestFileName to remote node */ - __le16 TimeToLive; - __le16 Proximity; + __le16 Size; + __le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */ + __le16 ReferralEntryFlags; /* 0x0200 bit set only for domain + or DC referral responce */ + __le32 TimeToLive; __le16 DfsPathOffset; __le16 DfsAlternatePathOffset; - __le16 NetworkAddressOffset; + __le16 NetworkAddressOffset; /* offset of the link target */ + __le16 ServiceSiteGuid; } __attribute__((packed)) REFERRAL3; typedef struct smb_com_transaction_get_dfs_refer_rsp { diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index fc297383cb0..6f8ed93a4ae 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -81,6 +81,39 @@ static struct { #endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ +/* Allocates buffer into dst and copies smb string from src to it. + * caller is responsible for freeing dst if function returned 0. + * returns: + * on success - 0 + * on failure - errno + */ +static int +cifs_strncpy_to_host(char **dst, const char *src, const int maxlen, + const bool is_unicode, const struct nls_table *nls_codepage) +{ + int plen; + + if (is_unicode) { + plen = UniStrnlen((wchar_t *)src, maxlen); + *dst = kmalloc(plen + 2, GFP_KERNEL); + if (!*dst) + goto cifs_strncpy_to_host_ErrExit; + cifs_strfromUCS_le(*dst, (__le16 *)src, plen, nls_codepage); + } else { + plen = strnlen(src, maxlen); + *dst = kmalloc(plen + 2, GFP_KERNEL); + if (!*dst) + goto cifs_strncpy_to_host_ErrExit; + strncpy(*dst, src, plen); + } + (*dst)[plen] = 0; + return 0; + +cifs_strncpy_to_host_ErrExit: + cERROR(1, ("Failed to allocate buffer for string\n")); + return -ENOMEM; +} + /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ @@ -3867,6 +3900,96 @@ GetInodeNumOut: return rc; } +/* parses DFS refferal V3 structure + * caller is responsible for freeing target_nodes + * returns: + * on success - 0 + * on failure - errno + */ +static int +parse_DFS_REFERRALS(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, + unsigned int *num_of_nodes, + struct dfs_info3_param **target_nodes, + const struct nls_table *nls_codepage) +{ + int i, rc = 0; + char *data_end; + bool is_unicode; + struct dfs_referral_level_3 *ref; + + is_unicode = pSMBr->hdr.Flags2 & SMBFLG2_UNICODE; + *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); + + if (*num_of_nodes < 1) { + cERROR(1, ("num_referrals: must be at least > 0," + "but we get num_referrals = %d\n", *num_of_nodes)); + rc = -EINVAL; + goto parse_DFS_REFERRALS_exit; + } + + ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); + if (ref->VersionNumber != 3) { + cERROR(1, ("Referrals of V%d version are not supported," + "should be V3", ref->VersionNumber)); + rc = -EINVAL; + goto parse_DFS_REFERRALS_exit; + } + + 
/* get the upper boundary of the resp buffer */ + data_end = (char *)(&(pSMBr->PathConsumed)) + + le16_to_cpu(pSMBr->t2.DataCount); + + cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", + *num_of_nodes, + le16_to_cpu(pSMBr->DFSFlags))); + + *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * + *num_of_nodes, GFP_KERNEL); + if (*target_nodes == NULL) { + cERROR(1, ("Failed to allocate buffer for target_nodes\n")); + rc = -ENOMEM; + goto parse_DFS_REFERRALS_exit; + } + + /* collect neccessary data from referrals */ + for (i = 0; i < *num_of_nodes; i++) { + char *temp; + int max_len; + struct dfs_info3_param *node = (*target_nodes)+i; + + node->flags = le16_to_cpu(pSMBr->DFSFlags); + node->path_consumed = le16_to_cpu(pSMBr->PathConsumed); + node->server_type = le16_to_cpu(ref->ServerType); + node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags); + + /* copy DfsPath */ + temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); + max_len = data_end - temp; + rc = cifs_strncpy_to_host(&(node->path_name), temp, + max_len, is_unicode, nls_codepage); + if (rc) + goto parse_DFS_REFERRALS_exit; + + /* copy link target UNC */ + temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); + max_len = data_end - temp; + rc = cifs_strncpy_to_host(&(node->node_name), temp, + max_len, is_unicode, nls_codepage); + if (rc) + goto parse_DFS_REFERRALS_exit; + + ref += ref->Size; + } + +parse_DFS_REFERRALS_exit: + if (rc) { + free_dfs_info_array(*target_nodes, *num_of_nodes); + *target_nodes = NULL; + *num_of_nodes = 0; + } + return rc; +} + int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, const unsigned char *searchName, @@ -3877,12 +4000,9 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses, /* TRANS2_GET_DFS_REFERRAL */ TRANSACTION2_GET_DFS_REFER_REQ *pSMB = NULL; TRANSACTION2_GET_DFS_REFER_RSP *pSMBr = NULL; - struct dfs_referral_level_3 *referrals = NULL; int rc = 0; int bytes_returned; int name_len; - unsigned int i; - char *temp; __u16 params, byte_count; *num_of_nodes = 0; *target_nodes = NULL; @@ -3960,80 +4080,19 @@ getDFSRetry: rc = validate_t2((struct smb_t2_rsp *)pSMBr); /* BB Also check if enough total bytes returned? */ - if (rc || (pSMBr->ByteCount < 17)) + if (rc || (pSMBr->ByteCount < 17)) { rc = -EIO; /* bad smb */ - else { - __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - __u16 data_count = le16_to_cpu(pSMBr->t2.DataCount); - - cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d", - pSMBr->ByteCount, data_offset)); - referrals = - (struct dfs_referral_level_3 *) - (8 /* sizeof start of data block */ + - data_offset + - (char *) &pSMBr->hdr.Protocol); - cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... 
\n" - "for referral one refer size: 0x%x srv " - "type: 0x%x refer flags: 0x%x ttl: 0x%x", - le16_to_cpu(pSMBr->NumberOfReferrals), - le16_to_cpu(pSMBr->DFSFlags), - le16_to_cpu(referrals->ReferralSize), - le16_to_cpu(referrals->ServerType), - le16_to_cpu(referrals->ReferralFlags), - le16_to_cpu(referrals->TimeToLive))); - /* BB This field is actually two bytes in from start of - data block so we could do safety check that DataBlock - begins at address of pSMBr->NumberOfReferrals */ - *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); - - /* BB Fix below so can return more than one referral */ - if (*num_of_nodes > 1) - *num_of_nodes = 1; - - /* get the length of the strings describing refs */ - name_len = 0; - for (i = 0; i < *num_of_nodes; i++) { - /* make sure that DfsPathOffset not past end */ - __u16 offset = le16_to_cpu(referrals->DfsPathOffset); - if (offset > data_count) { - /* if invalid referral, stop here and do - not try to copy any more */ - *num_of_nodes = i; - break; - } - temp = ((char *)referrals) + offset; + goto GetDFSRefExit; + } - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len += UniStrnlen((wchar_t *)temp, - data_count); - } else { - name_len += strnlen(temp, data_count); - } - referrals++; - /* BB add check that referral pointer does - not fall off end PDU */ - } - /* BB add check for name_len bigger than bcc */ - *target_nodes = - kmalloc(name_len+1+(*num_of_nodes), - GFP_KERNEL); - if (*target_nodes == NULL) { - rc = -ENOMEM; - goto GetDFSRefExit; - } + cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d", + pSMBr->ByteCount, + le16_to_cpu(pSMBr->t2.DataOffset))); - referrals = (struct dfs_referral_level_3 *) - (8 /* sizeof data hdr */ + data_offset + - (char *) &pSMBr->hdr.Protocol); + /* parse returned result into more usable form */ + rc = parse_DFS_REFERRALS(pSMBr, num_of_nodes, + target_nodes, nls_codepage); - for (i = 0; i < *num_of_nodes; i++) { - temp = ((char *)referrals) + - le16_to_cpu(referrals->DfsPathOffset); - /* BB update target_uncs pointers */ - referrals++; - } - } GetDFSRefExit: if (pSMB) cifs_buf_release(pSMB); -- cgit v1.2.3-70-g09d2 From a1fe78f16eac7d03d3c391dd5d54559826574982 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 16 May 2008 18:48:38 +0000 Subject: [CIFS] Add missing defines for DFS Also has minor cleanup of previous patch CC: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 9 +++++++++ fs/cifs/cifssmb.c | 17 +++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 93d5ee02a25..65d58b4e6a6 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1904,6 +1904,15 @@ typedef struct smb_com_transaction2_get_dfs_refer_req { char RequestFileName[1]; } __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ; +#define DFS_VERSION cpu_to_le16(0x0003) + +/* DFS server target type */ +#define DFS_TYPE_LINK 0x0000 /* also for sysvol targets */ +#define DFS_TYPE_ROOT 0x0001 + +/* Referral Entry Flags */ +#define DFS_NAME_LIST_REF 0x0200 + typedef struct dfs_referral_level_3 { __le16 VersionNumber; __le16 Size; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6f8ed93a4ae..7b9938445b0 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -107,6 +107,7 @@ cifs_strncpy_to_host(char **dst, const char *src, const int maxlen, strncpy(*dst, src, plen); } (*dst)[plen] = 0; + (*dst)[plen+1] = 0; /* harmless for ASCII case, needed for Unicode */ return 0; cifs_strncpy_to_host_ErrExit: @@ -3907,7 +3908,7 @@ 
GetInodeNumOut: * on failure - errno */ static int -parse_DFS_REFERRALS(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, +parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, unsigned int *num_of_nodes, struct dfs_info3_param **target_nodes, const struct nls_table *nls_codepage) @@ -3924,7 +3925,7 @@ parse_DFS_REFERRALS(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, cERROR(1, ("num_referrals: must be at least > 0," "but we get num_referrals = %d\n", *num_of_nodes)); rc = -EINVAL; - goto parse_DFS_REFERRALS_exit; + goto parse_DFS_referrals_exit; } ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); @@ -3932,7 +3933,7 @@ parse_DFS_REFERRALS(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, cERROR(1, ("Referrals of V%d version are not supported," "should be V3", ref->VersionNumber)); rc = -EINVAL; - goto parse_DFS_REFERRALS_exit; + goto parse_DFS_referrals_exit; } /* get the upper boundary of the resp buffer */ @@ -3948,7 +3949,7 @@ parse_DFS_REFERRALS(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, if (*target_nodes == NULL) { cERROR(1, ("Failed to allocate buffer for target_nodes\n")); rc = -ENOMEM; - goto parse_DFS_REFERRALS_exit; + goto parse_DFS_referrals_exit; } /* collect neccessary data from referrals */ @@ -3968,7 +3969,7 @@ parse_DFS_REFERRALS(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, rc = cifs_strncpy_to_host(&(node->path_name), temp, max_len, is_unicode, nls_codepage); if (rc) - goto parse_DFS_REFERRALS_exit; + goto parse_DFS_referrals_exit; /* copy link target UNC */ temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); @@ -3976,12 +3977,12 @@ parse_DFS_REFERRALS(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, rc = cifs_strncpy_to_host(&(node->node_name), temp, max_len, is_unicode, nls_codepage); if (rc) - goto parse_DFS_REFERRALS_exit; + goto parse_DFS_referrals_exit; ref += ref->Size; } -parse_DFS_REFERRALS_exit: +parse_DFS_referrals_exit: if (rc) { free_dfs_info_array(*target_nodes, *num_of_nodes); *target_nodes = NULL; @@ -4090,7 +4091,7 @@ getDFSRetry: le16_to_cpu(pSMBr->t2.DataOffset))); /* parse returned result into more usable form */ - rc = parse_DFS_REFERRALS(pSMBr, num_of_nodes, + rc = parse_DFS_referrals(pSMBr, num_of_nodes, target_nodes, nls_codepage); GetDFSRefExit: -- cgit v1.2.3-70-g09d2 From 9a6ab769bdacc65e7d4e931034e12e02c357c4d3 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 16 May 2008 11:20:25 -0700 Subject: byteorder: don't directly include linux/byteorder/generic.h Use asm/byteorder.h instead. 
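Editorial sketch, not part of the patch above: the convention this series enforces is that endian-conversion helpers are picked up from <asm/byteorder.h> rather than through a direct include of <linux/byteorder/generic.h>. A minimal hypothetical example of the intended usage, assuming only <linux/types.h> for the fixed-width types; the function name and field are invented for illustration:

	#include <linux/types.h>
	#include <asm/byteorder.h>	/* instead of <linux/byteorder/generic.h> */

	/* pack a host-order length into a little-endian on-wire field */
	static inline __le32 example_pack_len(u32 len)
	{
		return cpu_to_le32(len);	/* helper is available via asm/byteorder.h */
	}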
Signed-off-by: Harvey Harrison Signed-off-by: Linus Torvalds --- drivers/char/snsc_event.c | 2 +- drivers/media/video/et61x251/et61x251_core.c | 2 +- drivers/media/video/sn9c102/sn9c102_core.c | 2 +- drivers/media/video/zc0301/zc0301_core.c | 2 +- drivers/media/video/zoran_device.c | 2 +- drivers/media/video/zoran_driver.c | 2 +- drivers/net/wireless/atmel.c | 2 +- fs/befs/endian.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c index 53b3d44f8c0..55a95892ccf 100644 --- a/drivers/char/snsc_event.c +++ b/drivers/char/snsc_event.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include "snsc.h" diff --git a/drivers/media/video/et61x251/et61x251_core.c b/drivers/media/video/et61x251/et61x251_core.c index 5e749c528a6..15d037ae25c 100644 --- a/drivers/media/video/et61x251/et61x251_core.c +++ b/drivers/media/video/et61x251/et61x251_core.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/media/video/sn9c102/sn9c102_core.c b/drivers/media/video/sn9c102/sn9c102_core.c index 5748b1e1a12..7f9c7bcf3c8 100644 --- a/drivers/media/video/sn9c102/sn9c102_core.c +++ b/drivers/media/video/sn9c102/sn9c102_core.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/media/video/zc0301/zc0301_core.c b/drivers/media/video/zc0301/zc0301_core.c index 363dd2b9475..e5c4e9f5193 100644 --- a/drivers/media/video/zc0301/zc0301_core.c +++ b/drivers/media/video/zc0301/zc0301_core.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/media/video/zoran_device.c b/drivers/media/video/zoran_device.c index 7b60533efe4..37629ffd34c 100644 --- a/drivers/media/video/zoran_device.c +++ b/drivers/media/video/zoran_device.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -47,6 +46,7 @@ #include #include +#include #include #include "videocodec.h" diff --git a/drivers/media/video/zoran_driver.c b/drivers/media/video/zoran_driver.c index 0134bec1e39..345c77e4683 100644 --- a/drivers/media/video/zoran_driver.c +++ b/drivers/media/video/zoran_driver.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include @@ -74,6 +73,7 @@ #include #include "videocodec.h" +#include #include #include #include diff --git a/drivers/net/wireless/atmel.c b/drivers/net/wireless/atmel.c index ef2da4023d6..438e63ecccf 100644 --- a/drivers/net/wireless/atmel.c +++ b/drivers/net/wireless/atmel.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -60,7 +61,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/befs/endian.h b/fs/befs/endian.h index e254a20869f..6cb84d896d0 100644 --- a/fs/befs/endian.h +++ b/fs/befs/endian.h @@ -9,7 +9,7 @@ #ifndef LINUX_BEFS_ENDIAN #define LINUX_BEFS_ENDIAN -#include +#include static inline u64 fs64_to_cpu(const struct super_block *sb, fs64 n) -- cgit v1.2.3-70-g09d2 From f52111b1546943545e67573c4dde1c7613ca33d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 May 2008 18:19:16 -0400 Subject: [PATCH] take init_files to fs/file.c Signed-off-by: Al Viro --- arch/alpha/kernel/init_task.c | 1 - arch/arm/kernel/init_task.c | 1 - arch/avr32/kernel/init_task.c | 1 - arch/blackfin/kernel/init_task.c | 1 - arch/cris/kernel/process.c | 1 - arch/frv/kernel/init_task.c | 1 - arch/h8300/kernel/init_task.c | 1 - arch/ia64/kernel/init_task.c | 1 - 
arch/m32r/kernel/init_task.c | 1 - arch/m68k/kernel/process.c | 1 - arch/m68knommu/kernel/init_task.c | 1 - arch/mips/kernel/init_task.c | 1 - arch/mn10300/kernel/init_task.c | 1 - arch/parisc/kernel/init_task.c | 1 - arch/powerpc/kernel/init_task.c | 1 - arch/s390/kernel/init_task.c | 1 - arch/sh/kernel/init_task.c | 1 - arch/sparc/kernel/init_task.c | 1 - arch/sparc64/kernel/init_task.c | 1 - arch/um/kernel/init_task.c | 1 - arch/v850/kernel/init_task.c | 1 - arch/x86/kernel/init_task.c | 1 - arch/xtensa/kernel/init_task.c | 1 - fs/file.c | 13 +++++++++++++ include/linux/init_task.h | 23 +---------------------- 25 files changed, 14 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c index 835d09a7b33..1f762189fa6 100644 --- a/arch/alpha/kernel/init_task.c +++ b/arch/alpha/kernel/init_task.c @@ -9,7 +9,6 @@ static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index bd4ef53bc6b..8b8c9d38a76 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -13,7 +13,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c index effcacf9d1a..44058469c6e 100644 --- a/arch/avr32/kernel/init_task.c +++ b/arch/avr32/kernel/init_task.c @@ -14,7 +14,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/blackfin/kernel/init_task.c b/arch/blackfin/kernel/init_task.c index c640154030e..6bdba7b2110 100644 --- a/arch/blackfin/kernel/init_task.c +++ b/arch/blackfin/kernel/init_task.c @@ -34,7 +34,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index ef2db8fd102..5933656db5a 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -38,7 +38,6 @@ */ static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c index 22993932b3f..e2198815b63 100644 --- a/arch/frv/kernel/init_task.c +++ b/arch/frv/kernel/init_task.c @@ -11,7 +11,6 @@ static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct 
init_mm = INIT_MM(init_mm); diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c index 19272c2ac56..93a4899e46c 100644 --- a/arch/h8300/kernel/init_task.c +++ b/arch/h8300/kernel/init_task.c @@ -13,7 +13,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index bc8efcad28b..9d7e1c66faf 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -18,7 +18,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c index 9e508fd9d97..0d658dbb676 100644 --- a/arch/m32r/kernel/init_task.c +++ b/arch/m32r/kernel/init_task.c @@ -12,7 +12,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 5de4e4ed76a..7888cdf91f5 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -41,7 +41,6 @@ * setup. */ static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c index 3897043a126..344c01aede0 100644 --- a/arch/m68knommu/kernel/init_task.c +++ b/arch/m68knommu/kernel/init_task.c @@ -13,7 +13,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index aeda7f58391..d72487ad7c1 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -10,7 +10,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c index 39fe6882dd1..af16f6e5c91 100644 --- a/arch/mn10300/kernel/init_task.c +++ b/arch/mn10300/kernel/init_task.c @@ -19,7 +19,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/parisc/kernel/init_task.c 
b/arch/parisc/kernel/init_task.c index 26198a074d6..f5941c08655 100644 --- a/arch/parisc/kernel/init_task.c +++ b/arch/parisc/kernel/init_task.c @@ -35,7 +35,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c index 941043ae040..4c85b8d5647 100644 --- a/arch/powerpc/kernel/init_task.c +++ b/arch/powerpc/kernel/init_task.c @@ -8,7 +8,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c index d494161b05b..7ad00396925 100644 --- a/arch/s390/kernel/init_task.c +++ b/arch/s390/kernel/init_task.c @@ -17,7 +17,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c index f9bcc606127..b151a25cb14 100644 --- a/arch/sh/kernel/init_task.c +++ b/arch/sh/kernel/init_task.c @@ -8,7 +8,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct pt_regs fake_swapper_regs; diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index d9d4f96360c..8e64ebc445e 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -9,7 +9,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/sparc64/kernel/init_task.c b/arch/sparc64/kernel/init_task.c index 90007cf88ba..d2b312381c1 100644 --- a/arch/sparc64/kernel/init_task.c +++ b/arch/sparc64/kernel/init_task.c @@ -10,7 +10,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c index dcfceca9505..910eda8fca1 100644 --- a/arch/um/kernel/init_task.c +++ b/arch/um/kernel/init_task.c @@ -12,7 +12,6 @@ static struct fs_struct init_fs = INIT_FS; struct mm_struct init_mm = INIT_MM(init_mm); -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); EXPORT_SYMBOL(init_mm); diff --git a/arch/v850/kernel/init_task.c b/arch/v850/kernel/init_task.c index ed2f93cf7c6..44b274dff33 100644 --- 
a/arch/v850/kernel/init_task.c +++ b/arch/v850/kernel/init_task.c @@ -21,7 +21,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS (init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM (init_mm); diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 3d01e47777d..a4f93b4120c 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -11,7 +11,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c index 021b4f46ff9..3df469dbe81 100644 --- a/arch/xtensa/kernel/init_task.c +++ b/arch/xtensa/kernel/init_task.c @@ -22,7 +22,6 @@ #include static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/fs/file.c b/fs/file.c index 4c6f0ea12c4..754cd05b06a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -275,3 +275,16 @@ void __init files_defer_init(void) for_each_possible_cpu(i) fdtable_defer_list_init(i); } + +struct files_struct init_files = { + .count = ATOMIC_INIT(1), + .fdt = &init_files.fdtab, + .fdtab = { + .max_fds = NR_OPEN_DEFAULT, + .fd = &init_files.fd_array[0], + .close_on_exec = (fd_set *)&init_files.close_on_exec_init, + .open_fds = (fd_set *)&init_files.open_fds_init, + .rcu = RCU_HEAD_INIT, + }, + .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), +}; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b24c2875aa0..9927a88674a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -1,7 +1,6 @@ #ifndef _LINUX__INIT_TASK_H #define _LINUX__INIT_TASK_H -#include #include #include #include @@ -12,27 +11,7 @@ #include #include -#define INIT_FDTABLE \ -{ \ - .max_fds = NR_OPEN_DEFAULT, \ - .fd = &init_files.fd_array[0], \ - .close_on_exec = (fd_set *)&init_files.close_on_exec_init, \ - .open_fds = (fd_set *)&init_files.open_fds_init, \ - .rcu = RCU_HEAD_INIT, \ - .next = NULL, \ -} - -#define INIT_FILES \ -{ \ - .count = ATOMIC_INIT(1), \ - .fdt = &init_files.fdtab, \ - .fdtab = INIT_FDTABLE, \ - .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \ - .next_fd = 0, \ - .close_on_exec_init = { { 0, } }, \ - .open_fds_init = { { 0, } }, \ - .fd_array = { NULL, } \ -} +extern struct files_struct init_files; #define INIT_KIOCTX(name, which_mm) \ { \ -- cgit v1.2.3-70-g09d2 From 02afc6267f6d55d47aba9fcafdbd1b7230d2294a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 May 2008 19:42:56 -0400 Subject: [PATCH] dup_fd() fixes, part 1 Move the sucker to fs/file.c in preparation to the rest Signed-off-by: Al Viro --- fs/file.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fdtable.h | 1 + kernel/fork.c | 130 ------------------------------------------------ 3 files changed, 131 insertions(+), 130 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 754cd05b06a..7dbadaaf00f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -261,6 +261,136 @@ int 
expand_files(struct files_struct *files, int nr) return expand_fdtable(files, nr); } +static int count_open_files(struct fdtable *fdt) +{ + int size = fdt->max_fds; + int i; + + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (fdt->open_fds->fds_bits[--i]) + break; + } + i = (i+1) * 8 * sizeof(long); + return i; +} + +static struct files_struct *alloc_files(void) +{ + struct files_struct *newf; + struct fdtable *fdt; + + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + fdt = &newf->fdtab; + fdt->max_fds = NR_OPEN_DEFAULT; + fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + fdt->open_fds = (fd_set *)&newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); +out: + return newf; +} + +/* + * Allocate a new files structure and copy contents from the + * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. + */ +struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +{ + struct files_struct *newf; + struct file **old_fds, **new_fds; + int open_files, size, i; + struct fdtable *old_fdt, *new_fdt; + + *errorp = -ENOMEM; + newf = alloc_files(); + if (!newf) + goto out; + + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + open_files = count_open_files(old_fdt); + + /* + * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. + */ + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; + spin_unlock(&oldf->file_lock); + spin_lock(&newf->file_lock); + *errorp = expand_files(newf, open_files-1); + spin_unlock(&newf->file_lock); + if (*errorp < 0) + goto out_release; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + } + + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; + + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) { + get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. 
+ */ + FD_CLR(open_files - i, new_fdt->open_fds); + } + rcu_assign_pointer(*new_fds++, f); + } + spin_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + } + + return newf; + +out_release: + kmem_cache_free(files_cachep, newf); +out: + return NULL; +} + static void __devinit fdtable_defer_list_init(int cpu) { struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a118f3c0b24..4aab6f12cfa 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -93,6 +93,7 @@ struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); +struct files_struct *dup_fd(struct files_struct *, int *); extern struct kmem_cache *files_cachep; diff --git a/kernel/fork.c b/kernel/fork.c index 933e60ebcca..19908b26cf8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -660,136 +660,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int count_open_files(struct fdtable *fdt) -{ - int size = fdt->max_fds; - int i; - - /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) - break; - } - i = (i+1) * 8 * sizeof(long); - return i; -} - -static struct files_struct *alloc_files(void) -{ - struct files_struct *newf; - struct fdtable *fdt; - - newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); - if (!newf) - goto out; - - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - fdt = &newf->fdtab; - fdt->max_fds = NR_OPEN_DEFAULT; - fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - fdt->open_fds = (fd_set *)&newf->open_fds_init; - fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&fdt->rcu); - fdt->next = NULL; - rcu_assign_pointer(newf->fdt, fdt); -out: - return newf; -} - -/* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. - */ -static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) -{ - struct files_struct *newf; - struct file **old_fds, **new_fds; - int open_files, size, i; - struct fdtable *old_fdt, *new_fdt; - - *errorp = -ENOMEM; - newf = alloc_files(); - if (!newf) - goto out; - - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - new_fdt = files_fdtable(newf); - open_files = count_open_files(old_fdt); - - /* - * Check whether we need to allocate a larger fd array and fd set. - * Note: we're not a clone task, so the open count won't change. - */ - if (open_files > new_fdt->max_fds) { - new_fdt->max_fds = 0; - spin_unlock(&oldf->file_lock); - spin_lock(&newf->file_lock); - *errorp = expand_files(newf, open_files-1); - spin_unlock(&newf->file_lock); - if (*errorp < 0) - goto out_release; - new_fdt = files_fdtable(newf); - /* - * Reacquire the oldf lock and a pointer to its fd table - * who knows it may have a new bigger fd table. We need - * the latest pointer. 
- */ - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - } - - old_fds = old_fdt->fd; - new_fds = new_fdt->fd; - - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); - - for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; - if (f) { - get_file(f); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ - FD_CLR(open_files - i, new_fdt->open_fds); - } - rcu_assign_pointer(*new_fds++, f); - } - spin_unlock(&oldf->file_lock); - - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); - - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); - } - - return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; -} - static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; -- cgit v1.2.3-70-g09d2 From 9dec3c4d306b09b31331e475e895bb9674e16d81 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 May 2008 21:02:45 -0400 Subject: [PATCH] dup_fd() part 2 use alloc_fdtable() instead of expand_files(), get rid of pointless grabbing newf->file_lock, kill magic in copy_fdtable() that used to be there only to skip copying when called from dup_fd(). Signed-off-by: Al Viro --- fs/file.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 7dbadaaf00f..6491b2b5bc3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -119,8 +119,6 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) unsigned int cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); - if (ofdt->max_fds == 0) - return; cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); @@ -327,14 +325,24 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) * Note: we're not a clone task, so the open count won't change. */ if (open_files > new_fdt->max_fds) { - new_fdt->max_fds = 0; spin_unlock(&oldf->file_lock); - spin_lock(&newf->file_lock); - *errorp = expand_files(newf, open_files-1); - spin_unlock(&newf->file_lock); - if (*errorp < 0) + + new_fdt = alloc_fdtable(open_files - 1); + if (!new_fdt) { + *errorp = -ENOMEM; + goto out_release; + } + + /* beyond sysctl_nr_open; nothing to do */ + if (unlikely(new_fdt->max_fds < open_files)) { + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); + *errorp = -EMFILE; goto out_release; - new_fdt = files_fdtable(newf); + } + rcu_assign_pointer(files->fdt, new_fdt); + /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. 
We need -- cgit v1.2.3-70-g09d2 From afbec7fff4928c273a1f1bb14dfdfdf62688a193 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 May 2008 21:11:17 -0400 Subject: [PATCH] dup_fd() - part 3 merge alloc_files() into dup_fd(), leave setting newf->fdt until the end Signed-off-by: Al Viro --- fs/file.c | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 6491b2b5bc3..689d2b6947e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -273,31 +273,6 @@ static int count_open_files(struct fdtable *fdt) return i; } -static struct files_struct *alloc_files(void) -{ - struct files_struct *newf; - struct fdtable *fdt; - - newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); - if (!newf) - goto out; - - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - fdt = &newf->fdtab; - fdt->max_fds = NR_OPEN_DEFAULT; - fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - fdt->open_fds = (fd_set *)&newf->open_fds_init; - fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&fdt->rcu); - fdt->next = NULL; - rcu_assign_pointer(newf->fdt, fdt); -out: - return newf; -} - /* * Allocate a new files structure and copy contents from the * passed in files structure. @@ -311,13 +286,24 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; - newf = alloc_files(); + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + new_fdt = &newf->fdtab; + new_fdt->max_fds = NR_OPEN_DEFAULT; + new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&new_fdt->rcu); + new_fdt->next = NULL; + spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - new_fdt = files_fdtable(newf); open_files = count_open_files(old_fdt); /* @@ -341,7 +327,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) *errorp = -EMFILE; goto out_release; } - rcu_assign_pointer(files->fdt, new_fdt); /* * Reacquire the oldf lock and a pointer to its fd table @@ -391,6 +376,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } + rcu_assign_pointer(newf->fdt, new_fdt); + return newf; out_release: -- cgit v1.2.3-70-g09d2 From adbecb128cd2cc5d14b0ebef6d020ced0efd0ec6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 May 2008 21:19:42 -0400 Subject: [PATCH] dup_fd() part 4 - race fix Parent _can_ be a clone task, contrary to the comment. Moreover, more files could be opened while we allocate a copy, in which case we end up copying only part into new descriptor table. Since what we get _is_ affected by all changes in the old range, we can get rather weird effects - e.g. dup2(0, 1024); close(0); in parallel with fork() resulting in child that sees the effect of close(), but not that of dup2() done just before that close(). What we need is to recalculate the open_count after having reacquired ->file_lock and if external fdtable we'd just allocated is too small for it, free the sucker and redo allocation. 
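
The fix relies on a classic pattern: drop the lock to allocate, retake it, recount, and retry if the table grew in the meantime. A minimal userspace sketch of that pattern follows; the pthread mutex and the integer table are invented stand-ins for ->file_lock and the fd table, not the kernel code itself.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-ins: a growable table protected by a lock. */
static pthread_mutex_t src_lock = PTHREAD_MUTEX_INITIALIZER;
static int *src_table;
static size_t src_count;          /* may grow while we are not holding src_lock */

/* Copy src_table without holding the lock across the allocation. */
static int *copy_table(size_t *out_count)
{
    int *copy = NULL;
    size_t have = 0, need;

    pthread_mutex_lock(&src_lock);
    need = src_count;
    /* Re-check after every allocation done with the lock dropped:
     * another thread may have grown the table in the meantime. */
    while (need > have) {
        pthread_mutex_unlock(&src_lock);
        free(copy);                       /* too small, throw it away */
        copy = malloc(need * sizeof(*copy));
        if (!copy)
            return NULL;
        have = need;
        pthread_mutex_lock(&src_lock);
        need = src_count;                 /* recount under the lock */
    }
    memcpy(copy, src_table, need * sizeof(*copy));
    pthread_mutex_unlock(&src_lock);
    *out_count = need;
    return copy;
}

int main(void)
{
    src_count = 4;
    src_table = calloc(src_count, sizeof(*src_table));
    size_t n = 0;
    int *snap = copy_table(&n);
    printf("copied %zu entries\n", n);
    free(snap);
    free(src_table);
    return 0;
}

The same shape appears in the diff below: count_open_files() is re-run after ->file_lock is retaken, and an undersized external fdtable is freed before retrying.
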
Signed-off-by: Al Viro --- fs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 689d2b6947e..0f705c7cfef 100644 --- a/fs/file.c +++ b/fs/file.c @@ -308,11 +308,16 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* * Check whether we need to allocate a larger fd array and fd set. - * Note: we're not a clone task, so the open count won't change. */ - if (open_files > new_fdt->max_fds) { + while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); + if (new_fdt != &newf->fdtab) { + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); + } + new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { *errorp = -ENOMEM; @@ -335,6 +340,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); } old_fds = old_fdt->fd; -- cgit v1.2.3-70-g09d2 From eceea0b3df05ed262ae32e0c6340cc7a3626632d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 May 2008 10:08:32 -0400 Subject: [PATCH] avoid multiplication overflows and signedness issues for max_fds Limit sysctl_nr_open - we don't want ->max_fds to exceed MAX_INT and we don't want size calculation for ->fd[] to overflow. Signed-off-by: Al Viro --- fs/file.c | 4 ++++ kernel/sysctl.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 0f705c7cfef..7b3887e054d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,6 +26,8 @@ struct fdtable_defer { }; int sysctl_nr_open __read_mostly = 1024*1024; +int sysctl_nr_open_min = BITS_PER_LONG; +int sysctl_nr_open_max = 1024 * 1024; /* raised later */ /* * We use this list to defer free fdtables that have vmalloced @@ -405,6 +407,8 @@ void __init files_defer_init(void) int i; for_each_possible_cpu(i) fdtable_defer_list_init(i); + sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; } struct files_struct init_files = { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d7ffdc59816..29116652dca 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -81,6 +81,7 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; +extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) @@ -1190,7 +1191,9 @@ static struct ctl_table fs_table[] = { .data = &sysctl_nr_open, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, }, { .ctl_name = FS_DENTRY, -- cgit v1.2.3-70-g09d2 From 5f719558edf9c84bfbb1f7ad37e84c483282d09f Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 6 May 2008 12:45:35 +0800 Subject: [Patch] fs/binfmt_elf.c: fix a wrong free In kmalloc failing path, we shouldn't free pointers in 'info', because the struct 'info' is uninitilized when kmalloc is called. And when kmalloc returns NULL, it's needless to kfree it. 
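
The bug class here is jumping to a cleanup label that releases state which was never set up. A minimal userspace sketch of the corrected label ordering follows; the structure and function names are invented for illustration and are not the binfmt_elf code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct note_info { char *buf; };   /* hypothetical stand-in for dump state */

static int fill_notes(struct note_info *info)
{
    info->buf = strdup("notes");
    return info->buf ? 0 : -1;
}

static void free_note_info(struct note_info *info)
{
    free(info->buf);               /* only valid once fill_notes() has run */
}

static int do_dump(void)
{
    struct note_info info;         /* uninitialized until fill_notes() */
    int ret = -1;

    char *elf = malloc(64);
    if (!elf)
        goto out;                  /* nothing initialized yet: skip all cleanup */

    if (fill_notes(&info) < 0)
        goto cleanup_elf;          /* notes never set up: do not free them */

    ret = 0;                       /* ... real work would go here ... */

    free_note_info(&info);         /* release in reverse order of setup */
cleanup_elf:
    free(elf);
out:
    return ret;
}

int main(void)
{
    return do_dump() ? EXIT_FAILURE : EXIT_SUCCESS;
}
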
Signed-off-by: WANG Cong Cc: Alexander Viro Reviewed-by: Pekka Enberg -- Signed-off-by: Al Viro --- fs/binfmt_elf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b25707fee2c..bd08332079c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1900,7 +1900,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); if (!elf) - goto cleanup; + goto out; segs = current->mm->map_count; #ifdef ELF_CORE_EXTRA_PHDRS @@ -2034,8 +2034,9 @@ end_coredump: set_fs(fs); cleanup: - kfree(elf); free_note_info(&info); + kfree(elf); +out: return has_dumped; } -- cgit v1.2.3-70-g09d2 From 08a6fac1c63233c87eec129938022f1a9a4d51f6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 May 2008 16:38:25 -0400 Subject: [PATCH] get rid of leak in compat_execve() Even though copy_compat_strings() doesn't cache the pages, copy_strings_kernel() and stuff indirectly called by e.g. ->load_binary() is doing that, so we need to drop the cache contents in the end. [found by WANG Cong ] Signed-off-by: Al Viro --- fs/compat.c | 4 ++-- fs/exec.c | 12 ++++++++---- include/linux/binfmts.h | 1 + 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 332a869d2c5..ed43e17a5dc 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1405,7 +1405,7 @@ int compat_do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); - kfree(bprm); + free_bprm(bprm); return retval; } @@ -1424,7 +1424,7 @@ out_file: } out_kfree: - kfree(bprm); + free_bprm(bprm); out_ret: return retval; diff --git a/fs/exec.c b/fs/exec.c index 1f8a24aa1f8..3c2ba7ce11d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1251,6 +1251,12 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) EXPORT_SYMBOL(search_binary_handler); +void free_bprm(struct linux_binprm *bprm) +{ + free_arg_pages(bprm); + kfree(bprm); +} + /* * sys_execve() executes a new program. */ @@ -1320,17 +1326,15 @@ int do_execve(char * filename, retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ - free_arg_pages(bprm); security_bprm_free(bprm); acct_update_integrals(current); - kfree(bprm); + free_bprm(bprm); if (displaced) put_files_struct(displaced); return retval; } out: - free_arg_pages(bprm); if (bprm->security) security_bprm_free(bprm); @@ -1344,7 +1348,7 @@ out_file: fput(bprm->file); } out_kfree: - kfree(bprm); + free_bprm(bprm); out_files: if (displaced) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b512e48f6d8..ee0ed48e834 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -99,6 +99,7 @@ extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern void compute_creds(struct linux_binprm *binprm); extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); extern int set_binfmt(struct linux_binfmt *new); +extern void free_bprm(struct linux_binprm *); #endif /* __KERNEL__ */ #endif /* _LINUX_BINFMTS_H */ -- cgit v1.2.3-70-g09d2 From 23c4971e3d97de4e1b7961ca6eacee35aa15ce5f Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 8 May 2008 21:52:33 +0800 Subject: [Patch] fs/binfmt_elf.c: fix wrong return values create_elf_tables() returns 0 on success. But when strnlen_user() "fails", it returns 0 directly. So this is wrong. 
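
The rule being restored is that a helper whose callers treat 0 as success must report bad input with a negative errno value, never with 0. A small userspace sketch under that convention; check_arg() and its length limit are invented for illustration.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Validate a NUL-terminated argument string against a length limit.
 * Returns 0 on success and a negative errno value on failure, so the
 * caller's "if (ret < 0)" check actually fires on bad input. */
static int check_arg(const char *arg, size_t max_len)
{
    size_t len = strnlen(arg, max_len + 1);

    if (len == 0 || len > max_len)
        return -EINVAL;   /* returning 0 here would be silently treated as success */
    return 0;
}

int main(void)
{
    const char *args[] = { "ok", "", "waaaaay-too-long" };
    for (size_t i = 0; i < 3; i++) {
        int ret = check_arg(args[i], 8);
        printf("arg %zu -> %d\n", i, ret);
    }
    return 0;
}
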
Signed-off-by: WANG Cong Cc: Alexander Viro Signed-off-by: Al Viro --- fs/binfmt_elf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bd08332079c..0fa95b198e6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -256,7 +256,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) - return 0; + return -EINVAL; p += len; } if (__put_user(0, argv)) @@ -268,7 +268,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) - return 0; + return -EINVAL; p += len; } if (__put_user(0, envp)) -- cgit v1.2.3-70-g09d2 From e9baf6e59842285bcf9570f5094e4c27674a0f7c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 May 2008 04:49:12 -0400 Subject: [PATCH] return to old errno choice in mkdir() et.al. In case when both EEXIST and EROFS would apply we used to return the former in mkdir(2) and friends. Lest anyone suspects us of being consistent, in the same situation knfsd gave clients nfs_erofs... ro-bind series had switched the syscall side of things to returning -EROFS and immediately broke an application - namely, mkdir -p. Patch restores the original behaviour... Signed-off-by: Al Viro --- fs/namei.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 32fd9655485..c7e43536c49 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2003,18 +2003,22 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir) if (IS_ERR(dentry)) goto fail; + if (dentry->d_inode) + goto eexist; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) - goto enoent; + if (unlikely(!is_dir && nd->last.name[nd->last.len])) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + } return dentry; -enoent: +eexist: dput(dentry); - dentry = ERR_PTR(-ENOENT); + dentry = ERR_PTR(-EEXIST); fail: return dentry; } -- cgit v1.2.3-70-g09d2 From 2b280fab12b6697b6a7a24a13aaf9f4339edd075 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 17 May 2008 03:12:45 +0000 Subject: [CIFS] add more complete mount options to cifs_show_options adds various options to cifs_show_options (displayed when you cat /proc/mounts with a cifs mount). I limited the new ones to values that are associated with the mount with the exception of "seal" (which is a per tree connection property, but I thought was important enough to show through). Eventually cifs's parse_mount_options also needs to be rewritten to use the match_token API but that would be a big enough change that I would prefer that changing parse_mount_options wait until next release. 
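
The message above notes that option parsing should eventually move to a token-table style API such as match_token. A rough userspace sketch of the table-driven idea follows; the table, token names, and simple prefix matching are illustrations, not the kernel parser API.

#include <stdio.h>
#include <string.h>

enum { Opt_hard, Opt_seal, Opt_rsize, Opt_err };

static const struct { int token; const char *pattern; } opt_table[] = {
    { Opt_hard,  "hard"   },
    { Opt_seal,  "seal"   },
    { Opt_rsize, "rsize=" },   /* value options match on their prefix here */
    { Opt_err,   NULL     },
};

/* Map one mount option string to a token, returning any value suffix. */
static int match_option(const char *opt, const char **value)
{
    for (int i = 0; opt_table[i].pattern; i++) {
        size_t n = strlen(opt_table[i].pattern);
        if (strncmp(opt, opt_table[i].pattern, n) == 0) {
            *value = opt[n] ? opt + n : NULL;
            return opt_table[i].token;
        }
    }
    *value = NULL;
    return Opt_err;
}

int main(void)
{
    const char *opts[] = { "hard", "rsize=16384", "bogus" };
    for (int i = 0; i < 3; i++) {
        const char *val;
        int tok = match_option(opts[i], &val);
        printf("%s -> token %d%s%s\n", opts[i], tok,
               val ? ", value " : "", val ? val : "");
    }
    return 0;
}
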
Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b6436b888cf..57e40c49d3b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1,7 +1,7 @@ /* * fs/cifs/cifsfs.c * - * Copyright (C) International Business Machines Corp., 2002,2007 + * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * * Common Internet FileSystem (CIFS) client @@ -353,9 +353,38 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) || !(cifs_sb->tcon->unix_ext)) seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); + if (!cifs_sb->tcon->unix_ext) { + seq_printf(s, ",file_mode=0%o,dir_mode=0%o", + cifs_sb->mnt_file_mode, + cifs_sb->mnt_dir_mode); + } + if (cifs_sb->tcon->seal) + seq_printf(s, ",seal"); + seq_printf(s, ",nocase"); } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) seq_printf(s, ",posixpaths"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) + seq_printf(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + seq_printf(s, ",serverino"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) + seq_printf(s, ",directio"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) + seq_printf(s, ",nouser_xattr"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + seq_printf(s, ",mapchars"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + seq_printf(s, ",sfu"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + seq_printf(s, ",nobrl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + seq_printf(s, ",cifsacl"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + seq_printf(s, ",dynperm"); + if (m->mnt_sb->s_flags & MS_POSIXACL) + seq_printf(s, ",acl"); + seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); } -- cgit v1.2.3-70-g09d2 From 6ee650467d5bf972d10441e99688e9b48171f99c Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Tue, 29 Apr 2008 15:01:13 -0400 Subject: [PATCH] open sessionid permissions The current permissions on sessionid are a little too restrictive. Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 808cbdc193d..c447e0743a3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2441,7 +2441,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, loginuid), - REG("sessionid", S_IRUSR, sessionid), + REG("sessionid", S_IRUGO, sessionid), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), -- cgit v1.2.3-70-g09d2 From b55e0ba19cd2d83317a7f1dbb626080951442ab3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 28 Apr 2008 18:22:50 -0400 Subject: nfsd: remove unnecessary atomic ops These bit operations don't need to be atomic. They're all done under a single big mutex anyway. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8799b870818..5c97d47676a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1579,8 +1579,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta } /* remember the open */ filp->f_mode |= open->op_share_access; - set_bit(open->op_share_access, &stp->st_access_bmap); - set_bit(open->op_share_deny, &stp->st_deny_bmap); + __set_bit(open->op_share_access, &stp->st_access_bmap); + __set_bit(open->op_share_deny, &stp->st_deny_bmap); return nfs_ok; } -- cgit v1.2.3-70-g09d2 From 88dd0be3874566796fa4ffbdf927a53c4a6a2f4b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 5 May 2008 19:47:29 -0400 Subject: nfsd: reorder printk in do_probe_callback to avoid use-after-free We're currently dereferencing the client after we drop our reference count to it. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 0b3ffa9840c..4d4760e687c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -419,9 +419,9 @@ static int do_probe_callback(void *data) out_release_client: rpc_shutdown_client(client); out_err: - put_nfs4_client(clp); dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); + put_nfs4_client(clp); return status; } -- cgit v1.2.3-70-g09d2 From 7a936ce71eed7b887b8a0d6c54dd8a9072f71c9f Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 12 May 2008 10:04:51 -0500 Subject: dlm: convert connections_lock in a mutex The semaphore connections_lock is used as a mutex. Convert it to the mutex API. 
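
The rationale for the conversion is that a semaphore initialized to 1 and used only in lock/unlock pairs expresses mutual exclusion less directly than a real mutex, which can also record an owner when debugging is enabled. A userspace analogue of the two styles, as a sketch only:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

/* Old style: a semaphore with an initial count of 1 used purely for
 * mutual exclusion. It works, but nothing records an owner. */
static sem_t table_sem;

/* Preferred style: an actual mutex, which states the intent directly. */
static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;

static int shared_value;

static void update_with_sem(int v)
{
    sem_wait(&table_sem);      /* the "down()" in the old code */
    shared_value = v;
    sem_post(&table_sem);      /* the "up()" */
}

static void update_with_mutex(int v)
{
    pthread_mutex_lock(&table_mutex);
    shared_value = v;
    pthread_mutex_unlock(&table_mutex);
}

int main(void)
{
    sem_init(&table_sem, 0, 1);
    update_with_sem(1);
    update_with_mutex(2);
    printf("shared_value = %d\n", shared_value);
    sem_destroy(&table_sem);
    return 0;
}
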
Signed-off-by: Matthias Kaehlcke Cc: Christine Caulfield Cc: David Teigland Cc: Steven Whitehouse Signed-off-by: Andrew Morton Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 7c1e5e5cccd..c7d232a9ae1 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -138,7 +139,7 @@ static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *send_workqueue; static DEFINE_IDR(connections_idr); -static DECLARE_MUTEX(connections_lock); +static DEFINE_MUTEX(connections_lock); static int max_nodeid; static struct kmem_cache *con_cache; @@ -205,9 +206,9 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation) { struct connection *con; - down(&connections_lock); + mutex_lock(&connections_lock); con = __nodeid2con(nodeid, allocation); - up(&connections_lock); + mutex_unlock(&connections_lock); return con; } @@ -218,15 +219,15 @@ static struct connection *assoc2con(int assoc_id) int i; struct connection *con; - down(&connections_lock); + mutex_lock(&connections_lock); for (i=0; i<=max_nodeid; i++) { con = __nodeid2con(i, 0); if (con && con->sctp_assoc == assoc_id) { - up(&connections_lock); + mutex_unlock(&connections_lock); return con; } } - up(&connections_lock); + mutex_unlock(&connections_lock); return NULL; } @@ -381,7 +382,7 @@ static void sctp_init_failed(void) int i; struct connection *con; - down(&connections_lock); + mutex_lock(&connections_lock); for (i=1; i<=max_nodeid; i++) { con = __nodeid2con(i, 0); if (!con) @@ -393,7 +394,7 @@ static void sctp_init_failed(void) } } } - up(&connections_lock); + mutex_unlock(&connections_lock); } /* Something happened to an association */ @@ -1417,7 +1418,7 @@ void dlm_lowcomms_stop(void) /* Set all the flags to prevent any socket activity. */ - down(&connections_lock); + mutex_lock(&connections_lock); for (i = 0; i <= max_nodeid; i++) { con = __nodeid2con(i, 0); if (con) { @@ -1426,11 +1427,11 @@ void dlm_lowcomms_stop(void) con->sock->sk->sk_user_data = NULL; } } - up(&connections_lock); + mutex_unlock(&connections_lock); work_stop(); - down(&connections_lock); + mutex_lock(&connections_lock); clean_writequeues(); for (i = 0; i <= max_nodeid; i++) { @@ -1443,7 +1444,7 @@ void dlm_lowcomms_stop(void) } } max_nodeid = 0; - up(&connections_lock); + mutex_unlock(&connections_lock); kmem_cache_destroy(con_cache); idr_init(&connections_idr); } -- cgit v1.2.3-70-g09d2 From 88ad23195e4609cef73b6fcf2b4c08aaaef33204 Mon Sep 17 00:00:00 2001 From: Leonardo Potenza Date: Sun, 11 May 2008 19:15:34 +0200 Subject: dlm: section mismatch warning fix Removed the section mismatch message: WARNING: fs/dlm/dlm.o(.init.text+0x132): Section mismatch in reference from the function init_module() to the function .exit.text:dlm_netlink_exit() Since dlm_netlink_exit() is called in the init_dlm() error handling, the __exit annotation has been removed. 
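
The constraint behind the warning is that code placed in .exit.text may be discarded for built-in objects, so an __init function's error path must not call into it. A minimal module-style sketch of the pattern, with invented names, showing why the cleanup helper has to stay in regular text:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>

/* Cleanup helper: deliberately NOT marked __exit, because the init
 * function's error path below calls it. Marking it __exit would allow
 * the section to be discarded and would trigger a section mismatch. */
static void demo_netlink_exit(void)
{
    pr_info("demo: tearing down\n");
}

static int __init demo_init(void)
{
    int err;

    err = 0;   /* imagine registering netlink ops here */
    if (err) {
        demo_netlink_exit();   /* error path reaches the cleanup helper */
        return err;
    }
    pr_info("demo: loaded\n");
    return 0;
}

static void __exit demo_exit(void)
{
    demo_netlink_exit();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
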
Signed-off-by: Leonardo Potenza Signed-off-by: David Teigland --- fs/dlm/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 714593621f4..18bda83cc89 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -95,7 +95,7 @@ int __init dlm_netlink_init(void) return rv; } -void __exit dlm_netlink_exit(void) +void dlm_netlink_exit(void) { genl_unregister_ops(&family, &dlm_nl_ops); genl_unregister_family(&family); -- cgit v1.2.3-70-g09d2 From 0035a4b14931eb62a5f8a7762284c18e7ab14289 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 11 May 2008 22:01:29 +0200 Subject: dlm: tcp_connect_to_sock should check for -EINVAL, not EINVAL Signed-off-by: Marcin Slusarz Cc: Christine Caulfield Cc: David Teigland Cc: cluster-devel@redhat.com Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c7d232a9ae1..637018c891e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -931,7 +931,7 @@ out_err: * errors we try again until the max number of retries is reached. */ if (result != -EHOSTUNREACH && result != -ENETUNREACH && - result != -ENETDOWN && result != EINVAL + result != -ENETDOWN && result != -EINVAL && result != -EPROTONOSUPPORT) { lowcomms_connect_sock(con); result = 0; -- cgit v1.2.3-70-g09d2 From 817d10bad56f2fdfa321b4a864a21295226b123a Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 13 May 2008 14:28:26 -0500 Subject: dlm: fix plock dev_write return value The return value on writes to the plock device should be the number of bytes written. It was returning 0 instead when an nfs lock callback was involved. Reported-by: Nathan Straz Signed-off-by: David Teigland --- fs/dlm/plock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index d6d6e370f89..78878c5781c 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -379,7 +379,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, struct plock_xop *xop; xop = (struct plock_xop *)op; if (xop->callback) - count = dlm_plock_callback(op); + dlm_plock_callback(op); else wake_up(&recv_wq); } else -- cgit v1.2.3-70-g09d2 From 89562b777c50d100d1694db7b1b023279839b9ae Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 19 May 2008 22:26:42 +0000 Subject: [CIFS] add missing seq_printf to cifs_show_options for hard mount option Also Kari Hurtta noticed a missing check in the same function which is now fixed. CC: Kari Hurtta Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 57e40c49d3b..5df93fd6303 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -360,7 +360,10 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) } if (cifs_sb->tcon->seal) seq_printf(s, ",seal"); + if (cifs_sb->tcon->nocase) seq_printf(s, ",nocase"); + if (cifs_sb->tcon->retry) + seq_printf(s, ",hard"); } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) seq_printf(s, ",posixpaths"); -- cgit v1.2.3-70-g09d2 From 0e4bbde94fdc33f5b3d793166b21bf768ca3e098 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 20 May 2008 19:50:46 +0000 Subject: [CIFS] Enable DFS support for Unix query path info Final piece for handling DFS in unix_query_path_info, constructing a fake inode for the junction directory which the submount will cover. 
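
The approach described above, synthesizing plausible directory metadata locally when the server reports that the path is a DFS junction, can be sketched in plain C. The structures, defaults, and the EREMOTE stand-in below are invented for illustration and are not the CIFS wire format.

#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <time.h>

#define EREMOTE_SENTINEL 1   /* stand-in for the server's -EREMOTE answer */

/* Hypothetical local record for a path lookup result. */
struct path_info {
    mode_t  mode;
    nlink_t nlink;
    time_t  mtime;
};

/* Pretend server query: a DFS junction cannot be described remotely.
 * (A real query would fill *out for ordinary paths.) */
static int query_server(const char *path, struct path_info *out)
{
    (void)out;
    return strstr(path, "junction") ? EREMOTE_SENTINEL : 0;
}

/* Fill sane defaults for the junction directory the submount will cover. */
static void fill_fake_dir(struct path_info *out)
{
    memset(out, 0, sizeof(*out));        /* zero first, then set defaults */
    out->mode  = S_IFDIR | 0711;
    out->nlink = 2;                      /* "." and the parent link */
    out->mtime = time(NULL);
}

static int get_path_info(const char *path, struct path_info *out)
{
    int rc = query_server(path, out);
    if (rc == EREMOTE_SENTINEL) {
        fill_fake_dir(out);              /* placeholder until the referral is mounted */
        rc = 0;
    }
    return rc;
}

int main(void)
{
    struct path_info pi;
    if (get_path_info("/share/junction", &pi) == 0)
        printf("mode %o, nlink %lu\n", (unsigned)pi.mode, (unsigned long)pi.nlink);
    return 0;
}
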
Acked-by: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/AUTHORS | 1 + fs/cifs/CHANGES | 2 + fs/cifs/TODO | 15 ++++---- fs/cifs/inode.c | 117 +++++++++++++++++++++++++++++++++++--------------------- 4 files changed, 85 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS index 8848e4dfa02..9c136d7803d 100644 --- a/fs/cifs/AUTHORS +++ b/fs/cifs/AUTHORS @@ -36,6 +36,7 @@ Miklos Szeredi Kazeon team for various fixes especially for 2.4 version. Asser Ferno (Change Notify support) Shaggy (Dave Kleikamp) for inumerable small fs suggestions and some good cleanup +Igor Mammedov (DFS support) Test case and Bug Report contributors ------------------------------------- diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 502a4c2b841..28e3d5c5fca 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,5 +1,7 @@ Version 1.53 ------------ +DFS support added (Microsoft Distributed File System client support needed +for referrals which enable a hierarchical name space among servers). Version 1.52 ------------ diff --git a/fs/cifs/TODO b/fs/cifs/TODO index 92c9feac440..5aff46c61e5 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -1,4 +1,4 @@ -Version 1.52 January 3, 2008 +Version 1.53 May 20, 2008 A Partial List of Missing Features ================================== @@ -20,20 +20,21 @@ d) Cleanup now unneeded SessSetup code in fs/cifs/connect.c and add back in NTLMSSP code if any servers need it -e) ms-dfs and ms-dfs host name resolution cleanup - -f) fix NTLMv2 signing when two mounts with different users to same +e) fix NTLMv2 signing when two mounts with different users to same server. -g) Directory entry caching relies on a 1 second timer, rather than +f) Directory entry caching relies on a 1 second timer, rather than using FindNotify or equivalent. - (started) -h) quota support (needs minor kernel change since quota calls +g) quota support (needs minor kernel change since quota calls to make it to network filesystems or deviceless filesystems) -i) investigate sync behavior (including syncpage) and check +h) investigate sync behavior (including syncpage) and check for proper behavior of intr/nointr +i) improve support for very old servers (OS/2 and Win9x for example) +Including support for changing the time remotely (utimes command). + j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the extra copy in/out of the socket buffers in some cases. 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9d9b56a9c08..422d4e219fa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -161,77 +161,108 @@ static void cifs_unix_info_to_inode(struct inode *inode, spin_unlock(&inode->i_lock); } +static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, + struct super_block *sb) +{ + struct inode *pinode = NULL; + + memset(pfnd_dat, sizeof(FILE_UNIX_BASIC_INFO), 0); + +/* __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); + __le64 pfnd_dat->NumOfBytes = cpu_to_le64(0); + __u64 UniqueId = 0; */ + pfnd_dat->LastStatusChange = + cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + pfnd_dat->LastAccessTime = + cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + pfnd_dat->LastModificationTime = + cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + pfnd_dat->Type = cpu_to_le32(UNIX_DIR); + pfnd_dat->Permissions = cpu_to_le64(S_IXUGO | S_IRWXU); + pfnd_dat->Nlinks = cpu_to_le64(2); + if (sb->s_root) + pinode = sb->s_root->d_inode; + if (pinode == NULL) + return; + + /* fill in default values for the remaining based on root + inode since we can not query the server for this inode info */ + pfnd_dat->DevMajor = cpu_to_le64(MAJOR(pinode->i_rdev)); + pfnd_dat->DevMinor = cpu_to_le64(MINOR(pinode->i_rdev)); + pfnd_dat->Uid = cpu_to_le64(pinode->i_uid); + pfnd_dat->Gid = cpu_to_le64(pinode->i_gid); +} + int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *full_path, struct super_block *sb, int xid) { int rc = 0; - FILE_UNIX_BASIC_INFO findData; + FILE_UNIX_BASIC_INFO find_data; struct cifsTconInfo *pTcon; struct inode *inode; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); bool is_dfs_referral = false; + struct cifsInodeInfo *cifsInfo; + __u64 num_of_bytes; + __u64 end_of_file; pTcon = cifs_sb->tcon; cFYI(1, ("Getting info on %s", full_path)); -try_again_CIFSSMBUnixQPathInfo: /* could have done a find first instead but this returns more info */ - rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &findData, + rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); -/* dump_mem("\nUnixQPathInfo return data", &findData, - sizeof(findData)); */ if (rc) { if (rc == -EREMOTE && !is_dfs_referral) { is_dfs_referral = true; - goto try_again_CIFSSMBUnixQPathInfo; + cERROR(1, ("DFS ref")); /* BB removeme BB */ + /* for DFS, server does not give us real inode data */ + fill_fake_finddataunix(&find_data, sb); + rc = 0; } - goto cgiiu_exit; - } else { - struct cifsInodeInfo *cifsInfo; - __u64 num_of_bytes = le64_to_cpu(findData.NumOfBytes); - __u64 end_of_file = le64_to_cpu(findData.EndOfFile); + } + num_of_bytes = le64_to_cpu(find_data.NumOfBytes); + end_of_file = le64_to_cpu(find_data.EndOfFile); - /* get new inode */ + /* get new inode */ + if (*pinode == NULL) { + *pinode = new_inode(sb); if (*pinode == NULL) { - *pinode = new_inode(sb); - if (*pinode == NULL) { - rc = -ENOMEM; - goto cgiiu_exit; - } - /* Is an i_ino of zero legal? */ - /* Are there sanity checks we can use to ensure that - the server is really filling in that field? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - (*pinode)->i_ino = - (unsigned long)findData.UniqueId; - } /* note ino incremented to unique num in new_inode */ - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - - insert_inode_hash(*pinode); + rc = -ENOMEM; + goto cgiiu_exit; } + /* Is an i_ino of zero legal? 
*/ + /* note ino incremented to unique num in new_inode */ + /* Are there sanity checks we can use to ensure that + the server is really filling in that field? */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + (*pinode)->i_ino = (unsigned long)find_data.UniqueId; - inode = *pinode; - cifsInfo = CIFS_I(inode); + if (sb->s_flags & MS_NOATIME) + (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - cFYI(1, ("Old time %ld", cifsInfo->time)); - cifsInfo->time = jiffies; - cFYI(1, ("New time %ld", cifsInfo->time)); - /* this is ok to set on every inode revalidate */ - atomic_set(&cifsInfo->inUse, 1); + insert_inode_hash(*pinode); + } - cifs_unix_info_to_inode(inode, &findData, 0); + inode = *pinode; + cifsInfo = CIFS_I(inode); + cFYI(1, ("Old time %ld", cifsInfo->time)); + cifsInfo->time = jiffies; + cFYI(1, ("New time %ld", cifsInfo->time)); + /* this is ok to set on every inode revalidate */ + atomic_set(&cifsInfo->inUse, 1); - if (num_of_bytes < end_of_file) - cFYI(1, ("allocation size less than end of file")); - cFYI(1, ("Size %ld and blocks %llu", - (unsigned long) inode->i_size, - (unsigned long long)inode->i_blocks)); + cifs_unix_info_to_inode(inode, &find_data, 0); - cifs_set_ops(inode, is_dfs_referral); - } + if (num_of_bytes < end_of_file) + cFYI(1, ("allocation size less than end of file")); + cFYI(1, ("Size %ld and blocks %llu", + (unsigned long) inode->i_size, + (unsigned long long)inode->i_blocks)); + + cifs_set_ops(inode, is_dfs_referral); cgiiu_exit: return rc; } -- cgit v1.2.3-70-g09d2 From b9a3260f25ab5d2ba5c8b9508e7952848b9d704b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 20 May 2008 21:52:32 +0000 Subject: [CIFS] Enable DFS support for Windows query path info Final piece for handling DFS in query_path_info, constructing a fake inode for the junction directory which the submount will cover. This handles the non-Unix (Windows etc.) code path. 
Signed-off-by: Steve French --- fs/cifs/inode.c | 324 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 178 insertions(+), 146 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 422d4e219fa..1cf43e10194 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -161,6 +161,12 @@ static void cifs_unix_info_to_inode(struct inode *inode, spin_unlock(&inode->i_lock); } + +/* + * Needed to setup inode data for the directory which is the + * junction to the new submount (ie to setup the fake directory + * which represents a DFS referral) + */ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, struct super_block *sb) { @@ -370,11 +376,42 @@ static int get_sfu_mode(struct inode *inode, #endif } +/* + * Needed to setup inode data for the directory which is the + * junction to the new submount (ie to setup the fake directory + * which represents a DFS referral) + */ +static void fill_fake_finddata(FILE_ALL_INFO *pfnd_dat, + struct super_block *sb) +{ + memset(pfnd_dat, sizeof(FILE_ALL_INFO), 0); + +/* __le64 pfnd_dat->AllocationSize = cpu_to_le64(0); + __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); + __u8 pfnd_dat->DeletePending = 0; + __u8 pfnd_data->Directory = 0; + __le32 pfnd_dat->EASize = 0; + __u64 pfnd_dat->IndexNumber = 0; + __u64 pfnd_dat->IndexNumber1 = 0; */ + pfnd_dat->CreationTime = + cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + pfnd_dat->LastAccessTime = + cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + pfnd_dat->LastWriteTime = + cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + pfnd_dat->ChangeTime = + cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + pfnd_dat->Attributes = cpu_to_le32(ATTR_DIRECTORY); + pfnd_dat->NumberOfLinks = cpu_to_le32(2); +} + int cifs_get_inode_info(struct inode **pinode, const unsigned char *full_path, FILE_ALL_INFO *pfindData, struct super_block *sb, int xid, const __u16 *pfid) { int rc = 0; + __u32 attr; + struct cifsInodeInfo *cifsInfo; struct cifsTconInfo *pTcon; struct inode *inode; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -399,7 +436,6 @@ int cifs_get_inode_info(struct inode **pinode, return -ENOMEM; pfindData = (FILE_ALL_INFO *)buf; -try_again_CIFSSMBQPathInfo: /* could do find first instead but this returns more info */ rc = CIFSSMBQPathInfo(xid, pTcon, full_path, pfindData, 0 /* not legacy */, @@ -417,171 +453,167 @@ try_again_CIFSSMBQPathInfo: } } /* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */ - if (rc) { - if (rc == -EREMOTE && !is_dfs_referral) { - is_dfs_referral = true; - goto try_again_CIFSSMBQPathInfo; - } + if (rc == -EREMOTE) { + is_dfs_referral = true; + fill_fake_finddata(pfindData, sb); + rc = 0; + } else if (rc) goto cgii_exit; - } else { - struct cifsInodeInfo *cifsInfo; - __u32 attr = le32_to_cpu(pfindData->Attributes); - /* get new inode */ + attr = le32_to_cpu(pfindData->Attributes); + + /* get new inode */ + if (*pinode == NULL) { + *pinode = new_inode(sb); if (*pinode == NULL) { - *pinode = new_inode(sb); - if (*pinode == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } - /* Is an i_ino of zero legal? Can we use that to check - if the server supports returning inode numbers? Are - there other sanity checks we can use to ensure that - the server is really filling in that field? */ + rc = -ENOMEM; + goto cgii_exit; + } + /* Is an i_ino of zero legal? Can we use that to check + if the server supports returning inode numbers? Are + there other sanity checks we can use to ensure that + the server is really filling in that field? 
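
Both fake-inode helpers (the Unix one earlier and the Windows one in the diff below) begin by zeroing the info structure and then filling in a handful of defaults. As a reminder of the standard convention, memset() takes the destination, then the fill byte, then the length; a tiny sketch with an invented structure:

#include <stdio.h>
#include <string.h>

struct find_data {
    unsigned long long end_of_file;
    unsigned int       attributes;
    unsigned int       nlinks;
};

int main(void)
{
    struct find_data fd;

    /* Destination, fill byte, length, in that order. Passing the
     * length as the second argument would instead fill zero bytes. */
    memset(&fd, 0, sizeof(fd));

    fd.attributes = 0x10;   /* e.g. a directory attribute bit */
    fd.nlinks = 2;

    printf("eof=%llu attr=0x%x nlinks=%u\n",
           fd.end_of_file, fd.attributes, fd.nlinks);
    return 0;
}
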
*/ - /* We can not use the IndexNumber field by default from - Windows or Samba (in ALL_INFO buf) but we can request - it explicitly. It may not be unique presumably if - the server has multiple devices mounted under one - share */ + /* We can not use the IndexNumber field by default from + Windows or Samba (in ALL_INFO buf) but we can request + it explicitly. It may not be unique presumably if + the server has multiple devices mounted under one share */ - /* There may be higher info levels that work but are - there Windows server or network appliances for which - IndexNumber field is not guaranteed unique? */ + /* There may be higher info levels that work but are + there Windows server or network appliances for which + IndexNumber field is not guaranteed unique? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - int rc1 = 0; - __u64 inode_num; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + int rc1 = 0; + __u64 inode_num; - rc1 = CIFSGetSrvInodeNumber(xid, pTcon, + rc1 = CIFSGetSrvInodeNumber(xid, pTcon, full_path, &inode_num, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc1) { - cFYI(1, ("GetSrvInodeNum rc %d", rc1)); - /* BB EOPNOSUPP disable SERVER_INUM? */ - } else /* do we need cast or hash to ino? */ - (*pinode)->i_ino = inode_num; - } /* else ino incremented to unique num in new_inode*/ - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - insert_inode_hash(*pinode); - } - inode = *pinode; - cifsInfo = CIFS_I(inode); - cifsInfo->cifsAttrs = attr; - cFYI(1, ("Old time %ld", cifsInfo->time)); - cifsInfo->time = jiffies; - cFYI(1, ("New time %ld", cifsInfo->time)); - - /* blksize needs to be multiple of two. So safer to default to - blksize and blkbits set in superblock so 2**blkbits and blksize - will match rather than setting to: - (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/ - - /* Linux can not store file creation time so ignore it */ - if (pfindData->LastAccessTime) - inode->i_atime = cifs_NTtimeToUnix - (le64_to_cpu(pfindData->LastAccessTime)); - else /* do not need to use current_fs_time - time not stored */ - inode->i_atime = CURRENT_TIME; - inode->i_mtime = + if (rc1) { + cFYI(1, ("GetSrvInodeNum rc %d", rc1)); + /* BB EOPNOSUPP disable SERVER_INUM? */ + } else /* do we need cast or hash to ino? */ + (*pinode)->i_ino = inode_num; + } /* else ino incremented to unique num in new_inode*/ + if (sb->s_flags & MS_NOATIME) + (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; + insert_inode_hash(*pinode); + } + inode = *pinode; + cifsInfo = CIFS_I(inode); + cifsInfo->cifsAttrs = attr; + cFYI(1, ("Old time %ld", cifsInfo->time)); + cifsInfo->time = jiffies; + cFYI(1, ("New time %ld", cifsInfo->time)); + + /* blksize needs to be multiple of two. 
So safer to default to + blksize and blkbits set in superblock so 2**blkbits and blksize + will match rather than setting to: + (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/ + + /* Linux can not store file creation time so ignore it */ + if (pfindData->LastAccessTime) + inode->i_atime = cifs_NTtimeToUnix + (le64_to_cpu(pfindData->LastAccessTime)); + else /* do not need to use current_fs_time - time not stored */ + inode->i_atime = CURRENT_TIME; + inode->i_mtime = cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); - inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); - cFYI(0, ("Attributes came in as 0x%x", attr)); - if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { - inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; - inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj; - } + inode->i_ctime = + cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); + cFYI(DBG2, ("Attributes came in as 0x%x", attr)); + if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { + inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; + inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj; + } - /* set default mode. will override for dirs below */ - if (atomic_read(&cifsInfo->inUse) == 0) - /* new inode, can safely set these fields */ - inode->i_mode = cifs_sb->mnt_file_mode; - else /* since we set the inode type below we need to mask off - to avoid strange results if type changes and both - get orred in */ - inode->i_mode &= ~S_IFMT; -/* if (attr & ATTR_REPARSE) */ - /* We no longer handle these as symlinks because we could not - follow them due to the absolute path with drive letter */ - if (attr & ATTR_DIRECTORY) { - /* override default perms since we do not do byte range locking - on dirs */ - inode->i_mode = cifs_sb->mnt_dir_mode; - inode->i_mode |= S_IFDIR; - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (cifsInfo->cifsAttrs & ATTR_SYSTEM) && - /* No need to le64 convert size of zero */ - (pfindData->EndOfFile == 0)) { - inode->i_mode = cifs_sb->mnt_file_mode; - inode->i_mode |= S_IFIFO; + /* set default mode. will override for dirs below */ + if (atomic_read(&cifsInfo->inUse) == 0) + /* new inode, can safely set these fields */ + inode->i_mode = cifs_sb->mnt_file_mode; + else /* since we set the inode type below we need to mask off + to avoid strange results if type changes and both + get orred in */ + inode->i_mode &= ~S_IFMT; +/* if (attr & ATTR_REPARSE) */ + /* We no longer handle these as symlinks because we could not + follow them due to the absolute path with drive letter */ + if (attr & ATTR_DIRECTORY) { + /* override default perms since we do not do byte range locking + on dirs */ + inode->i_mode = cifs_sb->mnt_dir_mode; + inode->i_mode |= S_IFDIR; + } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && + (cifsInfo->cifsAttrs & ATTR_SYSTEM) && + /* No need to le64 convert size of zero */ + (pfindData->EndOfFile == 0)) { + inode->i_mode = cifs_sb->mnt_file_mode; + inode->i_mode |= S_IFIFO; /* BB Finish for SFU style symlinks and devices */ - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (cifsInfo->cifsAttrs & ATTR_SYSTEM)) { - if (decode_sfu_inode(inode, - le64_to_cpu(pfindData->EndOfFile), - full_path, - cifs_sb, xid)) - cFYI(1, ("Unrecognized sfu inode type")); - - cFYI(1, ("sfu mode 0%o", inode->i_mode)); - } else { - inode->i_mode |= S_IFREG; - /* treat the dos attribute of read-only as read-only - mode e.g. 
555 */ - if (cifsInfo->cifsAttrs & ATTR_READONLY) - inode->i_mode &= ~(S_IWUGO); - else if ((inode->i_mode & S_IWUGO) == 0) - /* the ATTR_READONLY flag may have been */ - /* changed on server -- set any w bits */ - /* allowed by mnt_file_mode */ - inode->i_mode |= (S_IWUGO & - cifs_sb->mnt_file_mode); - /* BB add code here - - validate if device or weird share or device type? */ - } + } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && + (cifsInfo->cifsAttrs & ATTR_SYSTEM)) { + if (decode_sfu_inode(inode, le64_to_cpu(pfindData->EndOfFile), + full_path, cifs_sb, xid)) + cFYI(1, ("Unrecognized sfu inode type")); - spin_lock(&inode->i_lock); - if (is_size_safe_to_change(cifsInfo, - le64_to_cpu(pfindData->EndOfFile))) { - /* can not safely shrink the file size here if the - client is writing to it due to potential races */ - i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); - - /* 512 bytes (2**9) is the fake blocksize that must be - used for this calculation */ - inode->i_blocks = (512 - 1 + le64_to_cpu( - pfindData->AllocationSize)) >> 9; - } - spin_unlock(&inode->i_lock); + cFYI(1, ("sfu mode 0%o", inode->i_mode)); + } else { + inode->i_mode |= S_IFREG; + /* treat dos attribute of read-only as read-only mode eg 555 */ + if (cifsInfo->cifsAttrs & ATTR_READONLY) + inode->i_mode &= ~(S_IWUGO); + else if ((inode->i_mode & S_IWUGO) == 0) + /* the ATTR_READONLY flag may have been */ + /* changed on server -- set any w bits */ + /* allowed by mnt_file_mode */ + inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode); + /* BB add code to validate if device or weird share or device type? */ + } + + spin_lock(&inode->i_lock); + if (is_size_safe_to_change(cifsInfo, + le64_to_cpu(pfindData->EndOfFile))) { + /* can not safely shrink the file size here if the + client is writing to it due to potential races */ + i_size_write(inode, le64_to_cpu(pfindData->EndOfFile)); + + /* 512 bytes (2**9) is the fake blocksize that must be + used for this calculation */ + inode->i_blocks = (512 - 1 + le64_to_cpu( + pfindData->AllocationSize)) >> 9; + } + spin_unlock(&inode->i_lock); - inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks); + inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks); - /* BB fill in uid and gid here? with help from winbind? - or retrieve from NTFS stream extended attribute */ + /* BB fill in uid and gid here? with help from winbind? + or retrieve from NTFS stream extended attribute */ #ifdef CONFIG_CIFS_EXPERIMENTAL - /* fill in 0777 bits from ACL */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - cFYI(1, ("Getting mode bits from ACL")); - acl_to_uid_mode(inode, full_path, pfid); - } + /* fill in 0777 bits from ACL */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + cFYI(1, ("Getting mode bits from ACL")); + acl_to_uid_mode(inode, full_path, pfid); + } #endif - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - /* fill in remaining high mode bits e.g. SUID, VTX */ - get_sfu_mode(inode, full_path, cifs_sb, xid); - } else if (atomic_read(&cifsInfo->inUse) == 0) { - inode->i_uid = cifs_sb->mnt_uid; - inode->i_gid = cifs_sb->mnt_gid; - /* set so we do not keep refreshing these fields with - bad data after user has changed them in memory */ - atomic_set(&cifsInfo->inUse, 1); - } - - cifs_set_ops(inode, is_dfs_referral); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + /* fill in remaining high mode bits e.g. 
SUID, VTX */ + get_sfu_mode(inode, full_path, cifs_sb, xid); + } else if (atomic_read(&cifsInfo->inUse) == 0) { + inode->i_uid = cifs_sb->mnt_uid; + inode->i_gid = cifs_sb->mnt_gid; + /* set so we do not keep refreshing these fields with + bad data after user has changed them in memory */ + atomic_set(&cifsInfo->inUse, 1); } + + cifs_set_ops(inode, is_dfs_referral); + + + + cgii_exit: kfree(buf); return rc; -- cgit v1.2.3-70-g09d2 From 5651ced3ab196b5e7dc485c5777f210aa41e2d8d Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 20 May 2008 13:02:01 +0400 Subject: Fix possible access to undefined memory region. Signed-off-by: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/dns_resolve.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 939e256f849..f730ef35499 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -134,10 +134,6 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) rkey = request_key(&key_type_dns_resolver, name, ""); if (!IS_ERR(rkey)) { data = rkey->payload.data; - cFYI(1, ("%s: resolved: %s to %s", __func__, - rkey->description, - *ip_addr - )); } else { cERROR(1, ("%s: unable to resolve: %s", __func__, name)); goto out; @@ -150,6 +146,11 @@ skip_upcall: if (*ip_addr) { memcpy(*ip_addr, data, len); (*ip_addr)[len] = '\0'; + if (!IS_ERR(rkey)) + cFYI(1, ("%s: resolved: %s to %s", __func__, + name, + *ip_addr + )); rc = 0; } else { rc = -ENOMEM; -- cgit v1.2.3-70-g09d2 From 397d71ddfda5b11b85e396d6ea822011c132b962 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 21 May 2008 03:49:46 +0000 Subject: [CIFS] Remove debug statement Signed-off-by: Steve French --- fs/cifs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1cf43e10194..00ced97bd53 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -222,7 +222,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, if (rc) { if (rc == -EREMOTE && !is_dfs_referral) { is_dfs_referral = true; - cERROR(1, ("DFS ref")); /* BB removeme BB */ + cFYI(DBG2, ("DFS ref")); /* for DFS, server does not give us real inode data */ fill_fake_finddataunix(&find_data, sb); rc = 0; -- cgit v1.2.3-70-g09d2 From 6536d2891ba2c4e837ba8478dc13bb173ed24a23 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 21 May 2008 10:45:16 -0500 Subject: JFS: skip bad iput() call in error path If jfs_iget() fails, we can't call iput() on the returned error. Thanks to Eric Sesterhenn's fuzzer testing for reporting the problem. 
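For reference, jfs_iget() follows the usual iget convention of returning either a valid inode or an ERR_PTR()-encoded errno (never NULL), so the only safe pattern is to decode the failure with PTR_ERR() and branch to a cleanup path that does not touch the inode. A minimal sketch of that pattern, illustrative only and not the verbatim jfs_fill_super() code:

        struct inode *inode;
        int ret;

        inode = jfs_iget(sb, ROOT_I);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);   /* decode the errno */
                goto out_no_rw;         /* must NOT iput() an ERR_PTR value */
        }
        /* from here on the reference is real and iput(inode) is safe */
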
Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 50ea6545173..0288e6d7936 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - goto out_no_root; + goto out_no_rw; } sb->s_root = d_alloc_root(inode); if (!sb->s_root) @@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) return 0; out_no_root: - jfs_err("jfs_read_super: get root inode failed"); - if (inode) - iput(inode); + jfs_err("jfs_read_super: get root dentry failed"); + iput(inode); out_no_rw: rc = jfs_umount(sb); -- cgit v1.2.3-70-g09d2 From 13c48c490208d9e70d8d66d56f96c5054db69af7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 May 2008 06:32:11 +0100 Subject: fix hppfs Makefile breakage Fallout from commit 46d7b522ebf486edbd096965d534cc6465e9e309 ("uml: move hppfs_kern.c to hppfs.c") Signed-off-by: Al Viro Acked-by: Jeff Dike Signed-off-by: Linus Torvalds --- fs/hppfs/Makefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile index 8a1f5034436..3a982bd975d 100644 --- a/fs/hppfs/Makefile +++ b/fs/hppfs/Makefile @@ -3,7 +3,4 @@ # Licensed under the GPL # -hppfs-objs := hppfs.o - -obj-y = -obj-$(CONFIG_HPPFS) += $(hppfs-objs) +obj-$(CONFIG_HPPFS) += hppfs.o -- cgit v1.2.3-70-g09d2 From 79bc12a0a09c2eb1ccbb01c192045f994567bda2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 May 2008 06:32:11 +0100 Subject: ecryptfs fixes memcpy() from userland pointer is a Bad Thing(tm) Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/ecryptfs/miscdev.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 6560da1a58c..50c994a249a 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -243,7 +243,6 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, struct ecryptfs_daemon *daemon; struct ecryptfs_msg_ctx *msg_ctx; size_t packet_length_size; - u32 counter_nbo; char packet_length[3]; size_t i; size_t total_length; @@ -328,20 +327,18 @@ check_list: "pending message\n", __func__, count, total_length); goto out_unlock_msg_ctx; } - i = 0; - buf[i++] = msg_ctx->type; - counter_nbo = cpu_to_be32(msg_ctx->counter); - memcpy(&buf[i], (char *)&counter_nbo, 4); - i += 4; + rc = -EFAULT; + if (put_user(msg_ctx->type, buf)) + goto out_unlock_msg_ctx; + if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1))) + goto out_unlock_msg_ctx; + i = 5; if (msg_ctx->msg) { - memcpy(&buf[i], packet_length, packet_length_size); + if (copy_to_user(&buf[i], packet_length, packet_length_size)) + goto out_unlock_msg_ctx; i += packet_length_size; - rc = copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size); - if (rc) { - printk(KERN_ERR "%s: copy_to_user returned error " - "[%d]\n", __func__, rc); + if (copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size)) goto out_unlock_msg_ctx; - } i += msg_ctx->msg_size; } rc = i; @@ -452,7 +449,8 @@ static ssize_t ecryptfs_miscdev_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - u32 counter_nbo, seq; + __be32 counter_nbo; + u32 seq; size_t packet_size, packet_size_length, i; ssize_t sz = 0; char *data; @@ -485,7 +483,7 @@ 
ecryptfs_miscdev_write(struct file *file, const char __user *buf, count); goto out_free; } - memcpy((char *)&counter_nbo, &data[i], 4); + memcpy(&counter_nbo, &data[i], 4); seq = be32_to_cpu(counter_nbo); i += 4; rc = ecryptfs_parse_packet_length(&data[i], &packet_size, -- cgit v1.2.3-70-g09d2 From 9d8df6aa9b1ca74127b11537d91de492dbea666a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 May 2008 06:32:11 +0100 Subject: ocfs2 endianness fixes Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 41f84c92094..10bfb466e06 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2788,7 +2788,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); left_rec = &el->l_recs[index]; - if (index == le16_to_cpu(el->l_next_free_rec - 1) && + if (index == le16_to_cpu(el->l_next_free_rec) - 1 && le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { /* we meet with a cross extent block merge. */ ret = ocfs2_get_right_path(inode, left_path, &right_path); @@ -2802,7 +2802,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, BUG_ON(next_free <= 0); right_rec = &right_el->l_recs[0]; if (ocfs2_is_empty_extent(right_rec)) { - BUG_ON(le16_to_cpu(next_free) <= 1); + BUG_ON(next_free <= 1); right_rec = &right_el->l_recs[1]; } -- cgit v1.2.3-70-g09d2 From 0d817bc0d6cdd92ff1ab2e98dd5878659a48659c Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 22 May 2008 02:02:03 +0000 Subject: [CIFS] Remove redundant NULL check Noticed by Coverity checker. Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7b9938445b0..9b8b4cfdf99 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2159,8 +2159,7 @@ copyRetry: cFYI(1, ("Send error in copy = %d with %d files copied", rc, le16_to_cpu(pSMBr->CopyCount))); } - if (pSMB) - cifs_buf_release(pSMB); + cifs_buf_release(pSMB); if (rc == -EAGAIN) goto copyRetry; @@ -2249,8 +2248,7 @@ createSymLinkRetry: if (rc) cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc)); - if (pSMB) - cifs_buf_release(pSMB); + cifs_buf_release(pSMB); if (rc == -EAGAIN) goto createSymLinkRetry; @@ -4095,8 +4093,7 @@ getDFSRetry: target_nodes, nls_codepage); GetDFSRefExit: - if (pSMB) - cifs_buf_release(pSMB); + cifs_buf_release(pSMB); if (rc == -EAGAIN) goto getDFSRetry; @@ -5117,8 +5114,7 @@ setPermsRetry: if (rc) cFYI(1, ("SetPathInfo (perms) returned %d", rc)); - if (pSMB) - cifs_buf_release(pSMB); + cifs_buf_release(pSMB); if (rc == -EAGAIN) goto setPermsRetry; return rc; @@ -5340,8 +5336,7 @@ QAllEAsRetry: } } } - if (pSMB) - cifs_buf_release(pSMB); + cifs_buf_release(pSMB); if (rc == -EAGAIN) goto QAllEAsRetry; @@ -5490,8 +5485,7 @@ QEARetry: } } } - if (pSMB) - cifs_buf_release(pSMB); + cifs_buf_release(pSMB); if (rc == -EAGAIN) goto QEARetry; -- cgit v1.2.3-70-g09d2 From e4058245ac0c4d9a517cda688a35aef065cb7f4e Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 2 Apr 2008 17:33:47 +0400 Subject: Adds username in the upcall key for unattended mounts with keytab Signed-off-by: Igor Mammedov Signed-off-by: Steve French --- fs/cifs/cifs_spnego.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 6653e29637a..7013aaff6ae 100644 --- 
a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -119,6 +119,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); + dp = description + strlen(description); + sprintf(dp, ";user=%s", sesInfo->userName); + cFYI(1, ("key description = %s", description)); spnego_key = request_key(&cifs_spnego_key_type, description, ""); -- cgit v1.2.3-70-g09d2 From 0a891adccc867c28b022128bc342a779e476c816 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 22 May 2008 14:20:21 +0000 Subject: [CIFS] Fix reversed memset arguments Signed-off-by: Dave Jones Signed-off-by: Steve French --- fs/cifs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 00ced97bd53..129dbfe4dca 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -172,7 +172,7 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, { struct inode *pinode = NULL; - memset(pfnd_dat, sizeof(FILE_UNIX_BASIC_INFO), 0); + memset(pfnd_dat, 0, sizeof(FILE_UNIX_BASIC_INFO)); /* __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); __le64 pfnd_dat->NumOfBytes = cpu_to_le64(0); @@ -384,7 +384,7 @@ static int get_sfu_mode(struct inode *inode, static void fill_fake_finddata(FILE_ALL_INFO *pfnd_dat, struct super_block *sb) { - memset(pfnd_dat, sizeof(FILE_ALL_INFO), 0); + memset(pfnd_dat, 0, sizeof(FILE_ALL_INFO)); /* __le64 pfnd_dat->AllocationSize = cpu_to_le64(0); __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); -- cgit v1.2.3-70-g09d2 From 978b7237123d007b9fa983af6e0e2fa8f97f9934 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 19 May 2008 16:29:46 +1000 Subject: [XFS] Fix fsync() b0rkage. xfs_fsync() fails to wait for data I/O completion before checking if the inode is dirty or clean to decide whether to log the inode or not. This misses inode size updates when the data flushed by the fsync() is extending the file. Hence, like fdatasync(), we need to wait for I/o completion first, then check the inode for cleanliness. Doing so makes the behaviour of xfs_fsync() identical for fsync and fdatasync and we *always* use synchronous semantics if the inode is dirty. Therefore also kill the differences and remove the unused flags from the xfs_fsync function and callers. SGI-PV: 981296 SGI-Modid: xfs-linux-melb:xfs-kern:31033a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_file.c | 17 ++++--- fs/xfs/linux-2.6/xfs_vnode.h | 8 ---- fs/xfs/xfs_vnodeops.c | 112 ++++++++++++++++--------------------------- fs/xfs/xfs_vnodeops.h | 3 +- 4 files changed, 54 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 65e78c13d4a..5f60363b934 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -184,19 +184,24 @@ xfs_file_release( return -xfs_release(XFS_I(inode)); } +/* + * We ignore the datasync flag here because a datasync is effectively + * identical to an fsync. That is, datasync implies that we need to write + * only the metadata needed to be able to access the data that is written + * if we crash after the call completes. Hence if we are writing beyond + * EOF we have to log the inode size change as well, which makes it a + * full fsync. If we don't write beyond EOF, the inode core will be + * clean in memory and so we don't need to log the inode, just like + * fsync. 
+ */ STATIC int xfs_file_fsync( struct file *filp, struct dentry *dentry, int datasync) { - int flags = FSYNC_WAIT; - - if (datasync) - flags |= FSYNC_DATA; xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); - return -xfs_fsync(XFS_I(dentry->d_inode), flags, - (xfs_off_t)0, (xfs_off_t)-1); + return -xfs_fsync(XFS_I(dentry->d_inode)); } /* diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 9d73cb5c0fc..25eb2a9e8d9 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -229,14 +229,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) #define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */ #define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ -/* - * Flags to vop_fsync/reclaim. - */ -#define FSYNC_NOWAIT 0 /* asynchronous flush */ -#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ -#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ -#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ - /* * Tracking vnode activity. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 70702a60b4b..e475e3717eb 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -856,18 +856,14 @@ xfs_readlink( /* * xfs_fsync * - * This is called to sync the inode and its data out to disk. - * We need to hold the I/O lock while flushing the data, and - * the inode lock while flushing the inode. The inode lock CANNOT - * be held while flushing the data, so acquire after we're done - * with that. + * This is called to sync the inode and its data out to disk. We need to hold + * the I/O lock while flushing the data, and the inode lock while flushing the + * inode. The inode lock CANNOT be held while flushing the data, so acquire + * after we're done with that. */ int xfs_fsync( - xfs_inode_t *ip, - int flag, - xfs_off_t start, - xfs_off_t stop) + xfs_inode_t *ip) { xfs_trans_t *tp; int error; @@ -875,103 +871,79 @@ xfs_fsync( xfs_itrace_entry(ip); - ASSERT(start >= 0 && stop >= -1); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return XFS_ERROR(EIO); - if (flag & FSYNC_DATA) - filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); + /* capture size updates in I/O completion before writing the inode. */ + error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); + if (error) + return XFS_ERROR(error); /* - * We always need to make sure that the required inode state - * is safe on disk. The vnode might be clean but because - * of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional - * changes to the inode core that have to go to disk. + * We always need to make sure that the required inode state is safe on + * disk. The vnode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. * - * The following code depends on one assumption: that - * any transaction that changes an inode logs the core - * because it has to change some field in the inode core - * (typically nextents or nblocks). That assumption - * implies that any transactions against an inode will - * catch any non-transactional updates. If inode-altering - * transactions exist that violate this assumption, the - * code breaks. 
Right now, it figures that if the involved - * update_* field is clear and the inode is unpinned, the - * inode is clean. Either it's been flushed or it's been - * committed and the commit has hit the disk unpinning the inode. - * (Note that xfs_inode_item_format() called at commit clears - * the update_* fields.) + * This code relies on the assumption that if the update_* fields + * of the inode are clear and the inode is unpinned then it is clean + * and no action is required. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - /* If we are flushing data then we care about update_size - * being set, otherwise we care about update_core - */ - if ((flag & FSYNC_DATA) ? - (ip->i_update_size == 0) : - (ip->i_update_core == 0)) { + if (!(ip->i_update_size || ip->i_update_core)) { /* - * Timestamps/size haven't changed since last inode - * flush or inode transaction commit. That means - * either nothing got written or a transaction - * committed which caught the updates. If the - * latter happened and the transaction hasn't - * hit the disk yet, the inode will be still - * be pinned. If it is, force the log. + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will be still be pinned. If it is, + * force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { - _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, - XFS_LOG_FORCE | - ((flag & FSYNC_WAIT) - ? XFS_LOG_SYNC : 0), + error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC, &log_flushed); } else { /* - * If the inode is not pinned and nothing - * has changed we don't need to flush the - * cache. + * If the inode is not pinned and nothing has changed + * we don't need to flush the cache. */ changed = 0; } - error = 0; } else { /* - * Kick off a transaction to log the inode - * core to get the updates. Make it - * sync if FSYNC_WAIT is passed in (which - * is done by everybody but specfs). The - * sync transaction will also force the log. + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); - if ((error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), - 0, 0, 0))) { + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); /* - * Note - it's possible that we might have pushed - * ourselves out of the way during trans_reserve - * which would flush the inode. But there's no - * guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer - * could be pinned anyway if it's part of an - * inode in another recent transaction. So we - * play it safe and fire off the transaction anyway. + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. 
*/ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flag & FSYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); error = _xfs_trans_commit(tp, 0, &log_flushed); xfs_iunlock(ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 8abe8f186e2..57335ba4ce5 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip); int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, struct cred *credp); int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start, - xfs_off_t stop); +int xfs_fsync(struct xfs_inode *ip); int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, -- cgit v1.2.3-70-g09d2 From 49383b0e98ad1f69ff4c816eb1961f703df12318 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Mon, 19 May 2008 16:29:34 +1000 Subject: [XFS] Don't allow memory reclaim to wait on the filesystem in inode writeback If we allow memory reclaim to wait on the pages under writeback in inode cluster writeback we could deadlock because we are currently holding the ILOCK on the initial writeback inode which is needed in data I/O completion to change the file size or do unwritten extent conversion before the pages are taken out of writeback state. SGI-PV: 981091 SGI-Modid: xfs-linux-melb:xfs-kern:31015a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index cf0bb9c1d62..739ea45a9d1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2986,7 +2986,7 @@ xfs_iflush_cluster( ASSERT(pag->pag_ici_init); ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *); - ilist = kmem_alloc(ilist_size, KM_MAYFAIL); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) return 0; -- cgit v1.2.3-70-g09d2 From c8f5f12e46f079a954d4f7163ba59dadee08ca26 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 20 May 2008 11:30:15 +1000 Subject: [XFS] Fix inode list allocation size in writeback. We only need to allocate space for the number of inodes in the cluster when writing back inodes, not every byte in the inode cluster. This reduces the amount of memory needing to be allocated to 256 bytes instead of 64k. 
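To see where those numbers come from, assume an 8 KB inode cluster and 256-byte on-disk inodes (so sb_inodelog = 8) on a 64-bit kernel with 8-byte pointers; these sizes are only an example, not a requirement of the change:

        /* old: one pointer-sized slot per byte of the cluster */
        ilist_size = 8192 * sizeof(xfs_inode_t *);              /* 8192 * 8 = 64 KB */

        /* new: one slot per inode that fits in the cluster */
        inodes_per_cluster = 8192 >> 8;                         /* 32 inodes */
        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); /* 32 * 8 = 256 bytes */
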
SGI-PV: 981949 SGI-Modid: xfs-linux-melb:xfs-kern:31182a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 739ea45a9d1..e569bf5d6cf 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2974,6 +2974,7 @@ xfs_iflush_cluster( xfs_mount_t *mp = ip->i_mount; xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); unsigned long first_index, mask; + unsigned long inodes_per_cluster; int ilist_size; xfs_inode_t **ilist; xfs_inode_t *iq; @@ -2985,7 +2986,8 @@ xfs_iflush_cluster( ASSERT(pag->pagi_inodeok); ASSERT(pag->pag_ici_init); - ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *); + inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) return 0; @@ -2995,8 +2997,7 @@ xfs_iflush_cluster( read_lock(&pag->pag_ici_lock); /* really need a gang lookup range call here */ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, - first_index, - XFS_INODE_CLUSTER_SIZE(mp)); + first_index, inodes_per_cluster); if (nr_found == 0) goto out_free; -- cgit v1.2.3-70-g09d2 From 6ab455eeaff6893cd06da33843e840d888cdc04a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 May 2008 16:34:42 +1000 Subject: [XFS] Fix memory corruption with small buffer reads When we have multiple buffers in a single page for a blocksize == pagesize filesystem we might overwrite the page contents if two callers hit it shortly after each other. To prevent that we need to keep the page locked until I/O is completed and the page marked uptodate. Thanks to Eric Sandeen for triaging this bug and finding a reproducible testcase and Dave Chinner for additional advice. This should fix kernel.org bz #10421. 
Tested-by: Eric Sandeen SGI-PV: 981813 SGI-Modid: xfs-linux-melb:xfs-kern:31173a Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_buf.c | 24 ++++++++++++++++++++---- fs/xfs/linux-2.6/xfs_buf.h | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 5105015a75a..98e0e86093b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -387,6 +387,8 @@ _xfs_buf_lookup_pages( if (unlikely(page == NULL)) { if (flags & XBF_READ_AHEAD) { bp->b_page_count = i; + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); return -ENOMEM; } @@ -416,17 +418,24 @@ _xfs_buf_lookup_pages( ASSERT(!PagePrivate(page)); if (!PageUptodate(page)) { page_count--; - if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) { + if (blocksize >= PAGE_CACHE_SIZE) { + if (flags & XBF_READ) + bp->b_flags |= _XBF_PAGE_LOCKED; + } else if (!PagePrivate(page)) { if (test_page_region(page, offset, nbytes)) page_count++; } } - unlock_page(page); bp->b_pages[i] = page; offset = 0; } + if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); + } + if (page_count == bp->b_page_count) bp->b_flags |= XBF_DONE; @@ -746,6 +755,7 @@ xfs_buf_associate_memory( bp->b_count_desired = len; bp->b_buffer_length = buflen; bp->b_flags |= XBF_MAPPED; + bp->b_flags &= ~_XBF_PAGE_LOCKED; return 0; } @@ -1093,8 +1103,10 @@ _xfs_buf_ioend( xfs_buf_t *bp, int schedule) { - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + bp->b_flags &= ~_XBF_PAGE_LOCKED; xfs_buf_ioend(bp, schedule); + } } STATIC void @@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io( if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); + + if (bp->b_flags & _XBF_PAGE_LOCKED) + unlock_page(page); } while (bvec >= bio->bi_io_vec); _xfs_buf_ioend(bp, 1); @@ -1163,7 +1178,8 @@ _xfs_buf_ioapply( * filesystem block size is not smaller than the page size. */ if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && - (bp->b_flags & XBF_READ) && + ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == + (XBF_READ|_XBF_PAGE_LOCKED)) && (blocksize >= PAGE_CACHE_SIZE)) { bio = bio_alloc(GFP_NOIO, 1); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 841d7883528..f948ec7ba9a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -66,6 +66,25 @@ typedef enum { _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ + + /* + * Special flag for supporting metadata blocks smaller than a FSB. + * + * In this case we can have multiple xfs_buf_t on a single page and + * need to lock out concurrent xfs_buf_t readers as they only + * serialise access to the buffer. + * + * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation + * between reads of the page. Hence we can have one thread read the + * page and modify it, but then race with another thread that thinks + * the page is not up-to-date and hence reads it again. + * + * The result is that the first modifcation to the page is lost. + * This sort of AGF/AGI reading race can happen when unlinking inodes + * that require truncation and results in the AGI unlinked list + * modifications being lost. 
+ */ + _XBF_PAGE_LOCKED = (1 << 22), } xfs_buf_flags_t; typedef enum { -- cgit v1.2.3-70-g09d2 From aaa9bbe039febf1d3a0f3a374deea0680d9f5758 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 23 May 2008 17:38:32 +0000 Subject: [CIFS] remove unused variables CC: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 3 +-- fs/cifs/cifssmb.c | 6 ++---- fs/cifs/file.c | 7 ------- fs/cifs/misc.c | 3 +-- fs/cifs/readdir.c | 6 +----- 5 files changed, 5 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 08914053242..9cfcf326ead 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -333,7 +333,6 @@ struct cifsFileInfo { bool messageMode:1; /* for pipes: message vs byte mode */ atomic_t wrtPending; /* handle in use - defer close */ struct semaphore fh_sem; /* prevents reopen race after dead ses*/ - char *search_resume_name; /* BB removeme BB */ struct cifs_search_info srch_inf; }; @@ -626,7 +625,7 @@ GLOBAL_EXTERN atomic_t tcpSesAllocCount; GLOBAL_EXTERN atomic_t tcpSesReconnectCount; GLOBAL_EXTERN atomic_t tconInfoReconnectCount; -/* Various Debug counters to remove someday (BB) */ +/* Various Debug counters */ GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ #ifdef CONFIG_CIFS_STATS2 GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 9b8b4cfdf99..174bf8aca23 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1728,7 +1728,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, { int rc = 0; LOCK_REQ *pSMB = NULL; - LOCK_RSP *pSMBr = NULL; +/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */ int bytes_returned; int timeout = 0; __u16 count; @@ -1739,8 +1739,6 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, if (rc) return rc; - pSMBr = (LOCK_RSP *)pSMB; /* BB removeme BB */ - if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { timeout = CIFS_ASYNC_OP; /* no response expected */ pSMB->Timeout = 0; @@ -1774,7 +1772,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, if (waitFlag) { rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned); + (struct smb_hdr *) pSMB, &bytes_returned); cifs_small_buf_release(pSMB); } else { rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8636cec2642..0aac824371a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -546,7 +546,6 @@ int cifs_close(struct inode *inode, struct file *file) msleep(timeout); timeout *= 8; } - kfree(pSMBFile->search_resume_name); kfree(file->private_data); file->private_data = NULL; } else @@ -605,12 +604,6 @@ int cifs_closedir(struct inode *inode, struct file *file) else cifs_buf_release(ptmp); } - ptmp = pCFileStruct->search_resume_name; - if (ptmp) { - cFYI(1, ("closedir free resume name")); - pCFileStruct->search_resume_name = NULL; - kfree(ptmp); - } kfree(file->private_data); file->private_data = NULL; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1d69b8014e0..4b17f8fe315 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -519,8 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pnotify = (struct file_notify_information *) ((char *)&pSMBr->hdr.Protocol + data_offset); cFYI(1, ("dnotify on %s Action: 0x%x", - pnotify->FileName, - pnotify->Action)); /* BB removeme BB */ + pnotify->FileName, pnotify->Action)); /* cifs_dump_mem("Rcvd notify Data: ",buf, sizeof(struct 
smb_hdr)+60); */ return true; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 713c2511019..df2c3c466ee 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -675,8 +675,6 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, cifsFile->invalidHandle = true; CIFSFindClose(xid, pTcon, cifsFile->netfid); } - kfree(cifsFile->search_resume_name); - cifsFile->search_resume_name = NULL; if (cifsFile->srch_inf.ntwrk_buf_start) { cFYI(1, ("freeing SMB ff cache buf on search rewind")); if (cifsFile->srch_inf.smallBuf) @@ -1043,9 +1041,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) } /* else { cifsFile->invalidHandle = true; CIFSFindClose(xid, pTcon, cifsFile->netfid); - } - kfree(cifsFile->search_resume_name); - cifsFile->search_resume_name = NULL; */ + } */ rc = find_cifs_entry(xid, pTcon, file, ¤t_entry, &num_to_fill); -- cgit v1.2.3-70-g09d2 From 4468eb3fd102cad559e51594a01cbc65b994d264 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 May 2008 09:31:40 -0400 Subject: on non-posix shares, clear write bits in mode when ATTR_READONLY is set When mounting a share with posix extensions disabled, cifs_get_inode_info turns off all the write bits in the mode for regular files if ATTR_READONLY is set. Directories and other inode types, however, can also have ATTR_READONLY set, but the mode gives no indication of this. This patch makes this apply to other inode types besides regular files. It also cleans up how modes are set in cifs_get_inode_info for both the "normal" and "dynperm" cases. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 76 ++++++++++++++++++++++++++----------------------------- fs/cifs/readdir.c | 71 ++++++++++++++++++++++++++++----------------------- 2 files changed, 75 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 129dbfe4dca..ae5bcaf2031 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -418,6 +418,7 @@ int cifs_get_inode_info(struct inode **pinode, char *buf = NULL; bool adjustTZ = false; bool is_dfs_referral = false; + umode_t default_mode; pTcon = cifs_sb->tcon; cFYI(1, ("Getting info on %s", full_path)); @@ -530,47 +531,42 @@ int cifs_get_inode_info(struct inode **pinode, inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj; } - /* set default mode. 
will override for dirs below */ - if (atomic_read(&cifsInfo->inUse) == 0) - /* new inode, can safely set these fields */ - inode->i_mode = cifs_sb->mnt_file_mode; - else /* since we set the inode type below we need to mask off - to avoid strange results if type changes and both - get orred in */ - inode->i_mode &= ~S_IFMT; -/* if (attr & ATTR_REPARSE) */ - /* We no longer handle these as symlinks because we could not - follow them due to the absolute path with drive letter */ - if (attr & ATTR_DIRECTORY) { - /* override default perms since we do not do byte range locking - on dirs */ - inode->i_mode = cifs_sb->mnt_dir_mode; - inode->i_mode |= S_IFDIR; - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (cifsInfo->cifsAttrs & ATTR_SYSTEM) && - /* No need to le64 convert size of zero */ - (pfindData->EndOfFile == 0)) { - inode->i_mode = cifs_sb->mnt_file_mode; - inode->i_mode |= S_IFIFO; -/* BB Finish for SFU style symlinks and devices */ - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (cifsInfo->cifsAttrs & ATTR_SYSTEM)) { - if (decode_sfu_inode(inode, le64_to_cpu(pfindData->EndOfFile), - full_path, cifs_sb, xid)) - cFYI(1, ("Unrecognized sfu inode type")); - - cFYI(1, ("sfu mode 0%o", inode->i_mode)); + /* get default inode mode */ + if (attr & ATTR_DIRECTORY) + default_mode = cifs_sb->mnt_dir_mode; + else + default_mode = cifs_sb->mnt_file_mode; + + /* set permission bits */ + if (atomic_read(&cifsInfo->inUse) == 0 || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + inode->i_mode = default_mode; + else { + /* just reenable write bits if !ATTR_READONLY */ + if ((inode->i_mode & S_IWUGO) == 0 && + (attr & ATTR_READONLY) == 0) + inode->i_mode |= (S_IWUGO & default_mode); + inode->i_mode &= ~S_IFMT; + } + /* clear write bits if ATTR_READONLY is set */ + if (attr & ATTR_READONLY) + inode->i_mode &= ~S_IWUGO; + + /* set inode type */ + if ((attr & ATTR_SYSTEM) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { + /* no need to fix endianness on 0 */ + if (pfindData->EndOfFile == 0) + inode->i_mode |= S_IFIFO; + else if (decode_sfu_inode(inode, + le64_to_cpu(pfindData->EndOfFile), + full_path, cifs_sb, xid)) + cFYI(1, ("unknown SFU file type\n")); } else { - inode->i_mode |= S_IFREG; - /* treat dos attribute of read-only as read-only mode eg 555 */ - if (cifsInfo->cifsAttrs & ATTR_READONLY) - inode->i_mode &= ~(S_IWUGO); - else if ((inode->i_mode & S_IWUGO) == 0) - /* the ATTR_READONLY flag may have been */ - /* changed on server -- set any w bits */ - /* allowed by mnt_file_mode */ - inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode); - /* BB add code to validate if device or weird share or device type? */ + if (attr & ATTR_DIRECTORY) + inode->i_mode |= S_IFDIR; + else + inode->i_mode |= S_IFREG; } spin_lock(&inode->i_lock); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index df2c3c466ee..83f30695488 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -132,6 +132,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, __u32 attr; __u64 allocation_size; __u64 end_of_file; + umode_t default_mode; /* save mtime and size */ local_mtime = tmp_inode->i_mtime; @@ -187,48 +188,54 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, if (atomic_read(&cifsInfo->inUse) == 0) { tmp_inode->i_uid = cifs_sb->mnt_uid; tmp_inode->i_gid = cifs_sb->mnt_gid; - /* set default mode. 
will override for dirs below */ - tmp_inode->i_mode = cifs_sb->mnt_file_mode; - } else { - /* mask off the type bits since it gets set - below and we do not want to get two type - bits set */ + } + + if (attr & ATTR_DIRECTORY) + default_mode = cifs_sb->mnt_dir_mode; + else + default_mode = cifs_sb->mnt_file_mode; + + /* set initial permissions */ + if ((atomic_read(&cifsInfo->inUse) == 0) || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + tmp_inode->i_mode = default_mode; + else { + /* just reenable write bits if !ATTR_READONLY */ + if ((tmp_inode->i_mode & S_IWUGO) == 0 && + (attr & ATTR_READONLY) == 0) + tmp_inode->i_mode |= (S_IWUGO & default_mode); + tmp_inode->i_mode &= ~S_IFMT; } - if (attr & ATTR_DIRECTORY) { - *pobject_type = DT_DIR; - /* override default perms since we do not lock dirs */ - if (atomic_read(&cifsInfo->inUse) == 0) - tmp_inode->i_mode = cifs_sb->mnt_dir_mode; - tmp_inode->i_mode |= S_IFDIR; - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (attr & ATTR_SYSTEM)) { + /* clear write bits if ATTR_READONLY is set */ + if (attr & ATTR_READONLY) + tmp_inode->i_mode &= ~S_IWUGO; + + /* set inode type */ + if ((attr & ATTR_SYSTEM) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { if (end_of_file == 0) { - *pobject_type = DT_FIFO; tmp_inode->i_mode |= S_IFIFO; + *pobject_type = DT_FIFO; } else { - /* rather than get the type here, we mark the - inode as needing revalidate and get the real type - (blk vs chr vs. symlink) later ie in lookup */ - *pobject_type = DT_REG; + /* + * trying to get the type can be slow, so just call + * this a regular file for now, and mark for reval + */ tmp_inode->i_mode |= S_IFREG; + *pobject_type = DT_REG; cifsInfo->time = 0; } -/* we no longer mark these because we could not follow them */ -/* } else if (attr & ATTR_REPARSE) { - *pobject_type = DT_LNK; - tmp_inode->i_mode |= S_IFLNK; */ } else { - *pobject_type = DT_REG; - tmp_inode->i_mode |= S_IFREG; - if (attr & ATTR_READONLY) - tmp_inode->i_mode &= ~(S_IWUGO); - else if ((tmp_inode->i_mode & S_IWUGO) == 0) - /* the ATTR_READONLY flag may have been changed on */ - /* server -- set any w bits allowed by mnt_file_mode */ - tmp_inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode); - } /* could add code here - to validate if device or weird share type? */ + if (attr & ATTR_DIRECTORY) { + tmp_inode->i_mode |= S_IFDIR; + *pobject_type = DT_DIR; + } else { + tmp_inode->i_mode |= S_IFREG; + *pobject_type = DT_REG; + } + } /* can not fill in nlink here as in qpathinfo version and Unx search */ if (atomic_read(&cifsInfo->inUse) == 0) -- cgit v1.2.3-70-g09d2 From b0fd30d3e7e768aad5e398caaea6ae5a5c814eab Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 May 2008 09:33:34 -0400 Subject: when creating new inodes, use file_mode/dir_mode exclusively on mount without unix extensions When CIFS creates a new inode on a mount without unix extensions, it temporarily assigns the mode that was passed to it in the create/mkdir call. Eventually, when the inode is revalidated, it changes to have the file_mode or dir_mode for the mount. This is confusing to users who expect that the mode shouldn't change this way. It's also problematic since only the mode is treated this way, not the uid or gid. 
Suppose you have a CIFS mount that's mounted with: uid=0,gid=0,file_mode=0666,dir_mode=0777 ...if an unprivileged user comes along and does this on the mount: mkdir -m 0700 foo touch foo/bar ...there is a period of time where the touch will fail, since the dir will initially be owned by root and have mode 0700. If the user waits long enough, then "foo" will be revalidated and will get the correct dir_mode permissions. This patch changes cifs_mkdir and cifs_create to not overwrite the mode found by the initial cifs_get_inode_info call after the inode is created on the server. Legacy behavior can be reenabled with the new "dynperm" mount option. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 4 +++- fs/cifs/inode.c | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f0b5b5f3dd2..fb69c1fa85c 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -260,7 +260,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, buf, inode->i_sb, xid, &fileHandle); if (newinode) { - newinode->i_mode = mode; + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; if ((oplock & CIFS_CREATE_ACTION) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ae5bcaf2031..12667d6bf30 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1015,8 +1015,11 @@ mkdir_get_info: CIFS_MOUNT_MAP_SPECIAL_CHR); } if (direntry->d_inode) { - direntry->d_inode->i_mode = mode; - direntry->d_inode->i_mode |= S_IFDIR; + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_DYNPERM) + direntry->d_inode->i_mode = + (mode | S_IFDIR); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { direntry->d_inode->i_uid = -- cgit v1.2.3-70-g09d2 From 4e94a105ed0df78e25b20ff8ed6761f5937662b1 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 23 May 2008 18:22:46 +0000 Subject: [CIFS] remove trailing whitespace Signed-off-by: Steve French --- fs/cifs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 12667d6bf30..ae6b725b366 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1019,7 +1019,7 @@ mkdir_get_info: CIFS_MOUNT_DYNPERM) direntry->d_inode->i_mode = (mode | S_IFDIR); - + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { direntry->d_inode->i_uid = -- cgit v1.2.3-70-g09d2 From 4ca691a892e8ab4f79583de1394f17a7dcfa2b57 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 May 2008 09:33:34 -0400 Subject: silently ignore ownership changes unless unix extensions are enabled or we're faking uid changes CIFS currently allows you to change the ownership of a file, but unless unix extensions are enabled this change is not passed off to the server. Have CIFS silently ignore ownership changes that can't be persistently stored on the server unless the "setuids" option is explicitly specified. We could return an error here (-EOPNOTSUPP or something), but this is how most disk-based windows filesystems on behave on Linux (e.g. VFAT, NTFS, etc). With cifsacl support and proper Windows to Unix idmapping support, we may be able to do this more properly in the future. 
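In other words, a chown/chgrp is only acted on when the server can store it (unix extensions) or when the administrator explicitly asked for client-side faking with the "setuids" mount option (CIFS_MOUNT_SET_UID); otherwise the uid/gid bits are quietly stripped from the attribute mask and the call succeeds as a no-op. A simplified restatement of the hunk below, with the intent spelled out in comments:

        if (!pTcon->unix_ext &&
            !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
                /* no way to persist the owner on the server: drop the change */
                attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
        } else {
                /* unix extensions or "setuids": honour the requested owner */
                if (attrs->ia_valid & ATTR_UID)
                        uid = attrs->ia_uid;
                if (attrs->ia_valid & ATTR_GID)
                        gid = attrs->ia_gid;
        }
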
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ae6b725b366..fe752fdb26c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1546,13 +1546,26 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) } else goto cifs_setattr_exit; } - if (attrs->ia_valid & ATTR_UID) { - cFYI(1, ("UID changed to %d", attrs->ia_uid)); - uid = attrs->ia_uid; - } - if (attrs->ia_valid & ATTR_GID) { - cFYI(1, ("GID changed to %d", attrs->ia_gid)); - gid = attrs->ia_gid; + + /* + * Without unix extensions we can't send ownership changes to the + * server, so silently ignore them. This is consistent with how + * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With + * CIFSACL support + proper Windows to Unix idmapping, we may be + * able to support this in the future. + */ + if (!pTcon->unix_ext && + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); + } else { + if (attrs->ia_valid & ATTR_UID) { + cFYI(1, ("UID changed to %d", attrs->ia_uid)); + uid = attrs->ia_uid; + } + if (attrs->ia_valid & ATTR_GID) { + cFYI(1, ("GID changed to %d", attrs->ia_gid)); + gid = attrs->ia_gid; + } } time_buf.Attributes = 0; -- cgit v1.2.3-70-g09d2 From 27adb44c4f671d15932eb0702a09d27244a8a7c1 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 23 May 2008 19:43:29 +0000 Subject: [CIFS] warn if both dynperm and cifsacl mount options specified Signed-off-by: Steve French --- fs/cifs/connect.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 023434f72c1..d49e274f8eb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2120,6 +2120,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; } + if ((volume_info.cifs_acl) && (volume_info.dynperm)) + cERROR(1, ("mount option dynperm ignored if cifsacl " + "mount option supported")); + tcon = find_unc(sin_server.sin_addr.s_addr, volume_info.UNC, volume_info.username); -- cgit v1.2.3-70-g09d2 From b7206153f61bb63ee2cffa63905b57ec01d20e6e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 23 May 2008 20:35:07 +0000 Subject: [CIFS] Correct incorrect obscure open flag Also add defines for pipe subcommand codes Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 65d58b4e6a6..0f327c224da 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -79,6 +79,19 @@ #define TRANS2_GET_DFS_REFERRAL 0x10 #define TRANS2_REPORT_DFS_INCOSISTENCY 0x11 +/* SMB Transact (Named Pipe) subcommand codes */ +#define TRANS_SET_NMPIPE_STATE 0x0001 +#define TRANS_RAW_READ_NMPIPE 0x0011 +#define TRANS_QUERY_NMPIPE_STATE 0x0021 +#define TRANS_QUERY_NMPIPE_INFO 0x0022 +#define TRANS_PEEK_NMPIPE 0x0023 +#define TRANS_TRANSACT_NMPIPE 0x0026 +#define TRANS_RAW_WRITE_NMPIPE 0x0031 +#define TRANS_READ_NMPIPE 0x0036 +#define TRANS_WRITE_NMPIPE 0x0037 +#define TRANS_WAIT_NMPIPE 0x0053 +#define TRANS_CALL_NMPIPE 0x0054 + /* NT Transact subcommand codes */ #define NT_TRANSACT_CREATE 0x01 #define NT_TRANSACT_IOCTL 0x02 @@ -328,12 +341,13 @@ #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ #define CREATE_NO_EA_KNOWLEDGE 0x00000200 #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this 
is obsolete - open for recovery flag - should - be zero */ + "open for recovery" flag - should + be zero in any case */ +#define CREATE_OPEN_FOR_RECOVERY 0x00000400 #define CREATE_RANDOM_ACCESS 0x00000800 #define CREATE_DELETE_ON_CLOSE 0x00001000 #define CREATE_OPEN_BY_ID 0x00002000 -#define CREATE_OPEN_BACKUP_INTN 0x00004000 +#define CREATE_OPEN_BACKUP_INTENT 0x00004000 #define CREATE_NO_COMPRESSION 0x00008000 #define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ #define OPEN_REPARSE_POINT 0x00200000 @@ -722,7 +736,6 @@ typedef struct smb_com_tconx_rsp_ext { #define SMB_CSC_CACHE_AUTO_REINT 0x0004 #define SMB_CSC_CACHE_VDO 0x0008 #define SMB_CSC_NO_CACHING 0x000C - #define SMB_UNIQUE_FILE_NAME 0x0010 #define SMB_EXTENDED_SIGNATURES 0x0020 @@ -806,7 +819,7 @@ typedef struct smb_com_findclose_req { #define ICOUNT_MASK 0x00FF #define PIPE_READ_MODE 0x0100 #define NAMED_PIPE_TYPE 0x0400 -#define PIPE_END_POINT 0x0800 +#define PIPE_END_POINT 0x4000 #define BLOCKING_NAMED_PIPE 0x8000 typedef struct smb_com_open_req { /* also handles create */ -- cgit v1.2.3-70-g09d2 From 03fb0bce01490c9bdedad861962c76f987531014 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 23 May 2008 13:04:19 -0700 Subject: fuse: fix bdi naming conflict Fuse allocates a separate bdi for each filesystem, and registers them in sysfs with "MAJOR:MINOR" of sb->s_dev (st_dev). This works fine for anon devices normally used by fuse, but can conflict with an already registered BDI for "fuseblk" filesystems, where sb->s_dev represents a real block device. In particularl this happens if a non-partitioned device is being mounted. Fix by registering with a different name for "fuseblk" filesystems. Thanks to Ioan Ionita for the bug report. Signed-off-by: Miklos Szeredi Reported-by: Ioan Ionita Tested-by: Ioan Ionita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-class-bdi | 4 ++++ fs/fuse/inode.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index 5ac1e01bbd4..5f500977b42 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -14,6 +14,10 @@ MAJOR:MINOR non-block filesystems which provide their own BDI, such as NFS and FUSE. +MAJOR:MINOR-fuseblk + + Value of st_dev on fuseblk filesystems. 
+ default The default backing dev, used for non-block device backed diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fb77e096213..43e99513334 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -488,7 +488,12 @@ static struct fuse_conn *new_conn(struct super_block *sb) err = bdi_init(&fc->bdi); if (err) goto error_kfree; - err = bdi_register_dev(&fc->bdi, fc->dev); + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); + } if (err) goto error_bdi_destroy; /* -- cgit v1.2.3-70-g09d2 From 71fd5179e8d1d4d503b517e0c5374f7c49540bfc Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 23 May 2008 13:04:20 -0700 Subject: ecryptfs: fix missed mutex_unlock Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index cd62d75b2cc..e2832bc7869 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1906,9 +1906,9 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, goto out; } } - mutex_unlock(&key_tfm_list_mutex); (*tfm) = key_tfm->key_tfm; (*tfm_mutex) = &key_tfm->key_tfm_mutex; out: + mutex_unlock(&key_tfm_list_mutex); return rc; } -- cgit v1.2.3-70-g09d2 From 80bfc25f42db6d4715c7688ae2352c5a8038fe7e Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Fri, 23 May 2008 13:04:22 -0700 Subject: ntfs: le*_add_cpu conversion replace all: little_endian_variable = cpu_to_leX(leX_to_cpu(little_endian_variable) + expression_in_cpu_byteorder); with: leX_add_cpu(&little_endian_variable, expression_in_cpu_byteorder); generated with semantic patch Signed-off-by: Marcin Slusarz Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/upcase.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c index 9101807dc81..e2f72ca9803 100644 --- a/fs/ntfs/upcase.c +++ b/fs/ntfs/upcase.c @@ -77,11 +77,10 @@ ntfschar *generate_default_upcase(void) uc[i] = cpu_to_le16(i); for (r = 0; uc_run_table[r][0]; r++) for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) - uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) + - uc_run_table[r][2]); + le16_add_cpu(&uc[i], uc_run_table[r][2]); for (r = 0; uc_dup_table[r][0]; r++) for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) - uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1); + le16_add_cpu(&uc[i + 1], -1); for (r = 0; uc_word_table[r][0]; r++) uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); return uc; -- cgit v1.2.3-70-g09d2 From 80119ef5c8153e0a6cc5edf00c083dc98a9bd348 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 23 May 2008 13:04:31 -0700 Subject: mm: fix atomic_t overflow in vm The atomic_t type is 32bit but a 64bit system can have more than 2^32 pages of virtual address space available. 
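(To put numbers on that: with 4 KB pages, 2^32 pages is only 16 TB, while a single x86-64 process can map on the order of 2^47 bytes = 128 TB of user virtual address space; the rough arithmetic, assuming those common values:

        2^32 pages * 4 KB/page = 2^44 bytes = 16 TB     /* limit of a 32-bit page counter */
        2^47 bytes / 4 KB/page = 2^35 pages             /* user VA reachable per process */

so a long-sized counter such as atomic_long_t is needed on 64-bit.)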
Without this we overflow on ludicrously large mappings Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 2 +- include/linux/mman.h | 4 ++-- mm/mmap.c | 4 ++-- mm/nommu.c | 4 ++-- mm/swap.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 74a323d2b85..32dc14cd890 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -139,7 +139,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, #define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); - committed = atomic_read(&vm_committed_space); + committed = atomic_long_read(&vm_committed_space); allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; diff --git a/include/linux/mman.h b/include/linux/mman.h index 87920a0852a..dab8892e6ff 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -17,14 +17,14 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -extern atomic_t vm_committed_space; +extern atomic_long_t vm_committed_space; #ifdef CONFIG_SMP extern void vm_acct_memory(long pages); #else static inline void vm_acct_memory(long pages) { - atomic_add(pages, &vm_committed_space); + atomic_long_add(pages, &vm_committed_space); } #endif diff --git a/mm/mmap.c b/mm/mmap.c index fac66337da2..669499e7c2f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -atomic_t vm_committed_space = ATOMIC_INIT(0); +atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); /* * Check that a process has enough memory to allocate a new virtual @@ -177,7 +177,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * cast `allowed' as a signed long because vm_committed_space * sometimes has a negative value */ - if (atomic_read(&vm_committed_space) < (long)allowed) + if (atomic_long_read(&vm_committed_space) < (long)allowed) return 0; error: vm_unacct_memory(pages); diff --git a/mm/nommu.c b/mm/nommu.c index ef8c62cec69..dca93fcb8b7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -39,7 +39,7 @@ struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; unsigned long askedalloc, realalloc; -atomic_t vm_committed_space = ATOMIC_INIT(0); +atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; @@ -1410,7 +1410,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * cast `allowed' as a signed long because vm_committed_space * sometimes has a negative value */ - if (atomic_read(&vm_committed_space) < (long)allowed) + if (atomic_long_read(&vm_committed_space) < (long)allowed) return 0; error: vm_unacct_memory(pages); diff --git a/mm/swap.c b/mm/swap.c index 91e194445a5..45c9f25a8a3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -503,7 +503,7 @@ void vm_acct_memory(long pages) local = &__get_cpu_var(committed_space); *local += pages; if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { - atomic_add(*local, &vm_committed_space); + atomic_long_add(*local, &vm_committed_space); *local = 0; } preempt_enable(); @@ -520,7 +520,7 @@ static int 
cpu_swap_callback(struct notifier_block *nfb, committed = &per_cpu(committed_space, (long)hcpu); if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - atomic_add(*committed, &vm_committed_space); + atomic_long_add(*committed, &vm_committed_space); *committed = 0; drain_cpu_pagevecs((long)hcpu); } -- cgit v1.2.3-70-g09d2 From c4185a0e019387f5ad6e99009804965531fa1fab Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Fri, 23 May 2008 13:04:47 -0700 Subject: proc: proc_get_inode() should get module only once Any file under /proc/net opened more than once leaked the refcounter on the module it belongs to. The problem is that module_get is called for each file opening while module_put is called only when /proc inode is destroyed. So, lets put module counter if we are dealing with already initialised inode. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=10737 Signed-off-by: Denis V. Lunev Cc: David Miller Cc: Patrick McHardy Acked-by: Pavel Emelyanov Acked-by: Robert Olsson Acked-by: Eric W. Biederman Reported-by: Roland Kletzing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6f4e8dc97da..b08d1001791 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -425,7 +425,8 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } + } else + module_put(de->owner); return inode; out_ino: -- cgit v1.2.3-70-g09d2 From 5132861a7a44498ebb18357473f8b8d4cdc70e9f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 May 2008 09:33:34 -0400 Subject: disable most mode changes on non-unix/non-cifsacl mounts CIFS currently allows you to change the mode of an inode on a share that doesn't have unix extensions enabled, and isn't using cifsacl. The inode in this case *only* has its mode changed in memory on the client. This is problematic since it can change any time the inode is purged from the cache. This patch makes cifs_setattr silently ignore most mode changes when unix extensions and cifsacl support are not enabled, and when the share is not mounted with the "dynperm" option. The exceptions are: When a mode change would remove all write access to an inode we turn on the ATTR_READONLY bit on the server and remove all write bits from the inode's mode in memory. When a mode change would add a write bit to an inode that previously had them all turned off, it turns off the ATTR_READONLY bit on the server, and resets the mode back to what it would normally be (generally, the file_mode or dir_mode of the share). 
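As an illustration of the policy described above (not the actual cifs_setattr() code: DOS_ATTR_READONLY, sketch_mode_change() and default_mode are invented stand-ins, and unix extensions, cifsacl and "dynperm" are assumed absent):

#include <sys/stat.h>

#define DOS_ATTR_READONLY 0x1	/* stand-in for the CIFS ATTR_READONLY bit */

static void sketch_mode_change(unsigned int *dos_attrs, mode_t *mode,
			       mode_t default_mode)
{
	mode_t write_bits = S_IWUSR | S_IWGRP | S_IWOTH;

	if ((*mode & write_bits) == 0 && !(*dos_attrs & DOS_ATTR_READONLY)) {
		/* all write access removed: mirror it as the read-only
		 * attribute on the server side of the sketch */
		*dos_attrs |= DOS_ATTR_READONLY;
	} else if ((*mode & write_bits) && (*dos_attrs & DOS_ATTR_READONLY)) {
		/* write access re-added to a read-only file: clear the
		 * attribute and fall back to the share's default mode */
		*dos_attrs &= ~DOS_ATTR_READONLY;
		*mode = default_mode;
	}
	/* every other in-memory mode change is silently ignored */
}
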
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fe752fdb26c..722be543cee 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1575,7 +1575,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid &= ~ATTR_MODE; if (attrs->ia_valid & ATTR_MODE) { - cFYI(1, ("Mode changed to 0x%x", attrs->ia_mode)); + cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); mode = attrs->ia_mode; } @@ -1590,18 +1590,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) #ifdef CONFIG_CIFS_EXPERIMENTAL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) rc = mode_to_acl(inode, full_path, mode); - else if ((mode & S_IWUGO) == 0) { -#else - if ((mode & S_IWUGO) == 0) { + else #endif - /* not writeable */ - if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) { - set_dosattr = true; - time_buf.Attributes = - cpu_to_le32(cifsInode->cifsAttrs | - ATTR_READONLY); - } - } else if (cifsInode->cifsAttrs & ATTR_READONLY) { + if (((mode & S_IWUGO) == 0) && + (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { + set_dosattr = true; + time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs | + ATTR_READONLY); + /* fix up mode if we're not using dynperm */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + attrs->ia_mode = inode->i_mode & ~S_IWUGO; + } else if ((mode & S_IWUGO) && + (cifsInode->cifsAttrs & ATTR_READONLY)) { /* If file is readonly on server, we would not be able to write to it - so if any write bit is enabled for user or group or other we @@ -1612,6 +1612,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) /* Windows ignores set to zero */ if (time_buf.Attributes == 0) time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); + + /* reset local inode permissions to normal */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + attrs->ia_mode &= ~(S_IALLUGO); + if (S_ISDIR(inode->i_mode)) + attrs->ia_mode |= + cifs_sb->mnt_dir_mode; + else + attrs->ia_mode |= + cifs_sb->mnt_file_mode; + } + } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + /* ignore mode change - ATTR_READONLY hasn't changed */ + attrs->ia_valid &= ~ATTR_MODE; } } -- cgit v1.2.3-70-g09d2 From a2eddfa95919a730e0e5ed17e9c303fe5ba249cd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: make /proc/stat account for all interrupts LAPIC interrupts, which don't go through the generic interrupt handling code, aren't accounted for in /proc/stat. Hence this patch adds a mechanism architectures can use to accordingly adjust the statistics. 
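To show the opt-in hook pattern this relies on in isolation, a freestanding sketch with invented names (the real hooks are arch_irq_stat() and arch_irq_stat_cpu(), with zero fallbacks defined in fs/proc/proc_misc.c):

/* "architecture" side: implement the hook and announce it via a macro */
static unsigned long long arch_extra_irqs(void)
{
	return 3;	/* e.g. NMI, spurious or thermal interrupt counts */
}
#define arch_extra_irqs arch_extra_irqs

/* "generic" side: compile against a zero fallback when no hook exists */
#ifndef arch_extra_irqs
#define arch_extra_irqs() 0
#endif

unsigned long long total_irq_count(unsigned long long generic_sum)
{
	return generic_sum + arch_extra_irqs();
}
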
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq_32.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irq_64.c | 28 ++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 9 +++++++++ include/asm-x86/hardirq.h | 6 ++++++ 4 files changed, 81 insertions(+) (limited to 'fs') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 147352df28b..468acd04aa2 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -313,16 +313,20 @@ skip: per_cpu(irq_stat,j).irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif +#ifdef CONFIG_X86_MCE seq_printf(p, "TRM: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "SPU: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); +#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); @@ -331,6 +335,40 @@ skip: return 0; } +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = nmi_count(cpu); + +#ifdef CONFIG_X86_LOCAL_APIC + sum += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += per_cpu(irq_stat, cpu).irq_resched_count; + sum += per_cpu(irq_stat, cpu).irq_call_count; + sum += per_cpu(irq_stat, cpu).irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(irq_stat, cpu).irq_thermal_count; +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += per_cpu(irq_stat, cpu).irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} + #ifdef CONFIG_HOTPLUG_CPU #include diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 3aac15466a9..1f78b238d8d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -135,6 +135,7 @@ skip: seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif +#ifdef CONFIG_X86_MCE seq_printf(p, "TRM: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); @@ -143,6 +144,7 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); +#endif seq_printf(p, "SPU: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); @@ -152,6 +154,32 @@ skip: return 0; } +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = cpu_pda(cpu)->__nmi_count; + + sum += cpu_pda(cpu)->apic_timer_irqs; +#ifdef CONFIG_SMP + sum += cpu_pda(cpu)->irq_resched_count; + sum += cpu_pda(cpu)->irq_call_count; + sum += cpu_pda(cpu)->irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += cpu_pda(cpu)->irq_thermal_count; + sum += cpu_pda(cpu)->irq_threshold_count; +#endif + sum += cpu_pda(cpu)->irq_spurious_count; + return sum; +} + +u64 arch_irq_stat(void) +{ + return atomic_read(&irq_err_count); +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 74a323d2b85..903e617bec5 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -472,6 +472,13 @@ static const struct file_operations proc_vmalloc_operations = { }; #endif 
+#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + static int show_stat(struct seq_file *p, void *v) { int i; @@ -509,7 +516,9 @@ static int show_stat(struct seq_file *p, void *v) sum += temp; per_irq_sum[j] += temp; } + sum += arch_irq_stat_cpu(i); } + sum += arch_irq_stat(); seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), diff --git a/include/asm-x86/hardirq.h b/include/asm-x86/hardirq.h index 314434d664e..000787df66e 100644 --- a/include/asm-x86/hardirq.h +++ b/include/asm-x86/hardirq.h @@ -3,3 +3,9 @@ #else # include "hardirq_64.h" #endif + +extern u64 arch_irq_stat_cpu(unsigned int cpu); +#define arch_irq_stat_cpu arch_irq_stat_cpu + +extern u64 arch_irq_stat(void); +#define arch_irq_stat arch_irq_stat -- cgit v1.2.3-70-g09d2 From 03cddb80ed2dacaf03c370d38bcc75f8303a03b8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 5 Jun 2008 20:59:29 -0400 Subject: ext4: Fix use of uninitialized data with debug enabled. Fix use of uninitialized data with debug enabled. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 873ad9b3418..c9900aade15 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2745,8 +2745,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sbi = EXT4_SB(sb); es = sbi->s_es; - ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, - gdp->bg_free_blocks_count); err = -EIO; bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); @@ -2762,6 +2760,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!gdp) goto out_err; + ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, + gdp->bg_free_blocks_count); + err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; @@ -3094,8 +3095,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { - unsigned len = ac->ac_o_ex.fe_len; - + unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); -- cgit v1.2.3-70-g09d2 From 0bf7e8379ce7e0159a2a6bd3d937f2f6ada79799 Mon Sep 17 00:00:00 2001 From: "Jose R. Santos" Date: Tue, 3 Jun 2008 14:07:29 -0400 Subject: ext4: Fix uninit block group initialization with FLEX_BG With FLEX_BG block bitmaps, inode bitmaps and inode tables _MAY_ be allocated outside the group. So, when initializing an uninitialized block bitmap, we need to check the location of this blocks before setting the corresponding bits in the block bitmap of the newly initialized group. Also return the right number of free blocks when counting the available free blocks in uninit group. Tested-by: Aneesh Kumar K.V Signed-off-by: Jose R. 
Santos Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 30494c5da84..9cc80b9cc8d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -43,6 +43,46 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } +static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + ext4_get_group_no_and_offset(sb, block, &actual_group, 0); + if (actual_group == block_group) + return 1; + return 0; +} + +static int ext4_group_used_meta_blocks(struct super_block *sb, + ext4_group_t block_group) +{ + ext4_fsblk_t tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* block bitmap, inode bitmap, and inode table blocks */ + int used_blocks = sbi->s_itb_per_group + 2; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + struct ext4_group_desc *gdp; + struct buffer_head *bh; + + gdp = ext4_get_group_desc(sb, block_group, &bh); + if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), + block_group)) + used_blocks--; + + if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), + block_group)) + used_blocks--; + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!ext4_block_in_group(sb, tmp, block_group)) + used_blocks -= 1; + } + } + return used_blocks; +} /* Initializes an uninitialized block bitmap if given, and returns the * number of blocks free in the group. */ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, @@ -105,20 +145,34 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, free_blocks = group_blocks - bit_max; if (bh) { - ext4_fsblk_t start; + ext4_fsblk_t start, tmp; + int flex_bg = 0; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); - /* Set bits for block and inode bitmaps, and inode table */ - ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data); - ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data); - for (bit = (ext4_inode_table(sb, gdp) - start), - bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!flex_bg || + ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + } /* * Also if the number of blocks within the group is * less than the blocksize * 8 ( which is the size @@ -126,8 +180,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - - return free_blocks - sbi->s_itb_per_group - 2; + return free_blocks - ext4_group_used_meta_blocks(sb, block_group); } -- cgit v1.2.3-70-g09d2 From 944600930a37aa725ba6f93c3244e2d77a1e3581 Mon Sep 17 00:00:00 
2001 From: Josef Bacik Date: Fri, 6 Jun 2008 18:05:52 -0400 Subject: ext4: fix online resize bug There is a bug when we are trying to verify that the reserve inode's double indirect blocks point back to the primary gdt blocks. The fix is obvious, we need to mod the gdb count by the addr's per block. This was verified using the same testcase as with the ext3 equivalent of this patch. Signed-off-by: Josef Bacik Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9f086a6a472..9ecb92f6854 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -563,7 +563,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ -- cgit v1.2.3-70-g09d2 From 8ea76900be3b4522396e2021260d2818a27b3a5b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 26 May 2008 10:28:09 -0400 Subject: jbd2: Fix memory leak when verifying checksums in the journal Cc: Andreas Dilger Cc: Girish Shilamkar Signed-off-by: "Theodore Ts'o" --- fs/jbd2/recovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 5d0405a9e7c..7199db52b2f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -344,6 +344,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, obh->b_size); } + put_bh(obh); } return 0; } -- cgit v1.2.3-70-g09d2 From 624080eded68738daee041ad64672a9d2614754f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 6 Jun 2008 17:50:40 -0400 Subject: jbd2: If a journal checksum error is detected, propagate the error to ext4 If a journal checksum error is detected, the ext4 filesystem will call ext4_error(), and the mount will either continue, become a read-only mount, or cause a kernel panic based on the superblock flags indicating the user's preference of what to do in case of filesystem corruption being detected. 
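A condensed sketch of the resulting mount-time policy, with invented enum names (the real code tests the errors= mount options and sets MS_RDONLY and EXT4_ERROR_FS in ext4_fill_super()):

enum errors_policy { ERRORS_CONTINUE, ERRORS_RO, ERRORS_PANIC };
enum mount_result  { MOUNT_OK, MOUNT_READONLY, MOUNT_FAILED };

enum mount_result handle_corrupt_commit(unsigned int failed_commit,
					enum errors_policy policy)
{
	if (!failed_commit)
		return MOUNT_OK;		/* journal replay was clean */

	switch (policy) {
	case ERRORS_RO:
		return MOUNT_READONLY;		/* continue, but read-only */
	case ERRORS_PANIC:
		return MOUNT_FAILED;		/* record the error, abort */
	default:
		return MOUNT_OK;		/* errors=continue: warn only */
	}
}
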
Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 23 +++++++++++++++++++++++ fs/jbd2/recovery.c | 11 ++++------- include/linux/jbd2.h | 3 +++ 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 09d9359c805..d01a32e8b50 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2189,6 +2189,29 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3; + if (!(sb->s_flags & MS_RDONLY) && + EXT4_SB(sb)->s_journal->j_failed_commit) { + printk(KERN_CRIT "EXT4-fs error (device %s): " + "ext4_fill_super: Journal transaction " + "%u is corrupt\n", sb->s_id, + EXT4_SB(sb)->s_journal->j_failed_commit); + if (test_opt (sb, ERRORS_RO)) { + printk (KERN_CRIT + "Mounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + } + if (test_opt(sb, ERRORS_PANIC)) { + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, es, 1); + printk(KERN_CRIT + "EXT4-fs (device %s): mount failed\n", + sb->s_id); + goto failed_mount4; + } + } } else if (journal_inum) { if (ext4_create_journal(sb, es, journal_inum)) goto failed_mount3; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 7199db52b2f..058f50f65b7 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -611,9 +611,8 @@ static int do_one_pass(journal_t *journal, chksum_err = chksum_seen = 0; if (info->end_transaction) { - printk(KERN_ERR "JBD: Transaction %u " - "found to be corrupt.\n", - next_commit_ID - 1); + journal->j_failed_commit = + info->end_transaction; brelse(bh); break; } @@ -644,10 +643,8 @@ static int do_one_pass(journal_t *journal, if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ - printk(KERN_ERR - "JBD: Transaction %u " - "found to be corrupt.\n", - next_commit_ID); + journal->j_failed_commit = + next_commit_ID; brelse(bh); break; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 05e2b307161..d147f0f9036 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -919,6 +919,9 @@ struct journal_s struct proc_dir_entry *j_proc_entry; struct transaction_stats_s j_stats; + /* Failed journal commit ID */ + unsigned int j_failed_commit; + /* * An opaque pointer to fs-private information. 
ext3 puts its * superblock pointer here -- cgit v1.2.3-70-g09d2 From cd0b6a39a1d68b61b1073662f40f747c8b728f98 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 26 May 2008 10:28:28 -0400 Subject: ext4: Display the journal_async_commit mount option in /proc/mounts Cc: Andreas Dilger Cc: Girish Shilamkar Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d01a32e8b50..ba92c606ad9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -731,6 +731,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) } if (test_opt(sb, BARRIER)) seq_puts(seq, ",barrier=1"); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + seq_puts(seq, ",journal_async_commit"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); if (!test_opt(sb, EXTENTS)) -- cgit v1.2.3-70-g09d2 From 034772b068be62a79470d6c1b81b01fbe27793ac Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 3 Jun 2008 22:31:11 -0400 Subject: jbd2: Fix barrier fallback code to re-lock the buffer head If the device doesn't support write barriers, the write is retried without ordered mode. But the buffer head needs to be re-locked or submit_bh will fail with on BUG(!buffer_locked(bh)). Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4d99685fdce..a2ed72f7cee 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -168,6 +168,7 @@ static int journal_submit_commit_record(journal_t *journal, spin_unlock(&journal->j_state_lock); /* And try again, without the barrier */ + lock_buffer(bh); set_buffer_uptodate(bh); set_buffer_dirty(bh); ret = submit_bh(WRITE, bh); -- cgit v1.2.3-70-g09d2 From 571640cad3fda6475da45d91cf86076f1f86bd9b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 26 May 2008 12:29:46 -0400 Subject: ext4: enable barriers by default I can't think of any valid reason for ext4 to not use barriers when they are available; I believe this is necessary for filesystem integrity in the face of a volatile write cache on storage. An administrator who trusts that the cache is sufficiently battery- backed (and power supplies are sufficiently redundant, etc...) can always turn it back off again. SuSE has carried such a patch for ext3 for quite some time now. Also document the mount option while we're at it. Signed-off-by: Eric Sandeen Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 12 ++++++++++-- fs/ext4/super.c | 11 +++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 560f88dc709..0c5086db835 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -139,8 +139,16 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata Setting it to very large values will improve performance. -barrier=1 This enables/disables barriers. barrier=0 disables - it, barrier=1 enables it. +barrier=<0|1(*)> This enables/disables the use of write barriers in + the jbd code. barrier=0 disables, barrier=1 enables. + This also requires an IO stack which can support + barriers, and if jbd gets an error on a barrier + write, it will disable again with a warning. + Write barriers enforce proper on-disk ordering + of journal commits, making volatile disk write caches + safe to use, at some performance penalty. 
If + your disks are battery-backed in one way or another, + disabling barriers may safely improve performance. orlov (*) This enables the new Orlov block allocator. It is enabled by default. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ba92c606ad9..cb96f127c36 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -671,6 +671,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) unsigned long def_mount_opts; struct super_block *sb = vfs->mnt_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); + journal_t *journal = sbi->s_journal; struct ext4_super_block *es = sbi->s_es; def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -729,8 +730,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } - if (test_opt(sb, BARRIER)) - seq_puts(seq, ",barrier=1"); + /* + * We're changing the default of barrier mount option, so + * let's always display its mount state so it's clear what its + * status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) seq_puts(seq, ",journal_async_commit"); if (test_opt(sb, NOBH)) @@ -1909,6 +1915,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_resgid = le16_to_cpu(es->s_def_resgid); set_opt(sbi->s_mount_opt, RESERVATION); + set_opt(sbi->s_mount_opt, BARRIER); /* * turn on extents feature by default in ext4 filesystem -- cgit v1.2.3-70-g09d2 From cbaffba12ce08beb3e80bfda148ee0fa14aac188 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 May 2008 20:55:42 +0400 Subject: posix timers: discard SI_TIMER signals on exec Based on Roland's patch. This approach was suggested by Austin Clements from the very beginning, and then by Linus. As Austin pointed out, the execing task can be killed by SI_TIMER signal because exec flushes the signal handlers, but doesn't discard the pending signals generated by posix timers. Perhaps not a bug, but people find this surprising. 
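A userspace illustration of the scenario, with arbitrary signal and exec target (build with -lrt on older glibc); it arms a signal-delivering POSIX timer and immediately execs, and with this change any SI_TIMER signal already queued is discarded rather than delivered to the unsuspecting new image:

#include <signal.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGRTMIN,	/* no handler survives exec */
	};
	struct itimerspec its = {
		.it_value.tv_nsec = 1,		/* expire almost immediately */
	};

	if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == 0)
		timer_settime(tid, 0, &its, NULL);

	/* exec flushes signal handlers, but the already-queued SI_TIMER
	 * signal used to survive and could kill the new program */
	execl("/bin/true", "true", (char *)NULL);
	return 1;
}
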
See http://bugzilla.kernel.org/show_bug.cgi?id=10460 Signed-off-by: Oleg Nesterov Cc: Austin Clements Cc: Roland McGrath Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + include/linux/sched.h | 2 ++ kernel/signal.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 3c2ba7ce11d..9448f1b50b4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -860,6 +860,7 @@ static int de_thread(struct task_struct *tsk) no_thread_group: exit_itimers(sig); + flush_itimer_signals(); if (leader) release_task(leader); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5395a6176f4..3e05e547474 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1848,7 +1848,9 @@ extern void exit_thread(void); extern void exit_files(struct task_struct *); extern void __cleanup_signal(struct signal_struct *); extern void __cleanup_sighand(struct sighand_struct *); + extern void exit_itimers(struct signal_struct *); +extern void flush_itimer_signals(void); extern NORET_TYPE void do_group_exit(int); diff --git a/kernel/signal.c b/kernel/signal.c index 2955f6c4f36..6c0958e52ea 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -231,6 +231,40 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +static void __flush_itimer_signals(struct sigpending *pending) +{ + sigset_t signal, retain; + struct sigqueue *q, *n; + + signal = pending->signal; + sigemptyset(&retain); + + list_for_each_entry_safe(q, n, &pending->list, list) { + int sig = q->info.si_signo; + + if (likely(q->info.si_code != SI_TIMER)) { + sigaddset(&retain, sig); + } else { + sigdelset(&signal, sig); + list_del_init(&q->list); + __sigqueue_free(q); + } + } + + sigorsets(&pending->signal, &signal, &retain); +} + +void flush_itimer_signals(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + __flush_itimer_signals(&tsk->pending); + __flush_itimer_signals(&tsk->signal->shared_pending); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); +} + void ignore_signals(struct task_struct *t) { int i; -- cgit v1.2.3-70-g09d2 From a82c53a0e3f57f02782330372b7adad67b417645 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 9 May 2008 13:28:36 +0200 Subject: splice: fix sendfile() issue with relay Splice isn't always incrementing the ppos correctly, which broke relay splice. 
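A userspace sketch of the contract the fix restores for sendfile()-style callers (descriptor setup omitted, names invented): the kernel advances the caller's offset by exactly the number of bytes it reports, even on a short transfer, so no manual bookkeeping is needed:

#include <sys/types.h>
#include <sys/sendfile.h>
#include <stdio.h>

static int copy_with_sendfile(int out_fd, int in_fd, size_t total)
{
	off_t off = 0;

	while ((size_t)off < total) {
		ssize_t n = sendfile(out_fd, in_fd, &off, total - (size_t)off);
		if (n <= 0)
			return -1;		/* error or unexpected EOF */
		/* 'off' has already moved forward by exactly n bytes */
		printf("copied %zd, offset now %lld\n", n, (long long)off);
	}
	return 0;
}
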
Signed-off-by: Tom Zanussi Tested-by: Dan Williams Signed-off-by: Jens Axboe --- fs/splice.c | 12 ++++++++---- kernel/relay.c | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 78150038b58..a048ad2130c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -983,7 +983,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, while (len) { size_t read_len; - loff_t pos = sd->pos; + loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) @@ -998,15 +998,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); - if (unlikely(ret <= 0)) + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; goto out_release; + } bytes += ret; len -= ret; sd->pos = pos; - if (ret < read_len) + if (ret < read_len) { + sd->pos = prev_pos + ret; goto out_release; + } } done: @@ -1072,7 +1076,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos += ret; + *ppos = sd.pos; return ret; } diff --git a/kernel/relay.c b/kernel/relay.c index bc24dcdc570..7de644cdec4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len) { + while (len && !spliced) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; -- cgit v1.2.3-70-g09d2 From ca39d651d17df49b6d11f851d56c0ce0ce01ea1a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 May 2008 21:27:41 +0200 Subject: splice: handle try_to_release_page() failure splice currently assumes that try_to_release_page() always suceeds, but it can return failure. If it does, we cannot steal the page. Acked-by: Mingming Cao --- fs/splice.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index a048ad2130c..aa5f6f60b30 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -58,8 +58,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page)) - try_to_release_page(page, GFP_KERNEL); + if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag @@ -75,6 +75,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ +out_unlock: unlock_page(page); return 1; } -- cgit v1.2.3-70-g09d2 From 3c65e8743bf8b5cf0f90e8d767bf1d8b50c14c76 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 28 May 2008 08:58:56 -0500 Subject: JFS: diAlloc() should return -EIO rather than EIO The comment above the function says one of its return value is -EIO, and also the caller of diAlloc() checks for -EIO: struct inode *ialloc(struct inode *parent, umode_t mode) { ... rc = diAlloc(parent, S_ISDIR(mode), inode); if (rc) { jfs_warn("ialloc: diAlloc returned %d!", rc); if (rc == -EIO) make_bad_inode(inode); ... 
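A standalone reminder of the convention this one-liner restores, using an invented function name: in-kernel interfaces report failure as a negative errno, so a caller testing rc == -EIO never matches a positive EIO:

#include <errno.h>
#include <stdio.h>

static int alloc_thing(int fail)
{
	return fail ? -EIO : 0;		/* negative errno on failure */
}

int main(void)
{
	int rc = alloc_thing(1);

	if (rc == -EIO)			/* matches only the -EIO form */
		puts("caller marks the inode bad");
	else if (rc > 0)
		puts("a positive errno silently skips the error path");
	return 0;
}
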
Signed-off-by: Li Zefan Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_imap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 734ec916bea..d6363d8309d 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) jfs_error(ip->i_sb, "diAlloc: can't find free bit " "in wmap"); - return EIO; + return -EIO; } /* determine the inode number within the -- cgit v1.2.3-70-g09d2 From a12630b186d56a77d17c9b34c82b88dda4337ed7 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 9 May 2008 18:49:29 -0700 Subject: ocfs2: Rename 'user_stack' plugin structure to 'ocfs2_user_plugin' The static structure describing the userspace cluster plugin for ocfs2 was named 'user_stack', which is a real pain when people are grep(1)ing the tree for the program stack object 'user_stack'. Change the name to something distinct and namespaced. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/stack_user.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index b503772cd0e..6b97d11f6bf 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -61,7 +61,7 @@ * negotiated by the client. The client negotiates based on the maximum * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major * number from the "SETV" message must match - * user_stack.sp_proto->lp_max_version.pv_major, and the minor number + * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number * must be less than or equal to ...->lp_max_version.pv_minor. * * Once this information has been set, mounts will be allowed. 
From this @@ -153,7 +153,7 @@ union ocfs2_control_message { struct ocfs2_control_message_down u_down; }; -static struct ocfs2_stack_plugin user_stack; +static struct ocfs2_stack_plugin ocfs2_user_plugin; static atomic_t ocfs2_control_opened; static int ocfs2_control_this_node = -1; @@ -399,7 +399,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file, char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = - &user_stack.sp_proto->lp_max_version; + &ocfs2_user_plugin.sp_proto->lp_max_version; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_PROTOCOL) @@ -680,7 +680,7 @@ static void fsdlm_lock_ast_wrapper(void *astarg) struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); int status = lksb->sb_status; - BUG_ON(user_stack.sp_proto == NULL); + BUG_ON(ocfs2_user_plugin.sp_proto == NULL); /* * For now we're punting on the issue of other non-standard errors @@ -693,16 +693,16 @@ static void fsdlm_lock_ast_wrapper(void *astarg) */ if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) - user_stack.sp_proto->lp_unlock_ast(astarg, 0); + ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); else - user_stack.sp_proto->lp_lock_ast(astarg); + ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); } static void fsdlm_blocking_ast_wrapper(void *astarg, int level) { - BUG_ON(user_stack.sp_proto == NULL); + BUG_ON(ocfs2_user_plugin.sp_proto == NULL); - user_stack.sp_proto->lp_blocking_ast(astarg, level); + ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); } static int user_dlm_lock(struct ocfs2_cluster_connection *conn, @@ -838,7 +838,7 @@ static int user_cluster_this_node(unsigned int *this_node) return 0; } -static struct ocfs2_stack_operations user_stack_ops = { +static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .connect = user_cluster_connect, .disconnect = user_cluster_disconnect, .this_node = user_cluster_this_node, @@ -849,20 +849,20 @@ static struct ocfs2_stack_operations user_stack_ops = { .dump_lksb = user_dlm_dump_lksb, }; -static struct ocfs2_stack_plugin user_stack = { +static struct ocfs2_stack_plugin ocfs2_user_plugin = { .sp_name = "user", - .sp_ops = &user_stack_ops, + .sp_ops = &ocfs2_user_plugin_ops, .sp_owner = THIS_MODULE, }; -static int __init user_stack_init(void) +static int __init ocfs2_user_plugin_init(void) { int rc; rc = ocfs2_control_init(); if (!rc) { - rc = ocfs2_stack_glue_register(&user_stack); + rc = ocfs2_stack_glue_register(&ocfs2_user_plugin); if (rc) ocfs2_control_exit(); } @@ -870,14 +870,14 @@ static int __init user_stack_init(void) return rc; } -static void __exit user_stack_exit(void) +static void __exit ocfs2_user_plugin_exit(void) { - ocfs2_stack_glue_unregister(&user_stack); + ocfs2_stack_glue_unregister(&ocfs2_user_plugin); ocfs2_control_exit(); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks"); MODULE_LICENSE("GPL"); -module_init(user_stack_init); -module_exit(user_stack_exit); +module_init(ocfs2_user_plugin_init); +module_exit(ocfs2_user_plugin_exit); -- cgit v1.2.3-70-g09d2 From 271d772d02507c7541d5e6b4938ed2380e59a39a Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 12 May 2008 18:31:35 -0700 Subject: [PATCH 1/3] ocfs2/net: Silence build warnings This patch silences the build warnings concerning o2net_debugfs_init() and friends when building without CONFIG_DEBUG_FS enabled. 
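The header pattern these warning fixes apply, shrunk to a made-up subsystem for illustration: a plain static stub defined in a header is emitted, and warned about as defined-but-unused, in every file that includes it, while a static inline stub compiles away silently:

/* foo_debug.h -- illustrative header, not the ocfs2 one */
#ifndef FOO_DEBUG_H
#define FOO_DEBUG_H

#ifdef CONFIG_DEBUG_FS
int foo_debugfs_init(void);
void foo_debugfs_exit(void);
#else
static inline int foo_debugfs_init(void)
{
	return 0;
}
static inline void foo_debugfs_exit(void)
{
}
#endif /* CONFIG_DEBUG_FS */

#endif /* FOO_DEBUG_H */
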
Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/tcp.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index a705d5d1903..fd6179eb26d 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -128,23 +128,23 @@ void o2net_debug_del_nst(struct o2net_send_tracking *nst); void o2net_debug_add_sc(struct o2net_sock_container *sc); void o2net_debug_del_sc(struct o2net_sock_container *sc); #else -static int o2net_debugfs_init(void) +static inline int o2net_debugfs_init(void) { return 0; } -static void o2net_debugfs_exit(void) +static inline void o2net_debugfs_exit(void) { } -static void o2net_debug_add_nst(struct o2net_send_tracking *nst) +static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst) { } -static void o2net_debug_del_nst(struct o2net_send_tracking *nst) +static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst) { } -static void o2net_debug_add_sc(struct o2net_sock_container *sc) +static inline void o2net_debug_add_sc(struct o2net_sock_container *sc) { } -static void o2net_debug_del_sc(struct o2net_sock_container *sc) +static inline void o2net_debug_del_sc(struct o2net_sock_container *sc) { } #endif /* CONFIG_DEBUG_FS */ -- cgit v1.2.3-70-g09d2 From 959040c37a8cae8117907d4aed87f1b01ff1ea19 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 12 May 2008 18:31:36 -0700 Subject: [PATCH 2/3] ocfs2/dlm: Silence build warnings This patch silences the build warnings concerning dlm_debug_init() and friends when building without CONFIG_DEBUG_FS enabled. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmdebug.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index d34a62a3a62..8c686d22f9c 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -60,25 +60,25 @@ void dlm_destroy_debugfs_root(void); #else -static int dlm_debug_init(struct dlm_ctxt *dlm) +static inline int dlm_debug_init(struct dlm_ctxt *dlm) { return 0; } -static void dlm_debug_shutdown(struct dlm_ctxt *dlm) +static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) { } -static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { return 0; } -static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } -static int dlm_create_debugfs_root(void) +static inline int dlm_create_debugfs_root(void) { return 0; } -static void dlm_destroy_debugfs_root(void) +static inline void dlm_destroy_debugfs_root(void) { } -- cgit v1.2.3-70-g09d2 From 0f475b2abed6cbccee1da20a0bef2895eb2a0edd Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 12 May 2008 18:31:37 -0700 Subject: [PATCH 3/3] ocfs2/net: Silence build warnings This patch silences the build warnings concerning o2net_init_nst() and friends when building without CONFIG_DEBUG_FS enabled. 
Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/tcp.c | 28 +++++++++------------------- fs/ocfs2/cluster/tcp_internal.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1e44ad14881..a27d61581bd 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -142,53 +142,43 @@ static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); -static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ #ifdef CONFIG_DEBUG_FS +void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ INIT_LIST_HEAD(&nst->st_net_debug_item); nst->st_task = task; nst->st_msg_type = msgtype; nst->st_msg_key = msgkey; nst->st_node = node; -#endif } -static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { -#ifdef CONFIG_DEBUG_FS do_gettimeofday(&nst->st_sock_time); -#endif } -static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { -#ifdef CONFIG_DEBUG_FS do_gettimeofday(&nst->st_send_time); -#endif } -static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { -#ifdef CONFIG_DEBUG_FS do_gettimeofday(&nst->st_status_time); -#endif } -static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, struct o2net_sock_container *sc) { -#ifdef CONFIG_DEBUG_FS nst->st_sc = sc; -#endif } -static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) { -#ifdef CONFIG_DEBUG_FS nst->st_id = msg_id; -#endif } +#endif /* CONFIG_DEBUG_FS */ static inline int o2net_reconnect_delay(void) { diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 8d58cfe410b..18307ff81b7 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -224,10 +224,42 @@ struct o2net_send_tracking { struct timeval st_send_time; struct timeval st_status_time; }; + +void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node); +void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); +void o2net_set_nst_send_time(struct o2net_send_tracking *nst); +void o2net_set_nst_status_time(struct o2net_send_tracking *nst); +void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc); +void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); + #else struct o2net_send_tracking { u32 dummy; }; + +static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ +} +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ +} +static inline 
void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ +} #endif /* CONFIG_DEBUG_FS */ #endif /* O2CLUSTER_TCP_INTERNAL_H */ -- cgit v1.2.3-70-g09d2 From ca05a99a54db1db5bca72eccb5866d2a86f8517f Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Tue, 27 May 2008 22:05:17 -0700 Subject: capabilities: remain source compatible with 32-bit raw legacy capability support. Source code out there hard-codes a notion of what the _LINUX_CAPABILITY_VERSION #define means in terms of the semantics of the raw capability system calls capget() and capset(). Its unfortunate, but true. Since the confusing header file has been in a released kernel, there is software that is erroneously using 64-bit capabilities with the semantics of 32-bit compatibilities. These recently compiled programs may suffer corruption of their memory when sys_getcap() overwrites more memory than they are coded to expect, and the raising of added capabilities when using sys_capset(). As such, this patch does a number of things to clean up the situation for all. It 1. forces the _LINUX_CAPABILITY_VERSION define to always retain its legacy value. 2. adopts a new #define strategy for the kernel's internal implementation of the preferred magic. 3. deprecates v2 capability magic in favor of a new (v3) magic number. The functionality of v3 is entirely equivalent to v2, the only difference being that the v2 magic causes the kernel to log a "deprecated" warning so the admin can find applications that may be using v2 inappropriately. [User space code continues to be encouraged to use the libcap API which protects the application from details like this. libcap-2.10 is the first to support v3 capabilities.] Fixes issue reported in https://bugzilla.redhat.com/show_bug.cgi?id=447518. Thanks to Bojan Smojver for the report. [akpm@linux-foundation.org: s/depreciate/deprecate/g] [akpm@linux-foundation.org: be robust about put_user size] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew G. Morgan Cc: Serge E. 
Hallyn Cc: Bojan Smojver Cc: stable@kernel.org Signed-off-by: Andrew Morton Signed-off-by: Chris Wright --- fs/proc/array.c | 2 +- include/linux/capability.h | 29 ++++++++---- kernel/capability.c | 111 +++++++++++++++++++++++++++++---------------- 3 files changed, 95 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 9e3b8c33c24..797d775e035 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -288,7 +288,7 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_printf(m, "%s", header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", - a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]); + a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); } seq_printf(m, "\n"); } diff --git a/include/linux/capability.h b/include/linux/capability.h index f4ea0dd9a61..fa830f8de03 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -31,11 +31,11 @@ struct task_struct; #define _LINUX_CAPABILITY_VERSION_1 0x19980330 #define _LINUX_CAPABILITY_U32S_1 1 -#define _LINUX_CAPABILITY_VERSION_2 0x20071026 +#define _LINUX_CAPABILITY_VERSION_2 0x20071026 /* deprecated - use v3 */ #define _LINUX_CAPABILITY_U32S_2 2 -#define _LINUX_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_2 -#define _LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_2 +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 typedef struct __user_cap_header_struct { __u32 version; @@ -77,10 +77,23 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; -#ifdef __KERNEL__ +#ifndef __KERNEL__ + +/* + * Backwardly compatible definition for source code - trapped in a + * 32-bit world. If you find you need this, please consider using + * libcap to untrap yourself... + */ +#define _LINUX_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_1 +#define _LINUX_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_1 + +#else + +#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 +#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 typedef struct kernel_cap_struct { - __u32 cap[_LINUX_CAPABILITY_U32S]; + __u32 cap[_KERNEL_CAPABILITY_U32S]; } kernel_cap_t; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) @@ -351,7 +364,7 @@ typedef struct kernel_cap_struct { */ #define CAP_FOR_EACH_U32(__capi) \ - for (__capi = 0; __capi < _LINUX_CAPABILITY_U32S; ++__capi) + for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi) # define CAP_FS_MASK_B0 (CAP_TO_MASK(CAP_CHOWN) \ | CAP_TO_MASK(CAP_DAC_OVERRIDE) \ @@ -361,7 +374,7 @@ typedef struct kernel_cap_struct { # define CAP_FS_MASK_B1 (CAP_TO_MASK(CAP_MAC_OVERRIDE)) -#if _LINUX_CAPABILITY_U32S != 2 +#if _KERNEL_CAPABILITY_U32S != 2 # error Fix up hand-coded capability macro initializers #else /* HAND-CODED capability initializers */ @@ -372,7 +385,7 @@ typedef struct kernel_cap_struct { # define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0|CAP_TO_MASK(CAP_SYS_RESOURCE), \ CAP_FS_MASK_B1 } }) -#endif /* _LINUX_CAPABILITY_U32S != 2 */ +#endif /* _KERNEL_CAPABILITY_U32S != 2 */ #define CAP_INIT_INH_SET CAP_EMPTY_SET diff --git a/kernel/capability.c b/kernel/capability.c index 39e8193b41e..cfbe4429948 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -52,6 +52,69 @@ static void warn_legacy_capability_use(void) } } +/* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. 
As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. + */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + /* * For sys_getproccap() and sys_setproccap(), any of the three * capability set pointers may be NULL -- indicating that that set is @@ -71,27 +134,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - __u32 version; struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -118,7 +167,7 @@ out: spin_unlock(&task_capability_lock); if (!ret) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { @@ -128,7 +177,7 @@ out: } /* - * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S, + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. 
This * has the effect of making older libcap * implementations implicitly drop upper capability @@ -240,30 +289,16 @@ static inline int cap_set_all(kernel_cap_t *effective, */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - __u32 version; struct task_struct *target; int ret; pid_t pid; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -281,7 +316,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } - while (i < _LINUX_CAPABILITY_U32S) { + while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; -- cgit v1.2.3-70-g09d2 From 1d92cfd54a51ff1b9593019fdde56793b66ba6a9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Jun 2008 10:59:02 +0100 Subject: cifs endianness fixes __le16 fields used as host-endian. Signed-off-by: Al Viro Acked-by: Steve French Signed-off-by: Linus Torvalds --- fs/cifs/cifssmb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 9b8b4cfdf99..fb655b4593c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3927,9 +3927,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, } ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); - if (ref->VersionNumber != 3) { + if (ref->VersionNumber != cpu_to_le16(3)) { cERROR(1, ("Referrals of V%d version are not supported," - "should be V3", ref->VersionNumber)); + "should be V3", le16_to_cpu(ref->VersionNumber))); rc = -EINVAL; goto parse_DFS_referrals_exit; } @@ -3977,7 +3977,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, if (rc) goto parse_DFS_referrals_exit; - ref += ref->Size; + ref += le16_to_cpu(ref->Size); } parse_DFS_referrals_exit: -- cgit v1.2.3-70-g09d2 From ddb2c43594f22843e9f3153da151deaba1a834c5 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 4 Jun 2008 09:16:33 -0700 Subject: asn1: additional sanity checking during BER decoding - Don't trust a length which is greater than the working buffer. An invalid length could cause overflow when calculating buffer size for decoding oid. - An oid length of zero is invalid and allows for an off-by-one error when decoding oid because the first subid actually encodes first 2 subids. - A primitive encoding may not have an indefinite length. Thanks to Wei Wang from McAfee for report. 
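The first two added checks restated as freestanding helpers for illustration (struct and function names are invented; the real code operates on struct asn1_ctx in both decoders):

#include <stddef.h>
#include <limits.h>

struct ber_buf {
	const unsigned char *pointer;	/* current read position */
	const unsigned char *end;	/* one past the last valid byte */
};

/* A decoded length must not exceed what is left in the input buffer. */
static int ber_len_sane(const struct ber_buf *b, unsigned int len)
{
	return len <= (size_t)(b->end - b->pointer);
}

/* An OID must hold at least two sub-ids (the first byte encodes two) and
 * must not overflow the size calculation for the decode array. */
static int ber_oid_size_sane(size_t size)
{
	return size >= 2 && size <= ULONG_MAX / sizeof(unsigned long);
}
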
Cc: Steven French Cc: stable@kernel.org Acked-by: Patrick McHardy Signed-off-by: Chris Wright Signed-off-by: Linus Torvalds --- fs/cifs/asn1.c | 14 ++++++++++++++ net/ipv4/netfilter/nf_nat_snmp_basic.c | 14 ++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'fs') diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cb52cbbe45f..f58e41d3ba4 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -186,6 +186,11 @@ asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len) } } } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + return 1; } @@ -203,6 +208,10 @@ asn1_header_decode(struct asn1_ctx *ctx, if (!asn1_length_decode(ctx, &def, &len)) return 0; + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + if (def) *eoc = ctx->pointer + len; else @@ -389,6 +398,11 @@ asn1_oid_decode(struct asn1_ctx *ctx, unsigned long *optr; size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) return 0; diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 5daefad3d19..7750c97fde7 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -232,6 +232,11 @@ static unsigned char asn1_length_decode(struct asn1_ctx *ctx, } } } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + return 1; } @@ -250,6 +255,10 @@ static unsigned char asn1_header_decode(struct asn1_ctx *ctx, if (!asn1_length_decode(ctx, &def, &len)) return 0; + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + if (def) *eoc = ctx->pointer + len; else @@ -434,6 +443,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, unsigned long *optr; size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) { if (net_ratelimit()) -- cgit v1.2.3-70-g09d2 From b8c141e8fd80fa64d80c6a74492053f25a28e0ea Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 5 Jun 2008 22:45:55 -0700 Subject: frv: don't offer BINFMT_FLAT Fix the following compile error: CC fs/binfmt_flat.o In file included from /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:36: /home/bunk/linux/kernel-2.6/git/linux-2.6/include/linux/flat.h:14:22: error: asm/flat.h: No such file or directory /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c: In function 'create_flat_tables': /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:124: error: implicit declaration of function 'flat_stack_align' /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:125: error: implicit declaration of function 'flat_argvp_envp_on_stack' /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c: In function 'calc_reloc': /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:347: error: implicit declaration of function 'flat_reloc_valid' /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c: In function 'load_flat_file': /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:479: error: implicit declaration of function 'flat_old_ram_flag' 
/home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:755: error: implicit declaration of function 'flat_set_persistent' /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:757: error: implicit declaration of function 'flat_get_relocate_addr' /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:765: error: implicit declaration of function 'flat_get_addr_from_rp' /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/binfmt_flat.c:781: error: implicit declaration of function 'flat_put_addr_at_rp' Reported-by: Adrian Bunk Signed-off-by: Adrian Bunk Tested-by: David Howells Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig.binfmt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 55e8ee1900a..3263084eef9 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU + depends on !MMU && (!FRV || BROKEN) help Support uClinux FLAT format binaries. -- cgit v1.2.3-70-g09d2 From d3e49afbb66109613c3474f2273f5830ac2dcb09 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Thu, 5 Jun 2008 22:46:02 -0700 Subject: eCryptfs: remove unnecessary page decrypt call The page decrypt calls in ecryptfs_write() are both pointless and buggy. Pointless because ecryptfs_get_locked_page() has already brought the page up to date, and buggy because prior mmap writes will just be blown away by the decrypt call. This patch also removes the declaration of a now-nonexistent function ecryptfs_write_zeros(). Thanks to Eric Sandeen and David Kleikamp for helping to track this down. Eric said: fsx w/ mmap dies quickly ( < 100 ops) without this, and survives nicely (to millions of ops+) with it in place. 
Signed-off-by: Michael Halcrow Cc: Eric Sandeen Cc: Dave Kleikamp Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/ecryptfs_kernel.h | 2 -- fs/ecryptfs/read_write.c | 22 ---------------------- 2 files changed, 24 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 951ee33a022..c15c25745e0 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -660,8 +660,6 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, struct ecryptfs_auth_tok **auth_tok, char *sig); -int ecryptfs_write_zeros(struct file *file, pgoff_t index, int start, - int num_zeros); int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size); int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index ebf55150be5..75c2ea9fee3 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -157,20 +157,6 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - if (start_offset_in_page) { - /* Read in the page from the lower - * into the eCryptfs inode page cache, - * decrypting */ - rc = ecryptfs_decrypt_page(ecryptfs_page); - if (rc) { - printk(KERN_ERR "%s: Error decrypting " - "page; rc = [%d]\n", - __func__, rc); - ClearPageUptodate(ecryptfs_page); - page_cache_release(ecryptfs_page); - goto out; - } - } ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); /* @@ -349,14 +335,6 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, ecryptfs_page_idx, rc); goto out; } - rc = ecryptfs_decrypt_page(ecryptfs_page); - if (rc) { - printk(KERN_ERR "%s: Error decrypting " - "page; rc = [%d]\n", __func__, rc); - ClearPageUptodate(ecryptfs_page); - page_cache_release(ecryptfs_page); - goto out; - } ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); memcpy((data + data_offset), ((char *)ecryptfs_page_virt + start_offset_in_page), -- cgit v1.2.3-70-g09d2 From 44d1b980c72db0faf35adb082fb2208351803028 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 Jun 2008 22:46:18 -0700 Subject: Fix various old email addresses for dwmw2 Although if people have questions about ARCnet, perhaps it's _better_ for them to be mailing dwmw2@cam.ac.uk about it... Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/networking/arcnet.txt | 2 +- arch/frv/kernel/cmode.S | 2 +- arch/frv/kernel/sleep.S | 2 +- arch/frv/mb93090-mb00/pci-dma-nommu.c | 2 +- drivers/mtd/redboot.c | 2 +- fs/afs/callback.c | 2 +- fs/afs/inode.c | 2 +- fs/afs/super.c | 2 +- include/linux/mtd/nand.h | 2 +- include/linux/tty.h | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/Documentation/networking/arcnet.txt b/Documentation/networking/arcnet.txt index 770fc41a78e..79601254038 100644 --- a/Documentation/networking/arcnet.txt +++ b/Documentation/networking/arcnet.txt @@ -46,7 +46,7 @@ These are the ARCnet drivers for Linux. This new release (2.91) has been put together by David Woodhouse -, in an attempt to tidy up the driver after adding support +, in an attempt to tidy up the driver after adding support for yet another chipset. Now the generic support has been separated from the individual chipset drivers, and the source files aren't quite so packed with #ifdefs! 
I've changed this file a bit, but kept it in the first person from diff --git a/arch/frv/kernel/cmode.S b/arch/frv/kernel/cmode.S index 81ba28ad220..53deeb5d7e8 100644 --- a/arch/frv/kernel/cmode.S +++ b/arch/frv/kernel/cmode.S @@ -1,7 +1,7 @@ /* cmode.S: clock mode management * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Woodhouse (dwmw2@redhat.com) + * Written by David Woodhouse (dwmw2@infradead.org) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/arch/frv/kernel/sleep.S b/arch/frv/kernel/sleep.S index c9b2d51ab9a..f67bf73cd2c 100644 --- a/arch/frv/kernel/sleep.S +++ b/arch/frv/kernel/sleep.S @@ -1,7 +1,7 @@ /* sleep.S: power saving mode entry * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Woodhouse (dwmw2@redhat.com) + * Written by David Woodhouse (dwmw2@infradead.org) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c index 4985466b1a7..64ee58d748b 100644 --- a/arch/frv/mb93090-mb00/pci-dma-nommu.c +++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c @@ -1,7 +1,7 @@ /* pci-dma-nommu.c: Dynamic DMA mapping support for the FRV * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Woodhouse (dwmw2@redhat.com) + * Written by David Woodhouse (dwmw2@infradead.org) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/drivers/mtd/redboot.c b/drivers/mtd/redboot.c index 47474903263..c5030f94f04 100644 --- a/drivers/mtd/redboot.c +++ b/drivers/mtd/redboot.c @@ -295,5 +295,5 @@ module_init(redboot_parser_init); module_exit(redboot_parser_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Red Hat, Inc. - David Woodhouse "); +MODULE_AUTHOR("David Woodhouse "); MODULE_DESCRIPTION("Parsing code for RedBoot Flash Image System (FIS) tables"); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index a78d5b236bb..587ef5123cd 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -8,7 +8,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Authors: David Woodhouse + * Authors: David Woodhouse * David Howells * */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 08db82e1343..bb47217f6a1 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -8,7 +8,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Authors: David Woodhouse + * Authors: David Woodhouse * David Howells * */ diff --git a/fs/afs/super.c b/fs/afs/super.c index 4b572b801d8..7e3faeef681 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -10,7 +10,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Authors: David Howells - * David Woodhouse + * David Woodhouse * */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index c42bc7f533a..53ea3dc8b0e 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1,7 +1,7 @@ /* * linux/include/linux/mtd/nand.h * - * Copyright (c) 2000 David Woodhouse + * Copyright (c) 2000 David Woodhouse * Steven J. 
Hill * Thomas Gleixner * diff --git a/include/linux/tty.h b/include/linux/tty.h index 7f7121f9c96..324a3b231d4 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -36,7 +36,7 @@ #define N_6PACK 7 #define N_MASC 8 /* Reserved for Mobitex module */ #define N_R3964 9 /* Reserved for Simatic R3964 module */ -#define N_PROFIBUS_FDL 10 /* Reserved for Profibus */ +#define N_PROFIBUS_FDL 10 /* Reserved for Profibus */ #define N_IRDA 11 /* Linux IrDa - http://irda.sourceforge.net/ */ #define N_SMSBLOCK 12 /* SMS block mode - for talking to GSM data */ /* cards about SMS messages */ -- cgit v1.2.3-70-g09d2 From 93b071139a956e51c98cdefd50a47981a4eb852e Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 5 Jun 2008 22:46:21 -0700 Subject: introduce memory_read_from_buffer() This patch introduces memory_read_from_buffer(). The only difference between memory_read_from_buffer() and simple_read_from_buffer() is which address space the function copies to. simple_read_from_buffer copies to user space memory. memory_read_from_buffer copies to normal memory. Signed-off-by: Akinobu Mita Cc: Al Viro Cc: Doug Warzecha Cc: Zhang Rui Cc: Matt Domsch Cc: Abhay Salunke Cc: Greg Kroah-Hartman Cc: Markus Rechberger Cc: Kay Sievers Cc: Bob Moore Cc: Thomas Renninger Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: "Antonino A. Daplas" Cc: Krzysztof Helt Cc: Geert Uytterhoeven Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Peter Oberparleiter Cc: Michael Holzheu Cc: Brian King Cc: James E.J. Bottomley Cc: Andrew Vasquez Cc: Seokmann Ju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 18 ++++++++++++++++++ include/linux/fs.h | 5 ++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index b004dfadd89..892d41cb338 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -528,6 +528,23 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, return count; } +ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + + if (pos < 0) + return -EINVAL; + if (pos >= available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy(to, from + pos, count); + *ppos = pos + count; + + return count; +} + /* * Transaction based IO. 
* The file expects a single write which triggers the transaction, and then @@ -800,6 +817,7 @@ EXPORT_SYMBOL(simple_statfs); EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); +EXPORT_SYMBOL(memory_read_from_buffer); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index f413085f748..d490779f18d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2000,7 +2000,10 @@ extern int simple_fill_super(struct super_block *, int, struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); -extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); +extern ssize_t simple_read_from_buffer(void __user *to, size_t count, + loff_t *ppos, const void *from, size_t available); +extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, -- cgit v1.2.3-70-g09d2 From 7db9cfd380205f6b50afdc3bc3619f876a5eaf0d Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 5 Jun 2008 22:46:27 -0700 Subject: devscgroup: check for device permissions at mount time Currently even if a task sits in an all-denied cgroup it can still mount any block device in any mode it wants. Put a proper check in do_open for block device to prevent this. Signed-off-by: Pavel Emelyanov Acked-by: Serge Hallyn Tested-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 7d822fae776..470c10ceb0f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -928,9 +929,14 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) { struct module *owner = NULL; struct gendisk *disk; - int ret = -ENXIO; + int ret; int part; + ret = devcgroup_inode_permission(bdev->bd_inode, file->f_mode); + if (ret != 0) + return ret; + + ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; lock_kernel(); disk = get_gendisk(bdev->bd_dev, &part); -- cgit v1.2.3-70-g09d2 From aae8679b0ebcaa92f99c1c3cb0cd651594a43915 Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Thu, 5 Jun 2008 22:46:31 -0700 Subject: pagemap: fix bug in add_to_pagemap, require aligned-length reads of /proc/pid/pagemap Fix a bug in add_to_pagemap. Previously, since pm->out was a char *, put_user was only copying 1 byte of every PFN, resulting in the top 7 bytes of each PFN not being copied. By requiring that reads be a multiple of 8 bytes, I can make pm->out and pm->end u64*s instead of char*s, which makes put_user work properly, and also simplifies the logic in add_to_pagemap a bit. 
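For illustration only (not part of the patch), a minimal userspace reader that follows the new rule: the file offset and the read length are both multiples of the 8-byte entry size.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>

int main(int argc, char **argv)
{
	uint64_t entry;
	unsigned long vaddr;
	off_t offset;
	int fd;

	if (argc < 2)
		return 1;
	vaddr = strtoul(argv[1], NULL, 0);
	/* one 8-byte entry per virtual page, so index by page number */
	offset = (off_t)(vaddr / getpagesize()) * sizeof(entry);

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;
	/* aligned offset and an 8-byte length: accepted after this change */
	if (pread(fd, &entry, sizeof(entry), offset) != (ssize_t)sizeof(entry)) {
		close(fd);
		return 1;
	}
	printf("pagemap entry for %#lx: %#llx\n", vaddr,
	       (unsigned long long)entry);
	close(fd);
	return 0;
}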
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Thomas Tuttle Cc: Matt Mackall Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 88717c0f941..17403629e33 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -496,7 +496,7 @@ const struct file_operations proc_clear_refs_operations = { }; struct pagemapread { - char __user *out, *end; + u64 __user *out, *end; }; #define PM_ENTRY_BYTES sizeof(u64) @@ -519,21 +519,11 @@ struct pagemapread { static int add_to_pagemap(unsigned long addr, u64 pfn, struct pagemapread *pm) { - /* - * Make sure there's room in the buffer for an - * entire entry. Otherwise, only copy part of - * the pfn. - */ - if (pm->out + PM_ENTRY_BYTES >= pm->end) { - if (copy_to_user(pm->out, &pfn, pm->end - pm->out)) - return -EFAULT; - pm->out = pm->end; - return PM_END_OF_BUFFER; - } - if (put_user(pfn, pm->out)) return -EFAULT; - pm->out += PM_ENTRY_BYTES; + pm->out++; + if (pm->out >= pm->end) + return PM_END_OF_BUFFER; return 0; } @@ -634,7 +624,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, ret = -EINVAL; /* file position must be aligned */ - if (*ppos % PM_ENTRY_BYTES) + if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_task; ret = 0; @@ -664,8 +654,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_pages; } - pm.out = buf; - pm.end = buf + count; + pm.out = (u64 *)buf; + pm.end = (u64 *)(buf + count); if (!ptrace_may_attach(task)) { ret = -EIO; @@ -690,9 +680,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (ret == PM_END_OF_BUFFER) ret = 0; /* don't need mmap_sem for these, but this looks cleaner */ - *ppos += pm.out - buf; + *ppos += (char *)pm.out - buf; if (!ret) - ret = pm.out - buf; + ret = (char *)pm.out - buf; } out_pages: -- cgit v1.2.3-70-g09d2 From d100d148aa48df3b6ad526a48624f906695efe60 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Jun 2008 22:46:46 -0700 Subject: nommu: fix ksize() abuse The nommu binfmt code uses ksize() for pointers returned from do_mmap() which is wrong. This converts the call-sites to use the nommu specific kobjsize() function which works as expected. 
Cc: Christoph Lameter Cc: Matt Mackall Acked-by: Paul Mundt Acked-by: David Howells Signed-off-by: Pekka Enberg Acked-by: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 2 +- fs/binfmt_flat.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index ddd35d87339..d051a32e627 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -390,7 +390,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, } /* expand the stack mapping to use up the entire allocation granule */ - fullsize = ksize((char *) current->mm->start_brk); + fullsize = kobjsize((char *) current->mm->start_brk); if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, fullsize, 0, 0))) stack_size = fullsize; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 3b40d45a3a1..2cb1acda3a8 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -548,7 +548,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); /* Remap to use all availabe slack region space */ if (realdatastart && (realdatastart < (unsigned long)-4096)) { - reallen = ksize((void *)realdatastart); + reallen = kobjsize((void *)realdatastart); if (reallen > len) { realdatastart = do_mremap(realdatastart, len, reallen, MREMAP_FIXED, realdatastart); @@ -600,7 +600,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); /* Remap to use all availabe slack region space */ if (textpos && (textpos < (unsigned long) -4096)) { - reallen = ksize((void *)textpos); + reallen = kobjsize((void *)textpos); if (reallen > len) { textpos = do_mremap(textpos, len, reallen, MREMAP_FIXED, textpos); @@ -683,7 +683,7 @@ static int load_flat_file(struct linux_binprm * bprm, */ current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; - current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; + current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; } if (flags & FLAT_FLAG_KTRACE) @@ -790,7 +790,7 @@ static int load_flat_file(struct linux_binprm * bprm, /* zero the BSS, BRK and stack areas */ memset((void*)(datapos + data_len), 0, bss_len + - (memp + ksize((void *) memp) - stack_len - /* end brk */ + (memp + kobjsize((void *) memp) - stack_len - /* end brk */ libinfo->lib_list[id].start_brk) + /* start brk */ stack_len); -- cgit v1.2.3-70-g09d2 From 9bb91784de6618c955994b2d5be332fb68c87ef1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 5 Jun 2008 22:46:47 -0700 Subject: ext3: fix online resize bug There is a bug when we are trying to verify that the reserve inode's double indirect blocks point back to the primary gdt blocks. The fix is obvious, we need to mod the gdb count by the addr's per block. You can verify this with the following test case dd if=/dev/zero of=disk1 seek=1024 count=1 bs=100M losetup /dev/loop1 disk1 pvcreate /dev/loop1 vgcreate loopvg1 /dev/loop1 lvcreate -l 100%VG loopvg1 -n looplv1 mkfs.ext3 -J size=64 -b 1024 /dev/loopvg1/looplv1 mount /dev/loopvg1/looplv1 /mnt/loop dd if=/dev/zero of=disk2 seek=1024 count=1 bs=50M losetup /dev/loop2 disk2 pvcreate /dev/loop2 vgextend loopvg1 /dev/loop2 lvextend -l 100%VG /dev/loopvg1/looplv1 resize2fs /dev/loopvg1/looplv1 without this patch the resize2fs fails, with it the resize2fs succeeds. 
Signed-off-by: Josef Bacik Acked-by: Andreas Dilger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 28cfd0b4052..77278e947e9 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -580,7 +580,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % + EXT3_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ -- cgit v1.2.3-70-g09d2 From aed5417593ad125283f35513573282139a8664b5 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 5 Jun 2008 22:46:53 -0700 Subject: proc: calculate the correct /proc/ link count This patch: commit e9720acd728a46cb40daa52c99a979f7c4ff195c Author: Pavel Emelyanov Date: Fri Mar 7 11:08:40 2008 -0800 [NET]: Make /proc/net a symlink on /proc/self/net (v3) introduced a /proc/self/net directory without bumping the corresponding link count for /proc/self. This patch replaces the static link count initializations with a call that counts the number of directory entries in the given pid_entry table whenever it is instantiated, and thus relieves the burden of manually keeping the two in sync. [akpm@linux-foundation.org: cleanup] Acked-by: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: Vegard Nossum Cc: "David S. Miller" Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index c447e0743a3..3b455371e7f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -127,6 +127,25 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = &proc_##OTYPE } ) +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. 
+ */ +static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 0; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + int maps_protect; EXPORT_SYMBOL(maps_protect); @@ -2585,10 +2604,9 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 5; -#ifdef CONFIG_SECURITY - inode->i_nlink += 1; -#endif + + inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff)); dentry->d_op = &pid_dentry_operations; @@ -2816,10 +2834,9 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 4; -#ifdef CONFIG_SECURITY - inode->i_nlink += 1; -#endif + + inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff)); dentry->d_op = &pid_dentry_operations; -- cgit v1.2.3-70-g09d2 From bbcdac0c20aa20d1daad41d9c138102b70e5aae4 Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Thu, 5 Jun 2008 22:46:58 -0700 Subject: pagemap: return map count, not reference count, in /proc/kpagecount Since pagemap is all about examining pages mapped into processes' memory spaces, it makes sense for kpagecount to return the map counts, not the reference counts. Signed-off-by: Thomas Tuttle Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 32dc14cd890..5a16090a6d6 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -726,7 +726,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, if (!ppage) pcount = 0; else - pcount = atomic_read(&ppage->_count); + pcount = page_mapcount(ppage); if (put_user(pcount, out++)) { ret = -EFAULT; -- cgit v1.2.3-70-g09d2 From 4710d1ac4c491dd8a28f57946214c0b5fe73cc87 Mon Sep 17 00:00:00 2001 From: Thomas Tuttle Date: Thu, 5 Jun 2008 22:46:58 -0700 Subject: pagemap: return EINVAL, not EIO, for unaligned reads of kpagecount or kpageflags If the user tries to read from a position that is not a multiple of 8, or read a number of bytes that is not a multiple of 8, they have passed an invalid argument to read, for the purpose of reading these files. It's not an IO error because we didn't encounter any trouble finding the data they asked for. 
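A minimal userspace demonstration of the new behavior (illustrative only; /proc/kpagecount is normally readable by root):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	uint64_t count;
	int fd = open("/proc/kpagecount", O_RDONLY);

	if (fd < 0)
		return 1;
	/* entries are 8 bytes wide: an aligned 8-byte read succeeds */
	if (pread(fd, &count, sizeof(count), 0) == (ssize_t)sizeof(count))
		printf("pfn 0 map count: %llu\n", (unsigned long long)count);
	/* a 3-byte read is a malformed request: EINVAL now, not EIO */
	if (pread(fd, &count, 3, 0) < 0)
		printf("unaligned read: %s\n", strerror(errno));
	close(fd);
	return 0;
}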
Signed-off-by: Thomas Tuttle Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 5a16090a6d6..7e277f2ad46 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -716,7 +716,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, pfn = src / KPMSIZE; count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) - return -EIO; + return -EINVAL; while (count > 0) { ppage = NULL; @@ -782,7 +782,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, pfn = src / KPMSIZE; count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) - return -EIO; + return -EINVAL; while (count > 0) { ppage = NULL; -- cgit v1.2.3-70-g09d2 From aab2545fdd6641b76af0ae96456c4ca9d1e50dad Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2008 11:31:39 -0700 Subject: uml: activate_mm: remove the dead PF_BORROWED_MM check use_mm() was changed to use switch_mm() instead of activate_mm(), since then nobody calls (and nobody should call) activate_mm() with PF_BORROWED_MM bit set. As Jeff Dike pointed out, we can also remove the "old != new" check, it is always true. Signed-off-by: Oleg Nesterov Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 4 ---- include/asm-um/mmu_context.h | 12 +++--------- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index b5253e77eb2..0fb3117ddd9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -591,10 +591,6 @@ static void use_mm(struct mm_struct *mm) atomic_inc(&mm->mm_count); tsk->mm = mm; tsk->active_mm = mm; - /* - * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise - * it won't work. Update it accordingly if you change it here - */ switch_mm(active_mm, mm, tsk); task_unlock(tsk); diff --git a/include/asm-um/mmu_context.h b/include/asm-um/mmu_context.h index 6686fc524ca..54f42e8b010 100644 --- a/include/asm-um/mmu_context.h +++ b/include/asm-um/mmu_context.h @@ -22,16 +22,10 @@ extern void force_flush_all(void); static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) { /* - * This is called by fs/exec.c and fs/aio.c. In the first case, for an - * exec, we don't need to do anything as we're called from userspace - * and thus going to use a new host PID. In the second, we're called - * from a kernel thread, and thus need to go doing the mmap's on the - * host. Since they're very expensive, we want to avoid that as far as - * possible. + * This is called by fs/exec.c and sys_unshare() + * when the new ->mm is used for the first time. */ - if (old != new && (current->flags & PF_BORROWED_MM)) - __switch_mm(&new->context.id); - + __switch_mm(&new->context.id); arch_dup_mmap(old, new); } -- cgit v1.2.3-70-g09d2 From ec1aef33668448718fcba79e4e981592bfd7e0a3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 10 Jun 2008 15:12:58 -0500 Subject: jfs: remove DIRENTSIZ After fat gets fixed the unused DIRENTSIZ macro was the last user of struct dirent we should get rid of since the kernel and userspace versions differed. 
Signed-off-by: Adrian Bunk Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dtree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index cdac2d5bafe..2545bb31723 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -243,9 +243,6 @@ typedef union { #define JFS_REMOVE 3 #define JFS_RENAME 4 -#define DIRENTSIZ(namlen) \ - ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 ) - /* * Maximum file offset for directories. */ -- cgit v1.2.3-70-g09d2 From dbdbb87636e882042cbe53d5d4eac94206f8db83 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 10 Jun 2008 21:21:56 +0000 Subject: [CIFS] Fix hang in mount when negprot causes server to kill tcp session Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/CHANGES | 5 +++++ fs/cifs/connect.c | 1 + 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 28e3d5c5fca..1f3465201fd 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -2,6 +2,11 @@ Version 1.53 ------------ DFS support added (Microsoft Distributed File System client support needed for referrals which enable a hierarchical name space among servers). +Disable temporary caching of mode bits to servers which do not support +storing of mode (e.g. Windows servers, when client mounts without cifsacl +mount option) and add new "dynperm" mount option to enable temporary caching +of mode (enable old behavior). Fix hang on mount caused when server crashes +tcp session during negotiate protocol. Version 1.52 ------------ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d49e274f8eb..e8fa46c7cff 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -653,6 +653,7 @@ multi_t2_fnd: spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); + wake_up_all(&server->response_q); /* don't exit until kthread_stop is called */ set_current_state(TASK_UNINTERRUPTIBLE); -- cgit v1.2.3-70-g09d2 From 79ee9a8b2d328243488fee8b55bfacc822049a2a Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 10 Jun 2008 21:37:02 +0000 Subject: [CIFS] cifs: fix oops on mount when CONFIG_CIFS_DFS_UPCALL is enabled simple "mount -t cifs //xxx /mnt" oopsed on strlen of options http://kerneloops.org/guilty.php?guilty=cifs_get_sb&version=2.6.25-release&start=16711 \ 68&end=1703935&class=oops Signed-off-by: Marcin Slusarz Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5df93fd6303..86b4d5f405a 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -97,9 +97,6 @@ cifs_read_super(struct super_block *sb, void *data, { struct inode *inode; struct cifs_sb_info *cifs_sb; -#ifdef CONFIG_CIFS_DFS_UPCALL - int len; -#endif int rc = 0; /* BB should we make this contingent on mount parm? 
*/ @@ -117,15 +114,17 @@ cifs_read_super(struct super_block *sb, void *data, * complex operation (mount), and in case of fail * just exit instead of doing mount and attempting * undo it if this copy fails?*/ - len = strlen(data); - cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); - if (cifs_sb->mountdata == NULL) { - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; - return -ENOMEM; + if (data) { + int len = strlen(data); + cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + return -ENOMEM; + } + strncpy(cifs_sb->mountdata, data, len + 1); + cifs_sb->mountdata[len] = '\0'; } - strncpy(cifs_sb->mountdata, data, len + 1); - cifs_sb->mountdata[len] = '\0'; #endif rc = cifs_mount(sb, cifs_sb, data, devname); -- cgit v1.2.3-70-g09d2 From 2d518f84e5ecd1d71df0e6ac5176d212f68c27ce Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 12 Jun 2008 15:21:28 -0700 Subject: fat: relax the permission check of fat_setattr() New chmod() allows only acceptable permission, and if not acceptable, it returns -EPERM. Old one allows even if it can't store permission to on disk inode. But it seems too strict for users. E.g. https://bugzilla.redhat.com/show_bug.cgi?id=449080: With new one, rsync couldn't create the temporary file. So, this patch allows like old one, but now it doesn't change the permission if it can't store, and it returns 0. Also, this patch fixes missing check. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/file.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index 27cc1164ec3..771326b8047 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -257,26 +257,34 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) } EXPORT_SYMBOL_GPL(fat_getattr); -static int fat_check_mode(const struct msdos_sb_info *sbi, struct inode *inode, - mode_t mode) +static int fat_sanitize_mode(const struct msdos_sb_info *sbi, + struct inode *inode, umode_t *mode_ptr) { - mode_t mask, req = mode & ~S_IFMT; + mode_t mask, perm; - if (S_ISREG(mode)) + /* + * Note, the basic check is already done by a caller of + * (attr->ia_mode & ~MSDOS_VALID_MODE) + */ + + if (S_ISREG(inode->i_mode)) mask = sbi->options.fs_fmask; else mask = sbi->options.fs_dmask; + perm = *mode_ptr & ~(S_IFMT | mask); + /* * Of the r and x bits, all (subject to umask) must be present. Of the * w bits, either all (subject to umask) or none must be present. 
*/ - req &= ~mask; - if ((req & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) + if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) return -EPERM; - if ((req & S_IWUGO) && ((req & S_IWUGO) != (S_IWUGO & ~mask))) + if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) return -EPERM; + *mode_ptr &= S_IFMT | perm; + return 0; } @@ -299,7 +307,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); struct inode *inode = dentry->d_inode; - int mask, error = 0; + int error = 0; unsigned int ia_valid; lock_kernel(); @@ -332,12 +340,13 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) error = 0; goto out; } + if (((attr->ia_valid & ATTR_UID) && (attr->ia_uid != sbi->options.fs_uid)) || ((attr->ia_valid & ATTR_GID) && (attr->ia_gid != sbi->options.fs_gid)) || ((attr->ia_valid & ATTR_MODE) && - fat_check_mode(sbi, inode, attr->ia_mode) < 0)) + (attr->ia_mode & ~MSDOS_VALID_MODE))) error = -EPERM; if (error) { @@ -346,15 +355,16 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) goto out; } - error = inode_setattr(inode, attr); - if (error) - goto out; + /* + * We don't return -EPERM here. Yes, strange, but this is too + * old behavior. + */ + if (attr->ia_valid & ATTR_MODE) { + if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) + attr->ia_valid &= ~ATTR_MODE; + } - if (S_ISDIR(inode->i_mode)) - mask = sbi->options.fs_dmask; - else - mask = sbi->options.fs_fmask; - inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask); + error = inode_setattr(inode, attr); out: unlock_kernel(); return error; -- cgit v1.2.3-70-g09d2 From 2165009bdf63f79716a36ad545df14c3cdf958b7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 12 Jun 2008 15:21:47 -0700 Subject: pagemap: pass mm into pagewalkers We need this at least for huge page detection for now, because powerpc needs the vm_area_struct to be able to determine whether a virtual address is referring to a huge page (its pmd_huge() doesn't work). It might also come in handy for some of the other users. 
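To make the reworked calling convention concrete, here is a minimal sketch of a walker under the new interface. The callback and walk_page_range() signatures are the ones introduced below; the walker itself (present_count, count_pte, count_present) is hypothetical, and the caller is assumed to hold mmap_sem as the existing users do.

struct present_count {
	unsigned long pages;
};

static int count_pte(pte_t *pte, unsigned long addr, unsigned long end,
		     struct mm_walk *walk)
{
	struct present_count *pc = walk->private;	/* caller-supplied state */

	if (pte_present(*pte))
		pc->pages++;
	return 0;
}

static unsigned long count_present(struct vm_area_struct *vma)
{
	struct present_count pc = { 0 };
	struct mm_walk walk = {
		.pte_entry	= count_pte,
		.mm		= vma->vm_mm,	/* the mm now travels in the walk */
		.private	= &pc,
	};

	walk_page_range(vma->vm_start, vma->vm_end, &walk);
	return pc.pages;
}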
Signed-off-by: Dave Hansen Acked-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 44 +++++++++++++++++++++++++------------------- include/linux/mm.h | 17 +++++++++-------- mm/pagewalk.c | 42 ++++++++++++++++++++++-------------------- 3 files changed, 56 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 17403629e33..f0df3109343 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -315,9 +315,9 @@ struct mem_size_stats { }; static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct mem_size_stats *mss = private; + struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; pte_t *pte, ptent; spinlock_t *ptl; @@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } -static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; - static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss; int ret; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; memset(&mss, 0, sizeof mss); mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, - &smaps_walk, &mss); + walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); ret = show_map(m, v); if (ret) @@ -426,9 +428,9 @@ const struct file_operations proc_smaps_operations = { }; static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, void *private) + unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = private; + struct vm_area_struct *vma = walk->private; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + static struct mm_walk clear_refs_walk; + memset(&clear_refs_walk, 0, sizeof(clear_refs_walk)); + clear_refs_walk.pmd_entry = clear_refs_pte_range; + clear_refs_walk.mm = mm; down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for (vma = mm->mmap; vma; vma = vma->vm_next) { + clear_refs_walk.private = vma; if (!is_vm_hugetlb_page(vma)) - walk_page_range(mm, vma->vm_start, vma->vm_end, - &clear_refs_walk, vma); + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); + } flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -528,9 +534,9 @@ static int add_to_pagemap(unsigned long addr, u64 pfn, } static int pagemap_pte_hole(unsigned long start, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; for (addr = start; addr < end; addr += PAGE_SIZE) { @@ -548,9 +554,9 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) } static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; @@ -675,8 +681,8 @@ static ssize_t pagemap_read(struct 
file *file, char __user *buf, * user buffer is tracked in "pm", and the walk * will stop when we hit the end of the buffer. */ - ret = walk_page_range(mm, start_vaddr, end_vaddr, - &pagemap_walk, &pm); + ret = walk_page_range(start_vaddr, end_vaddr, + &pagemap_walk); if (ret == PM_END_OF_BUFFER) ret = 0; /* don't need mmap_sem for these, but this looks cleaner */ diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30..586a943cab0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -760,16 +760,17 @@ unsigned long unmap_vmas(struct mmu_gather **tlb, * (see walk_page_range for more details) */ struct mm_walk { - int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *); - int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *); - int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *); - int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *); - int (*pte_hole)(unsigned long, unsigned long, void *); + int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); + int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); + struct mm_struct *mm; + void *private; }; -int walk_page_range(const struct mm_struct *, unsigned long addr, - unsigned long end, const struct mm_walk *walk, - void *private); +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk); void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 0afd2387e50..d5878bed784 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,14 +3,14 @@ #include static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pte_t *pte; int err = 0; pte = pte_offset_map(pmd, addr); for (;;) { - err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; addr += PAGE_SIZE; @@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pmd_t *pmd; unsigned long next; @@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, private); + err = walk->pmd_entry(pmd, addr, next, walk); if (!err && walk->pte_entry) - err = walk_pte_range(pmd, addr, next, walk, private); + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); @@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, } static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) + struct mm_walk *walk) { pud_t *pud; unsigned long next; @@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, 
unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pud_entry) - err = walk->pud_entry(pud, addr, next, private); + err = walk->pud_entry(pud, addr, next, walk); if (!err && (walk->pmd_entry || walk->pte_entry)) - err = walk_pmd_range(pud, addr, next, walk, private); + err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); @@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree - * @private: private data passed to the callback function * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, * etc.). If lower-level callbacks are omitted, walking depth is reduced. * - * Each callback receives an entry pointer, the start and end of the - * associated range, and a caller-supplied private data pointer. + * Each callback receives an entry pointer and the start and end of the + * associated range, and a copy of the original mm_walk for access to + * the ->private or ->mm fields. * * No locks are taken, but the bottom level iterator will map PTE * directories from highmem if necessary. @@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. */ -int walk_page_range(const struct mm_struct *mm, - unsigned long addr, unsigned long end, - const struct mm_walk *walk, void *private) +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) { pgd_t *pgd; unsigned long next; @@ -112,21 +111,24 @@ int walk_page_range(const struct mm_struct *mm, if (addr >= end) return err; - pgd = pgd_offset(mm, addr); + if (!walk->mm) + return -EINVAL; + + pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) - err = walk->pte_hole(addr, next, private); + err = walk->pte_hole(addr, next, walk); if (err) break; continue; } if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, private); + err = walk->pgd_entry(pgd, addr, next, walk); if (!err && (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) - err = walk_pud_range(pgd, addr, next, walk, private); + err = walk_pud_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); -- cgit v1.2.3-70-g09d2 From bcf8039ed45f56013c4afea5520bca7d909e5e61 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 12 Jun 2008 15:21:48 -0700 Subject: pagemap: fix large pages in pagemap We were walking right into huge page areas in the pagemap walker, and calling the pmds pmd_bad() and clearing them. That leaked huge pages. Bad. This patch at least works around that for now. It ignores huge pages in the pagemap walker for the time being, and won't leak those pages. 
Signed-off-by: Dave Hansen Acked-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0df3109343..ab8ccc9d14f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -553,24 +553,45 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); } +static unsigned long pte_to_pagemap_entry(pte_t pte) +{ + unsigned long pme = 0; + if (is_swap_pte(pte)) + pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; + else if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} + static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { + struct vm_area_struct *vma; struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; + /* find the first VMA at or above 'addr' */ + vma = find_vma(walk->mm, addr); for (; addr != end; addr += PAGE_SIZE) { u64 pfn = PM_NOT_PRESENT; - pte = pte_offset_map(pmd, addr); - if (is_swap_pte(*pte)) - pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; - else if (pte_present(*pte)) - pfn = PM_PFRAME(pte_pfn(*pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - /* unmap so we're not in atomic when we copy to userspace */ - pte_unmap(pte); + + /* check to see if we've left 'vma' behind + * and need a new, higher one */ + if (vma && (addr >= vma->vm_end)) + vma = find_vma(walk->mm, addr); + + /* check that 'vma' actually covers this address, + * and that it isn't a huge page vma */ + if (vma && (vma->vm_start <= addr) && + !is_vm_hugetlb_page(vma)) { + pte = pte_offset_map(pmd, addr); + pfn = pte_to_pagemap_entry(*pte); + /* unmap before userspace copy */ + pte_unmap(pte); + } err = add_to_pagemap(addr, pfn, pm); if (err) return err; -- cgit v1.2.3-70-g09d2 From e4f3ec063421bdbcb93330e72aa3eeedb6a0d85a Mon Sep 17 00:00:00 2001 From: Paul Collins Date: Sat, 14 Jun 2008 14:14:59 +1200 Subject: udf: restore UDFFS_DEBUG to being undefined by default Commit 706047a79725b585cf272fdefc234b31b6545c72, "udf: Fix compilation warnings when UDF debug is on" inadvertently (I assume) enabled debugging messages by default for UDF. This patch disables them again. Signed-off-by: Paul Collins Signed-off-by: Jan Kara --- fs/udf/udfdecl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8fa9c2d7091..8ec865de5f1 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -16,7 +16,7 @@ #define UDF_PREALLOCATE #define UDF_DEFAULT_PREALLOC_BLOCKS 8 -#define UDFFS_DEBUG +#undef UDFFS_DEBUG #ifdef UDFFS_DEBUG #define udf_debug(f, a...) \ -- cgit v1.2.3-70-g09d2 From 702773b16e83fcddc41e0019b8214d3c3cecedbe Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 16 Jun 2008 12:11:54 +0100 Subject: Include in fs/exec.c only for Alpha. We only need it for the /sbin/loader hack for OSF/1 executables, and we don't want to include it otherwise. While we're at it, remove the redundant '&& CONFIG_ARCH_SUPPORTS_AOUT' in the ifdef around that code. It's already dependent on __alpha__, and CONFIG_ARCH_SUPPORTS_AOUT is hard-coded to 'y' there. 
Signed-off-by: David Woodhouse Acked-by: Peter Korsgaard Signed-off-by: Linus Torvalds --- fs/exec.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 9448f1b50b4..da94a6f05df 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -61,6 +60,11 @@ #include #endif +#ifdef __alpha__ +/* for /sbin/loader handling in search_binary_handler() */ +#include +#endif + int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; int suid_dumpable = 0; @@ -1155,7 +1159,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) { int try,retval; struct linux_binfmt *fmt; -#if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT) +#ifdef __alpha__ /* handle /sbin/loader.. */ { struct exec * eh = (struct exec *) bprm->buf; -- cgit v1.2.3-70-g09d2 From a9e0f5293d4999f93b469af4e70382db800a8204 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 16 Jun 2008 12:18:13 +0100 Subject: Remove last traces of a.out support from ELF loader. In commit d20894a23708c2af75966534f8e4dedb46d48db2 ("Remove a.out interpreter support in ELF loader"), Andi removed support for a.out interpreters from the ELF loader, which was only ever needed for the transition from a.out to ELF. This removes the last traces of that support, in particular the inclusion of . Signed-off-by: David Woodhouse Acked-by: Peter Korsgaard Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0fa95b198e6..d48ff5f370f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -548,7 +547,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) struct { struct elfhdr elf_ex; struct elfhdr interp_elf_ex; - struct exec interp_ex; } *loc; loc = kmalloc(sizeof(*loc), GFP_KERNEL); @@ -680,7 +678,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } /* Get the exec headers */ - loc->interp_ex = *((struct exec *)bprm->buf); loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); break; } -- cgit v1.2.3-70-g09d2 From 3878f110f71a0971ff7acc15dd6db711b6ef37c6 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 30 May 2008 15:30:49 -0700 Subject: ocfs2: Move the hb_ctl_path sysctl into the stack glue. ocfs2 needs to call out to the hb_ctl program at unmount for all cluster stacks. The first step is to move the hb_ctl_path sysctl out of the o2cb code and into the generic stack glue. 
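Since only the implementation moves, the sysctl keeps its old location, /proc/sys/fs/ocfs2/nm/hb_ctl_path (see the comment added to stackglue.c below). A trivial, illustration-only sketch that reads the configured helper path back from userspace:

#include <stdio.h>

int main(void)
{
	char path[256];
	FILE *f = fopen("/proc/sys/fs/ocfs2/nm/hb_ctl_path", "r");

	if (!f)
		return 1;
	/* default is /sbin/ocfs2_hb_ctl, as set in the stack glue */
	if (fgets(path, sizeof(path), f))
		printf("hb_ctl helper: %s", path);
	fclose(f);
	return 0;
}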
Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/nodemanager.c | 74 +----------------------------------- fs/ocfs2/cluster/nodemanager.h | 4 -- fs/ocfs2/stack_o2cb.c | 2 +- fs/ocfs2/stackglue.c | 85 ++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/stackglue.h | 2 + 5 files changed, 89 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cf9401e8cd0..cfdb08b484e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -21,7 +21,6 @@ #include #include -#include #include #include "tcp.h" @@ -36,65 +35,6 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; -#define OCFS2_MAX_HB_CTL_PATH 256 -static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; - -static ctl_table ocfs2_nm_table[] = { - { - .ctl_name = 1, - .procname = "hb_ctl_path", - .data = ocfs2_hb_ctl_path, - .maxlen = OCFS2_MAX_HB_CTL_PATH, - .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, - }, - { .ctl_name = 0 } -}; - -static ctl_table ocfs2_mod_table[] = { - { - .ctl_name = FS_OCFS2_NM, - .procname = "nm", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_nm_table - }, - { .ctl_name = 0} -}; - -static ctl_table ocfs2_kern_table[] = { - { - .ctl_name = FS_OCFS2, - .procname = "ocfs2", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_mod_table - }, - { .ctl_name = 0} -}; - -static ctl_table ocfs2_root_table[] = { - { - .ctl_name = CTL_FS, - .procname = "fs", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_kern_table - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header *ocfs2_table_header = NULL; - -const char *o2nm_get_hb_ctl_path(void) -{ - return ocfs2_hb_ctl_path; -} -EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { @@ -941,9 +881,6 @@ void o2nm_undepend_this_node(void) static void __exit exit_o2nm(void) { - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); - /* XXX sync with hb callbacks and shut down hb? */ o2net_unregister_hb_callbacks(); configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); @@ -964,16 +901,9 @@ static int __init init_o2nm(void) if (ret) goto out; - ocfs2_table_header = register_sysctl_table(ocfs2_root_table); - if (!ocfs2_table_header) { - printk(KERN_ERR "nodemanager: unable to register sysctl\n"); - ret = -ENOMEM; /* or something. 
*/ - goto out_o2net; - } - ret = o2net_register_hb_callbacks(); if (ret) - goto out_sysctl; + goto out_o2net; config_group_init(&o2nm_cluster_group.cs_subsys.su_group); mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); @@ -990,8 +920,6 @@ static int __init init_o2nm(void) configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); out_callbacks: o2net_unregister_hb_callbacks(); -out_sysctl: - unregister_sysctl_table(ocfs2_table_header); out_o2net: o2net_exit(); out: diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 7c860361b8d..c992ea0da4a 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,10 +33,6 @@ #include #include -#define FS_OCFS2_NM 1 - -const char *o2nm_get_hb_ctl_path(void); - struct o2nm_node { spinlock_t nd_lock; struct config_item nd_item; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bbd1667aa7d..fb26a7c69c4 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -338,7 +338,7 @@ static void o2hb_stop(const char *group) int ret; char *argv[5], *envp[3]; - argv[0] = (char *)o2nm_get_hb_ctl_path(); + argv[0] = (char *)ocfs2_get_hb_ctl_path(); argv[1] = "-K"; argv[2] = "-u"; argv[3] = (char *)group; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 119f60cea9c..fb9b8e0db26 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "ocfs2_fs.h" @@ -548,10 +549,92 @@ error: return ret; } +/* + * Sysctl bits + * + * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't + * make as much sense in a multiple cluster stack world, but it's safer + * and easier to preserve the name. + */ + +#define FS_OCFS2_NM 1 + +#define OCFS2_MAX_HB_CTL_PATH 256 +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; + +static ctl_table ocfs2_nm_table[] = { + { + .ctl_name = 1, + .procname = "hb_ctl_path", + .data = ocfs2_hb_ctl_path, + .maxlen = OCFS2_MAX_HB_CTL_PATH, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { .ctl_name = 0 } +}; + +static ctl_table ocfs2_mod_table[] = { + { + .ctl_name = FS_OCFS2_NM, + .procname = "nm", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_nm_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_kern_table[] = { + { + .ctl_name = FS_OCFS2, + .procname = "ocfs2", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_mod_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_root_table[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_kern_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ocfs2_table_header = NULL; + +const char *ocfs2_get_hb_ctl_path(void) +{ + return ocfs2_hb_ctl_path; +} +EXPORT_SYMBOL_GPL(ocfs2_get_hb_ctl_path); + + +/* + * Initialization + */ + static int __init ocfs2_stack_glue_init(void) { strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); + ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + if (!ocfs2_table_header) { + printk(KERN_ERR + "ocfs2 stack glue: unable to register sysctl\n"); + return -ENOMEM; /* or something. 
*/ + } + return ocfs2_sysfs_init(); } @@ -559,6 +642,8 @@ static void __exit ocfs2_stack_glue_exit(void) { lproto = NULL; ocfs2_sysfs_exit(); + if (ocfs2_table_header) + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 005e4f170e0..c9fb01ab634 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -258,4 +258,6 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) /* Used by stack plugins */ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +const char *ocfs2_get_hb_ctl_path(void); + #endif /* STACKGLUE_H */ -- cgit v1.2.3-70-g09d2 From 9f9a99f4eccc64650e932090cff0ebd07b81e334 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 30 May 2008 15:43:58 -0700 Subject: ocfs2: Move the call of ocfs2_hb_ctl into the stack glue. Take o2hb_stop() out of the o2cb code and make it part of the generic stack glue as ocfs2_leave_group(). This also allows us to remove the ocfs2_get_hb_ctl_path() function - everything to do with hb_ctl is now part of stackglue.c. o2cb no longer needs a ->hangup() function. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/stack_o2cb.c | 38 -------------------------------------- fs/ocfs2/stackglue.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- fs/ocfs2/stackglue.h | 1 - 3 files changed, 40 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index fb26a7c69c4..765ade5ee84 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -333,43 +333,6 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, return 0; } -static void o2hb_stop(const char *group) -{ - int ret; - char *argv[5], *envp[3]; - - argv[0] = (char *)ocfs2_get_hb_ctl_path(); - argv[1] = "-K"; - argv[2] = "-u"; - argv[3] = (char *)group; - argv[4] = NULL; - - mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); - - /* minimal command environment taken from cpu_run_sbin_hotplug */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - if (ret < 0) - mlog_errno(ret); -} - -/* - * Hangup is a hack for tools compatibility. Older ocfs2-tools software - * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This - * happens regardless of whether the DLM got started, so we can't do it - * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into - * the glue and provide a "hangup" API for super.c to call. - * - * Other stacks will eventually provide a NULL ->hangup() pointer. 
- */ -static void o2cb_cluster_hangup(const char *group, int grouplen) -{ - o2hb_stop(group); -} - static int o2cb_cluster_this_node(unsigned int *node) { int node_num; @@ -388,7 +351,6 @@ static int o2cb_cluster_this_node(unsigned int *node) static struct ocfs2_stack_operations o2cb_stack_ops = { .connect = o2cb_cluster_connect, .disconnect = o2cb_cluster_disconnect, - .hangup = o2cb_cluster_hangup, .this_node = o2cb_cluster_this_node, .dlm_lock = o2cb_dlm_lock, .dlm_unlock = o2cb_dlm_unlock, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index fb9b8e0db26..5f78ff4c76c 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -34,11 +34,13 @@ #define OCFS2_STACK_PLUGIN_O2CB "o2cb" #define OCFS2_STACK_PLUGIN_USER "user" +#define OCFS2_MAX_HB_CTL_PATH 256 static struct ocfs2_locking_protocol *lproto; static DEFINE_SPINLOCK(ocfs2_stack_lock); static LIST_HEAD(ocfs2_stack_list); static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; /* * The stack currently in use. If not null, active_stack->sp_count > 0, @@ -363,6 +365,42 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, } EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); +/* + * Leave the group for this filesystem. This is executed by a userspace + * program (stored in ocfs2_hb_ctl_path). + */ +static void ocfs2_leave_group(const char *group) +{ + int ret; + char *argv[5], *envp[3]; + + argv[0] = ocfs2_hb_ctl_path; + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = (char *)group; + argv[4] = NULL; + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (ret < 0) { + printk(KERN_ERR + "ocfs2: Error %d running user helper " + "\"%s %s %s %s\"\n", + ret, argv[0], argv[1], argv[2], argv[3]); + } +} + +/* + * Hangup is a required post-umount. ocfs2-tools software expects the + * filesystem to call "ocfs2_hb_ctl" during unmount. This happens + * regardless of whether the DLM got started, so we can't do it + * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does + * the actual work. 
+ */ void ocfs2_cluster_hangup(const char *group, int grouplen) { BUG_ON(group == NULL); @@ -371,6 +409,8 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) if (active_stack->sp_ops->hangup) active_stack->sp_ops->hangup(group, grouplen); + ocfs2_leave_group(group); + /* cluster_disconnect() was called with hangup_pending==1 */ ocfs2_stack_driver_put(); } @@ -559,9 +599,6 @@ error: #define FS_OCFS2_NM 1 -#define OCFS2_MAX_HB_CTL_PATH 256 -static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; - static ctl_table ocfs2_nm_table[] = { { .ctl_name = 1, @@ -613,12 +650,6 @@ static ctl_table ocfs2_root_table[] = { static struct ctl_table_header *ocfs2_table_header = NULL; -const char *ocfs2_get_hb_ctl_path(void) -{ - return ocfs2_hb_ctl_path; -} -EXPORT_SYMBOL_GPL(ocfs2_get_hb_ctl_path); - /* * Initialization diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index c9fb01ab634..fe3fd2a1282 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -258,6 +258,5 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) /* Used by stack plugins */ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); -const char *ocfs2_get_hb_ctl_path(void); #endif /* STACKGLUE_H */ -- cgit v1.2.3-70-g09d2 From 2c39450b39880e162b3eb339672314101f58ee1a Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 30 May 2008 15:58:26 -0700 Subject: ocfs2: Remove ->hangup() from stack glue operations. The ->hangup() call was only used to execute ocfs2_hb_ctl. Now that the generic stack glue code handles this, the underlying stack drivers don't need to know about it. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/stack_o2cb.c | 3 +-- fs/ocfs2/stack_user.c | 3 +-- fs/ocfs2/stackglue.c | 5 +---- fs/ocfs2/stackglue.h | 18 +++--------------- 4 files changed, 6 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 765ade5ee84..fcd120f1493 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -317,8 +317,7 @@ out: return rc; } -static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, - int hangup_pending) +static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) { struct dlm_ctxt *dlm = conn->cc_lockspace; struct o2dlm_private *priv = conn->cc_private; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 6b97d11f6bf..c021280dd46 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -816,8 +816,7 @@ out: return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn, - int hangup_pending) +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { dlm_release_lockspace(conn->cc_lockspace, 2); conn->cc_lockspace = NULL; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 5f78ff4c76c..10e149ae5e3 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -352,7 +352,7 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, BUG_ON(conn == NULL); - ret = active_stack->sp_ops->disconnect(conn, hangup_pending); + ret = active_stack->sp_ops->disconnect(conn); /* XXX Should we free it anyway? 
*/ if (!ret) { @@ -406,9 +406,6 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) BUG_ON(group == NULL); BUG_ON(group[grouplen] != '\0'); - if (active_stack->sp_ops->hangup) - active_stack->sp_ops->hangup(group, grouplen); - ocfs2_leave_group(group); /* cluster_disconnect() was called with hangup_pending==1 */ diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index fe3fd2a1282..db56281dd1b 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -134,22 +134,10 @@ struct ocfs2_stack_operations { * be freed. Thus, a stack must not return from ->disconnect() * until it will no longer reference the conn pointer. * - * If hangup_pending is zero, ocfs2_cluster_disconnect() will also - * be dropping the reference on the module. + * Once this call returns, the stack glue will be dropping this + * connection's reference on the module. */ - int (*disconnect)(struct ocfs2_cluster_connection *conn, - int hangup_pending); - - /* - * ocfs2_cluster_hangup() exists for compatibility with older - * ocfs2 tools. Only the classic stack really needs it. As such - * ->hangup() is not required of all stacks. See the comment by - * ocfs2_cluster_hangup() for more details. - * - * Note that ocfs2_cluster_hangup() can only be called if - * hangup_pending was passed to ocfs2_cluster_disconnect(). - */ - void (*hangup)(const char *group, int grouplen); + int (*disconnect)(struct ocfs2_cluster_connection *conn); /* * ->this_node() returns the cluster's unique identifier for the -- cgit v1.2.3-70-g09d2 From 87de87d5e47f94b4ea647a5bd1bc8dc1f7930db4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 3 Jun 2008 09:14:03 -0700 Subject: wext: Dispatch and handle compat ioctls entirely in net/wireless/wext.c Next we can kill the hacks in fs/compat_ioctl.c and also dispatch compat ioctls down into the driver and 80211 protocol helper layers in order to handle iw_point objects embedded in stream replies which need to be translated. Signed-off-by: David S. 
Miller --- fs/compat_ioctl.c | 6 --- include/linux/wireless.h | 13 ++++++ include/net/wext.h | 7 ++++ net/socket.c | 10 +++++ net/wireless/wext.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 134 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 97dba0d9234..8ab850bf2ee 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1757,12 +1757,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a return sys_ioctl(fd, cmd, (unsigned long)tdata); } -struct compat_iw_point { - compat_caddr_t pointer; - __u16 length; - __u16 flags; -}; - static int do_wireless_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { struct iwreq __user *iwr; diff --git a/include/linux/wireless.h b/include/linux/wireless.h index 4a95a0e5eec..79d84687582 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -677,6 +677,19 @@ struct iw_point __u16 flags; /* Optional params */ }; +#ifdef __KERNEL__ +#ifdef CONFIG_COMPAT + +#include + +struct compat_iw_point { + compat_caddr_t pointer; + __u16 length; + __u16 flags; +}; +#endif +#endif + /* * A frequency * For numbers lower than 10^9, we encode the number in 'm' and diff --git a/include/net/wext.h b/include/net/wext.h index 80b31d826b7..6d76a39a9c5 100644 --- a/include/net/wext.h +++ b/include/net/wext.h @@ -12,6 +12,8 @@ extern int wext_proc_init(struct net *net); extern void wext_proc_exit(struct net *net); extern int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd, void __user *arg); +extern int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, + unsigned long arg); #else static inline int wext_proc_init(struct net *net) { @@ -26,6 +28,11 @@ static inline int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned { return -EINVAL; } +static inline int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, + unsigned long arg) +{ + return -EINVAL; +} #endif #endif /* __NET_WEXT_H */ diff --git a/net/socket.c b/net/socket.c index 66c4a8cf6db..81fe8251304 100644 --- a/net/socket.c +++ b/net/socket.c @@ -90,6 +90,7 @@ #include #include +#include #include #include @@ -2210,10 +2211,19 @@ static long compat_sock_ioctl(struct file *file, unsigned cmd, { struct socket *sock = file->private_data; int ret = -ENOIOCTLCMD; + struct sock *sk; + struct net *net; + + sk = sock->sk; + net = sock_net(sk); if (sock->ops->compat_ioctl) ret = sock->ops->compat_ioctl(sock, cmd, arg); + if (ret == -ENOIOCTLCMD && + (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) + ret = compat_wext_handle_ioctl(net, cmd, arg); + return ret; } #endif diff --git a/net/wireless/wext.c b/net/wireless/wext.c index 09022cbb58b..1a4636a9fcd 100644 --- a/net/wireless/wext.c +++ b/net/wireless/wext.c @@ -1112,6 +1112,110 @@ int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd, return ret; } +#ifdef CONFIG_COMPAT +static int compat_standard_call(struct net_device *dev, + struct iwreq *iwr, + unsigned int cmd, + iw_handler handler) +{ + const struct iw_ioctl_description *descr; + struct compat_iw_point *iwp_compat; + struct iw_request_info info; + struct iw_point iwp; + int err; + + descr = standard_ioctl + (cmd - SIOCIWFIRST); + + if (descr->header_type != IW_HEADER_TYPE_POINT) + return ioctl_standard_call(dev, iwr, cmd, handler); + + iwp_compat = (struct compat_iw_point *) &iwr->u.data; + iwp.pointer = compat_ptr(iwp_compat->pointer); + iwp.length = iwp_compat->length; + iwp.flags = iwp_compat->flags; + + info.cmd = 
cmd; + info.flags = 0; + + err = ioctl_standard_iw_point(&iwp, cmd, descr, handler, dev, &info); + + iwp_compat->pointer = ptr_to_compat(iwp.pointer); + iwp_compat->length = iwp.length; + iwp_compat->flags = iwp.flags; + + return err; +} + +static int compat_private_call(struct net_device *dev, struct iwreq *iwr, + unsigned int cmd, iw_handler handler) +{ + const struct iw_priv_args *descr; + struct iw_request_info info; + int ret, extra_size; + + extra_size = get_priv_descr_and_size(dev, cmd, &descr); + + /* Prepare the call */ + info.cmd = cmd; + info.flags = 0; + + /* Check if we have a pointer to user space data or not. */ + if (extra_size == 0) { + /* No extra arguments. Trivial to handle */ + ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u)); + } else { + struct compat_iw_point *iwp_compat; + struct iw_point iwp; + + iwp_compat = (struct compat_iw_point *) &iwr->u.data; + iwp.pointer = compat_ptr(iwp_compat->pointer); + iwp.length = iwp_compat->length; + iwp.flags = iwp_compat->flags; + + ret = ioctl_private_iw_point(&iwp, cmd, descr, + handler, dev, &info, extra_size); + + iwp_compat->pointer = ptr_to_compat(iwp.pointer); + iwp_compat->length = iwp.length; + iwp_compat->flags = iwp.flags; + } + + /* Call commit handler if needed and defined */ + if (ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + return ret; +} + +int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct iwreq iwr; + char *colon; + int ret; + + if (copy_from_user(&iwr, argp, sizeof(struct iwreq))) + return -EFAULT; + + iwr.ifr_name[IFNAMSIZ-1] = 0; + colon = strchr(iwr.ifr_name, ':'); + if (colon) + *colon = 0; + + ret = wext_ioctl_dispatch(net, (struct ifreq *) &iwr, cmd, + compat_standard_call, + compat_private_call); + + if (ret >= 0 && + IW_IS_GET(cmd) && + copy_to_user(argp, &iwr, sizeof(struct iwreq))) + return -EFAULT; + + return ret; +} +#endif + /************************* EVENT PROCESSING *************************/ /* * Process events generated by the wireless layer or the driver. -- cgit v1.2.3-70-g09d2 From 169a3ec492ddb6b0a8203fccba2ddff077154e26 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 21 Dec 2007 03:55:13 -0800 Subject: wext: Remove compat handling from fs/compat_ioctl.c No longer used. Signed-off-by: David S. 
Miller --- fs/compat_ioctl.c | 107 +----------------------------------------------------- 1 file changed, 1 insertion(+), 106 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 8ab850bf2ee..05ec7eef869 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include @@ -1757,58 +1756,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a return sys_ioctl(fd, cmd, (unsigned long)tdata); } -static int do_wireless_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct iwreq __user *iwr; - struct iwreq __user *iwr_u; - struct iw_point __user *iwp; - struct compat_iw_point __user *iwp_u; - compat_caddr_t pointer_u; - void __user *pointer; - __u16 length, flags; - int ret; - - iwr_u = compat_ptr(arg); - iwp_u = (struct compat_iw_point __user *) &iwr_u->u.data; - iwr = compat_alloc_user_space(sizeof(*iwr)); - if (iwr == NULL) - return -ENOMEM; - - iwp = &iwr->u.data; - - if (!access_ok(VERIFY_WRITE, iwr, sizeof(*iwr))) - return -EFAULT; - - if (__copy_in_user(&iwr->ifr_ifrn.ifrn_name[0], - &iwr_u->ifr_ifrn.ifrn_name[0], - sizeof(iwr->ifr_ifrn.ifrn_name))) - return -EFAULT; - - if (__get_user(pointer_u, &iwp_u->pointer) || - __get_user(length, &iwp_u->length) || - __get_user(flags, &iwp_u->flags)) - return -EFAULT; - - if (__put_user(compat_ptr(pointer_u), &iwp->pointer) || - __put_user(length, &iwp->length) || - __put_user(flags, &iwp->flags)) - return -EFAULT; - - ret = sys_ioctl(fd, cmd, (unsigned long) iwr); - - if (__get_user(pointer, &iwp->pointer) || - __get_user(length, &iwp->length) || - __get_user(flags, &iwp->flags)) - return -EFAULT; - - if (__put_user(ptr_to_compat(pointer), &iwp_u->pointer) || - __put_user(length, &iwp_u->length) || - __put_user(flags, &iwp_u->flags)) - return -EFAULT; - - return ret; -} - /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE * for some operations; this forces use of the newer bridge-utils that * use compatiable ioctls @@ -2489,36 +2436,6 @@ COMPATIBLE_IOCTL(I2C_TENBIT) COMPATIBLE_IOCTL(I2C_PEC) COMPATIBLE_IOCTL(I2C_RETRIES) COMPATIBLE_IOCTL(I2C_TIMEOUT) -/* wireless */ -COMPATIBLE_IOCTL(SIOCSIWCOMMIT) -COMPATIBLE_IOCTL(SIOCGIWNAME) -COMPATIBLE_IOCTL(SIOCSIWNWID) -COMPATIBLE_IOCTL(SIOCGIWNWID) -COMPATIBLE_IOCTL(SIOCSIWFREQ) -COMPATIBLE_IOCTL(SIOCGIWFREQ) -COMPATIBLE_IOCTL(SIOCSIWMODE) -COMPATIBLE_IOCTL(SIOCGIWMODE) -COMPATIBLE_IOCTL(SIOCSIWSENS) -COMPATIBLE_IOCTL(SIOCGIWSENS) -COMPATIBLE_IOCTL(SIOCSIWRANGE) -COMPATIBLE_IOCTL(SIOCSIWPRIV) -COMPATIBLE_IOCTL(SIOCSIWSTATS) -COMPATIBLE_IOCTL(SIOCSIWAP) -COMPATIBLE_IOCTL(SIOCGIWAP) -COMPATIBLE_IOCTL(SIOCSIWRATE) -COMPATIBLE_IOCTL(SIOCGIWRATE) -COMPATIBLE_IOCTL(SIOCSIWRTS) -COMPATIBLE_IOCTL(SIOCGIWRTS) -COMPATIBLE_IOCTL(SIOCSIWFRAG) -COMPATIBLE_IOCTL(SIOCGIWFRAG) -COMPATIBLE_IOCTL(SIOCSIWTXPOW) -COMPATIBLE_IOCTL(SIOCGIWTXPOW) -COMPATIBLE_IOCTL(SIOCSIWRETRY) -COMPATIBLE_IOCTL(SIOCGIWRETRY) -COMPATIBLE_IOCTL(SIOCSIWPOWER) -COMPATIBLE_IOCTL(SIOCGIWPOWER) -COMPATIBLE_IOCTL(SIOCSIWAUTH) -COMPATIBLE_IOCTL(SIOCGIWAUTH) /* hiddev */ COMPATIBLE_IOCTL(HIDIOCGVERSION) COMPATIBLE_IOCTL(HIDIOCAPPLICATION) @@ -2749,29 +2666,7 @@ COMPATIBLE_IOCTL(USBDEVFS_IOCTL32) HANDLE_IOCTL(I2C_FUNCS, w_long) HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl) HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl) -/* wireless */ -HANDLE_IOCTL(SIOCGIWRANGE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWPRIV, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWSTATS, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWSPY, 
do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWSPY, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWTHRSPY, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWTHRSPY, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWMLME, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWAPLIST, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWSCAN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWSCAN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWESSID, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWESSID, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWNICKN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWNICKN, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWGENIE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWGENIE, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWENCODEEXT, do_wireless_ioctl) -HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl) -HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl) +/* bridge */ HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl) HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl) /* Not implemented in the native kernel */ -- cgit v1.2.3-70-g09d2 From f948d56435fc1f7506f08866302ecd6e60b533dd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 17 Jun 2008 18:05:40 +0200 Subject: fuse: fix thinko in max I/O size calucation Use max not min to enforce a lower limit on the max I/O size. This bug was introduced by "fuse: fix max i/o size calculation" (commit e5d9a0df07484d6d191756878c974e4307fb24ce). Thanks to Brian Wang for noticing. Reported-by: Brian Wang Signed-off-by: Miklos Szeredi Acked-by: Szabolcs Szakacsits Signed-off-by: Linus Torvalds --- fs/fuse/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 43e99513334..3141690558c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -591,7 +591,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; - fc->max_write = min_t(unsigned, 4096, fc->max_write); + fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } fuse_put_request(fc, req); @@ -667,7 +667,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; - fc->max_read = min_t(unsigned, 4096, d.max_read); + fc->max_read = max_t(unsigned, 4096, d.max_read); /* Used by get_root_inode() */ sb->s_fs_info = fc; -- cgit v1.2.3-70-g09d2 From 2856922c158605514ec5974a03097eaec91f4c0d Mon Sep 17 00:00:00 2001 From: Frederic Bohe Date: Fri, 20 Jun 2008 11:48:48 -0400 Subject: Ext4: Fix online resize block group descriptor corruption This is the patch for the group descriptor table corruption during online resize pointed out by Theodore Tso. The problem was caused by the fact that the ext4 group descriptor can be either 32 or 64 bytes long. Only the 64 bytes structure was taken into account. 
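A minimal user-space sketch of the addressing problem described above (illustrative only -- the structure and function names here are invented, and the actual fix is the EXT4_DESC_SIZE(sb) arithmetic visible in the diff that follows): stepping a typed pointer always advances by the compile-time sizeof(), so once the on-disk descriptor size is 32 bytes while the structure is 64, every index past zero lands on the wrong descriptor.

#include <stdio.h>
#include <stddef.h>

struct fake_desc64 { unsigned char bytes[64]; };	/* compile-time layout: 64 bytes */

/* Broken addressing: advances by sizeof(struct fake_desc64), no matter
 * what descriptor size the filesystem actually uses on disk. */
static void *desc_by_struct(void *block, int idx)
{
	return (struct fake_desc64 *)block + idx;
}

/* Fixed addressing: advances by the on-disk descriptor size (32 or 64). */
static void *desc_by_disk_size(void *block, int idx, size_t desc_size)
{
	return (char *)block + (size_t)idx * desc_size;
}

int main(void)
{
	unsigned char block[4096];
	int idx = 3;
	size_t on_disk = 32;	/* filesystem formatted with 32-byte descriptors */

	printf("struct arithmetic lands at offset %td\n",
	       (char *)desc_by_struct(block, idx) - (char *)block);
	printf("desc_size arithmetic lands at offset %td\n",
	       (char *)desc_by_disk_size(block, idx, on_disk) - (char *)block);
	return 0;	/* prints 192 vs 96 for this example */
}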
Signed-off-by: Frederic Bohe Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9ecb92f6854..9ff7b1c0423 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -855,7 +855,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) */ /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)primary->b_data + gdb_off; + gdp = (struct ext4_group_desc *)((char *)primary->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ -- cgit v1.2.3-70-g09d2 From a30427d92d0bc152b833088e4a305bbeb1a0c162 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 18 May 2008 15:39:11 -0600 Subject: Add a comment in chrdev_open() I stared at this code for a while and almost deleted it before understanding crept into my slow brain. Hopefully this makes life easier for the next person to happen on it. Signed-off-by: Jonathan Corbet --- fs/char_dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index 68e510b8845..a54d69369b2 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); + /* Check i_cdev again in case somebody beat us to it while + we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; -- cgit v1.2.3-70-g09d2 From 9514dff918b947ae43b66517dc90d0e05548bd6a Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 18 May 2008 15:40:00 -0600 Subject: Remove the lock_kernel() call from chrdev_open() All in-kernel char device open() functions now either have their own lock_kernel() calls or clearly do not need one. Signed-off-by: Jonathan Corbet --- fs/char_dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index a54d69369b2..3cb7cda3d78 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -394,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) cdev_put(p); return -ENXIO; } - if (filp->f_op->open) { - lock_kernel(); + if (filp->f_op->open) ret = filp->f_op->open(inode,filp); - unlock_kernel(); - } if (ret) cdev_put(p); return ret; -- cgit v1.2.3-70-g09d2 From 8f5934278d1d86590244c2791b28f77d67466007 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 19 May 2008 19:53:01 -0700 Subject: Replace BKL with superblock lock in fat/msdos/vfat This replaces the use of the BKL in the FAT family of filesystems with the existing superblock lock instead. The code already appears to do mostly proper locking with its own private spinlocks (and mutexes), but while the BKL could possibly have been dropped entirely, converting it to use the superblock lock (which is just a regular mutex) is the conservative thing to do. As a per-filesystem mutex, it not only won't have any of the possible latency issues related to the BKL, but the lock is obviously private to the particular filesystem instance and will thus not cause problems for entirely unrelated users like the BKL can. 
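A rough user-space analogy for the locking-scope argument above (illustrative only; fake_super and its helpers are invented stand-ins for the kernel's superblock and lock_super()/unlock_super()): a per-instance mutex serializes only callers working on the same filesystem instance, whereas one global lock such as the BKL serializes everything, related or not.

#include <pthread.h>
#include <stdio.h>

struct fake_super { pthread_mutex_t s_lock; const char *name; };

static void fake_lock_super(struct fake_super *sb)   { pthread_mutex_lock(&sb->s_lock); }
static void fake_unlock_super(struct fake_super *sb) { pthread_mutex_unlock(&sb->s_lock); }

int main(void)
{
	struct fake_super a = { PTHREAD_MUTEX_INITIALIZER, "fat-instance-A" };
	struct fake_super b = { PTHREAD_MUTEX_INITIALIZER, "fat-instance-B" };

	/* Two different mounts can each hold their own lock at the same time;
	 * with a single global lock the second caller would have to wait. */
	fake_lock_super(&a);
	fake_lock_super(&b);
	printf("%s and %s locked concurrently\n", a.name, b.name);
	fake_unlock_super(&b);
	fake_unlock_super(&a);
	return 0;
}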
Signed-off-by: Linus Torvalds Cc: OGAWA Hirofumi Signed-off-by: Jonathan Corbet --- fs/fat/cache.c | 2 +- fs/fat/dir.c | 4 ++-- fs/fat/file.c | 12 +++++++----- fs/fat/inode.c | 26 ++++++++++++++++---------- fs/msdos/namei.c | 35 +++++++++++++++++++---------------- fs/vfat/namei.c | 35 +++++++++++++++++++---------------- 6 files changed, 64 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/fat/cache.c b/fs/fat/cache.c index fda25479af2..3a9ecac8d61 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -61,7 +61,7 @@ void fat_cache_destroy(void) static inline struct fat_cache *fat_cache_alloc(struct inode *inode) { - return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL); + return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); } static inline void fat_cache_free(struct fat_cache *cache) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 486725ee99a..34541d06e62 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, loff_t cpos; int ret = 0; - lock_kernel(); + lock_super(sb); cpos = filp->f_pos; /* Fake . and .. for the root directory. */ @@ -654,7 +654,7 @@ FillFailed: if (unicode) __putname(unicode); out: - unlock_kernel(); + unlock_super(sb); return ret; } diff --git a/fs/fat/file.c b/fs/fat/file.c index 27cc1164ec3..7059928fb35 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -229,7 +229,8 @@ static int fat_free(struct inode *inode, int skip) void fat_truncate(struct inode *inode) { - struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); const unsigned int cluster_size = sbi->cluster_size; int nr_clusters; @@ -242,9 +243,9 @@ void fat_truncate(struct inode *inode) nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; - lock_kernel(); + lock_super(sb); fat_free(inode, nr_clusters); - unlock_kernel(); + unlock_super(sb); fat_flush_inodes(inode->i_sb, inode, NULL); } @@ -297,12 +298,13 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) int fat_setattr(struct dentry *dentry, struct iattr *attr) { + struct super_block *sb = dentry->d_sb; struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); struct inode *inode = dentry->d_inode; int mask, error = 0; unsigned int ia_valid; - lock_kernel(); + lock_super(sb); /* * Expand the file. 
Since inode_setattr() updates ->i_size @@ -356,7 +358,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) mask = sbi->options.fs_fmask; inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask); out: - unlock_kernel(); + unlock_super(sb); return error; } EXPORT_SYMBOL_GPL(fat_setattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4e0a3dd9d67..46a4508ffd2 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode) static void fat_clear_inode(struct inode *inode) { - struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); - lock_kernel(); spin_lock(&sbi->inode_hash_lock); fat_cache_inval_inode(inode); hlist_del_init(&MSDOS_I(inode)->i_fat_hash); spin_unlock(&sbi->inode_hash_lock); - unlock_kernel(); } static void fat_write_super(struct super_block *sb) @@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; - ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; @@ -567,7 +566,7 @@ retry: if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) return 0; - lock_kernel(); + lock_super(sb); bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); if (!bh) { printk(KERN_ERR "FAT: unable to read inode block " @@ -579,7 +578,7 @@ retry: if (i_pos != MSDOS_I(inode)->i_pos) { spin_unlock(&sbi->inode_hash_lock); brelse(bh); - unlock_kernel(); + unlock_super(sb); goto retry; } @@ -606,7 +605,7 @@ retry: err = sync_dirty_buffer(bh); brelse(bh); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) static struct dentry *fat_get_parent(struct dentry *child) { + struct super_block *sb = child->d_sb; struct buffer_head *bh; struct msdos_dir_entry *de; loff_t i_pos; @@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child) struct inode *inode; int err; - lock_kernel(); + lock_super(sb); err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); if (err) { parent = ERR_PTR(err); goto out; } - inode = fat_build_inode(child->d_sb, de, i_pos); + inode = fat_build_inode(sb, de, i_pos); brelse(bh); if (IS_ERR(inode)) { parent = ERR_CAST(inode); @@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child) parent = ERR_PTR(-ENOMEM); } out: - unlock_kernel(); + unlock_super(sb); return parent; } @@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, long error; char buf[50]; + /* + * GFP_KERNEL is ok here, because while we do hold the + * supeblock lock, memory pressure can't call back into + * the filesystem, since we're only just about to mount + * it and have no inodes etc active! 
+ */ sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 05ff4f1d702..1f7f2956412 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &msdos_dentry_operations; - lock_kernel(); + lock_super(sb); res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); if (res == -ENOENT) goto add; @@ -232,7 +232,7 @@ add: if (dentry) dentry->d_op = &msdos_dentry_operations; out: - unlock_kernel(); + unlock_super(sb); if (!res) return dentry; return ERR_PTR(res); @@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, unsigned char msdos_name[MSDOS_NAME]; int err, is_hid; - lock_kernel(); + lock_super(sb); err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, msdos_name, &MSDOS_SB(sb)->options); @@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, d_instantiate(dentry, inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) err = fat_flush_inodes(sb, dir, inode); return err; @@ -324,11 +324,12 @@ out: /***** Remove a directory */ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) { + struct super_block *sb = dir->i_sb; struct inode *inode = dentry->d_inode; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); /* * Check whether the directory is not in use, then check * whether it is empty. @@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ctime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(inode->i_sb, dir, inode); + err = fat_flush_inodes(sb, dir, inode); return err; } @@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct timespec ts; int err, is_hid, cluster; - lock_kernel(); + lock_super(sb); err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, msdos_name, &MSDOS_SB(sb)->options); @@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); - unlock_kernel(); + unlock_super(sb); fat_flush_inodes(sb, dir, inode); return 0; out_free: fat_free_clusters(dir, cluster); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -419,10 +420,11 @@ out: static int msdos_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb= inode->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); if (err) goto out; @@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) inode->i_ctime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(inode->i_sb, dir, inode); + err = fat_flush_inodes(sb, dir, inode); return err; } @@ -618,10 +620,11 @@ error_inode: static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + struct super_block *sb = old_dir->i_sb; unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; int err, is_hid; - lock_kernel(); + lock_super(sb); err = msdos_format_name(old_dentry->d_name.name, old_dentry->d_name.len, old_msdos_name, @@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct 
dentry *old_dentry, err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, new_dir, new_msdos_name, new_dentry, is_hid); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir); + err = fat_flush_inodes(sb, old_dir, new_dir); return err; } diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index a3522727ea5..b546ba69be8 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, if (len == 0) return -ENOENT; - slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL); + slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); if (slots == NULL) return -ENOMEM; @@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, struct dentry *alias; int err, table; - lock_kernel(); + lock_super(sb); table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0; dentry->d_op = &vfat_dentry_ops[table]; @@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); brelse(sinfo.bh); if (IS_ERR(inode)) { - unlock_kernel(); + unlock_super(sb); return ERR_CAST(inode); } alias = d_find_alias(inode); @@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, dput(alias); else { iput(inode); - unlock_kernel(); + unlock_super(sb); return alias; } } error: - unlock_kernel(); + unlock_super(sb); dentry->d_op = &vfat_dentry_ops[table]; dentry->d_time = dentry->d_parent->d_inode->i_version; dentry = d_splice_alias(inode, dentry); @@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, struct timespec ts; int err; - lock_kernel(); + lock_super(sb); ts = CURRENT_TIME_SEC; err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); @@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); out: - unlock_kernel(); + unlock_super(sb); return err; } static int vfat_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = fat_dir_empty(inode); if (err) @@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -791,10 +792,11 @@ out: static int vfat_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = vfat_find(dir, &dentry->d_name, &sinfo); if (err) @@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct timespec ts; int err, cluster; - lock_kernel(); + lock_super(sb); ts = CURRENT_TIME_SEC; cluster = fat_alloc_new_dir(dir, &ts); @@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); - unlock_kernel(); + unlock_super(sb); return 0; 
out_free: fat_free_clusters(dir, cluster); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct timespec ts; loff_t dotdot_i_pos, new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; + struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; - lock_kernel(); + lock_super(sb); err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); if (err) goto out; @@ -951,7 +954,7 @@ out: brelse(sinfo.bh); brelse(dotdot_bh); brelse(old_sinfo.bh); - unlock_kernel(); + unlock_super(sb); return err; -- cgit v1.2.3-70-g09d2 From 514bcc66d4072a221a8dfd341a4006385a441918 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 May 2008 19:15:48 +0200 Subject: dlm-user: BKL pushdown Signed-off-by: Arnd Bergmann --- fs/dlm/user.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ebbcf38fd33..f976f303c19 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file) struct dlm_user_proc *proc; struct dlm_ls *ls; + lock_kernel(); ls = dlm_find_lockspace_device(iminor(inode)); - if (!ls) + if (!ls) { + unlock_kernel(); return -ENOENT; + } proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); if (!proc) { dlm_put_lockspace(ls); + unlock_kernel(); return -ENOMEM; } @@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file) spin_lock_init(&proc->locks_spin); init_waitqueue_head(&proc->wait); file->private_data = proc; + unlock_kernel(); return 0; } @@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait) static int ctl_device_open(struct inode *inode, struct file *file) { + cycle_kernel_lock(); file->private_data = NULL; return 0; } -- cgit v1.2.3-70-g09d2 From 55d8538498f62ec72b5ba67aa386c7726f630475 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Jun 2008 12:23:15 -0700 Subject: Fix performance regression on lmbench select benchmark Christian Borntraeger reported that reinstating cond_resched() with CONFIG_PREEMPT caused a performance regression on lmbench: For example select file 500: 23 microseconds 32 microseconds and that's really because we totally unnecessarily do the cond_resched() in the innermost loop of select(), which is just silly. This moves it out from the innermost loop (which only ever loops ove the bits in a single "unsigned long" anyway), which makes the performance regression go away. 
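A simplified stand-alone sketch of the loop restructuring (illustrative; the loop is reduced to its shape and cond_resched_stub() merely marks where the kernel's cond_resched() sits): the reschedule point moves from once per word scanned to once per full pass over the descriptor table, so the per-bit work is no longer burdened by it.

#include <stdio.h>

static void cond_resched_stub(void) { /* stands in for cond_resched() */ }

static unsigned long count_bits(const unsigned long *words, int nwords)
{
	unsigned long hits = 0;
	int w;

	for (w = 0; w < nwords; w++) {
		unsigned long bit;

		for (bit = 1; bit; bit <<= 1)	/* innermost: bits of one word */
			if (words[w] & bit)
				hits++;
		/* before the fix: cond_resched_stub() sat here, once per word */
	}
	cond_resched_stub();	/* after the fix: once per pass over the table */
	return hits;
}

int main(void)
{
	unsigned long sample[2] = { 0xf0f0UL, 0x1UL };

	printf("%lu bits set\n", count_bits(sample, 2));
	return 0;
}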
Reported-and-tested-by: Christian Borntraeger Signed-off-by: Linus Torvalds --- fs/select.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 8dda969614a..da0e88201c3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) retval++; } } - cond_resched(); } if (res_in) *rinp = res_in; @@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) *routp = res_out; if (res_ex) *rexp = res_ex; + cond_resched(); } wait = NULL; if (retval || !*timeout || signal_pending(current)) -- cgit v1.2.3-70-g09d2 From fe6e9c1f25ac01f848bd084ee0ee62a5a0966ff3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 23 Jun 2008 08:30:55 -0400 Subject: [PATCH] fix cgroup-inflicted breakage in block_dev.c devcgroup_inode_permission() expects MAY_FOO, not FMODE_FOO; kindly keep your misdesign consistent if you positively have to inflict it on the kernel. Signed-off-by: Al Viro --- fs/block_dev.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 470c10ceb0f..10d8a0aa871 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -931,8 +931,16 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) struct gendisk *disk; int ret; int part; + int perm = 0; - ret = devcgroup_inode_permission(bdev->bd_inode, file->f_mode); + if (file->f_mode & FMODE_READ) + perm |= MAY_READ; + if (file->f_mode & FMODE_WRITE) + perm |= MAY_WRITE; + /* + * hooks: /n/, see "layering violations". + */ + ret = devcgroup_inode_permission(bdev->bd_inode, perm); if (ret != 0) return ret; -- cgit v1.2.3-70-g09d2 From 12fd0d3088d27867be68655bcab2b074f2835f60 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Mon, 9 Jun 2008 21:16:07 -0700 Subject: [patch for 2.6.26 2/4] vfs: utimensat(): be consistent with utime() for immutable and append-only files This patch fixes utimensat() to make its behavior consistent with that of utime()/utimes() when dealing with files marked immutable and append-only. The current utimensat() implementation also returns EPERM if 'times' is non-NULL and the tv_nsec fields are both UTIME_NOW. For consistency, the (times != NULL && times[0].tv_nsec == UTIME_NOW && times[1].tv_nsec == UTIME_NOW) case should be treated like the traditional utimes() case where 'times' is NULL. That is, the call should succeed for a file marked append-only and should give the error EACCES if the file is marked as immutable. The simple way to do this is to set 'times' to NULL if (times[0].tv_nsec == UTIME_NOW && times[1].tv_nsec == UTIME_NOW). This is also the natural approach, since POSIX.1 semantics consider the times == {{x, UTIME_NOW}, {y, UTIME_NOW}} to be exactly equivalent to the case for times == NULL. (Thanks to Miklos for pointing this out.) Patch 3 in this series relies on the simplification provided by this patch. 
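A small user-space illustration of the equivalence being enforced (assuming a libc that exposes utimensat(); the file name is arbitrary): after this change the two calls below must behave identically, including on append-only and immutable files.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <sys/stat.h>

int main(void)
{
	const char *path = "somefile";		/* illustrative path */
	struct timespec both_now[2] = {
		{ .tv_nsec = UTIME_NOW },
		{ .tv_nsec = UTIME_NOW },
	};

	/* Traditional "touch" semantics: times == NULL */
	if (utimensat(AT_FDCWD, path, NULL, 0) == -1)
		perror("utimensat(NULL)");

	/* POSIX treats this as exactly equivalent to the NULL case */
	if (utimensat(AT_FDCWD, path, both_now, 0) == -1)
		perror("utimensat(both UTIME_NOW)");

	return 0;
}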
Acked-by: Miklos Szeredi Cc: Al Viro Cc: Ulrich Drepper Signed-off-by: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/utimes.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index af059d5cb48..14d3edbb3d7 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -102,6 +102,10 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (error) goto dput_and_out; + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + times = NULL; + /* Don't worry, the checks are done in inode_change_ok() */ newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { -- cgit v1.2.3-70-g09d2 From 94c70b9ba7e9c1036284e779e2fef5be89021533 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Mon, 9 Jun 2008 21:16:05 -0700 Subject: [patch for 2.6.26 1/4] vfs: utimensat(): ignore tv_sec if tv_nsec == UTIME_OMIT or UTIME_NOW The POSIX.1 draft spec for utimensat() says that if a times[n].tv_nsec field is UTIME_OMIT or UTIME_NOW, then the value in the corresponding tv_sec field is ignored. See the last sentence of this para, from the spec: If the tv_nsec field of a timespec structure has the special value UTIME_NOW, the file's relevant timestamp shall be set to the greatest value supported by the file system that is not greater than the current time. If the tv_nsec field has the special value UTIME_OMIT, the file's relevant timestamp shall not be changed. In either case, the tv_sec field shall be ignored. However the current Linux implementation requires the tv_sec value to be zero (or the EINVAL error results). This requirement should be removed. Acked-by: Miklos Szeredi Cc: Al Viro Cc: Ulrich Drepper Signed-off-by: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/utimes.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index 14d3edbb3d7..d466bc587e6 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -173,14 +173,6 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __ if (utimes) { if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) return -EFAULT; - if ((tstimes[0].tv_nsec == UTIME_OMIT || - tstimes[0].tv_nsec == UTIME_NOW) && - tstimes[0].tv_sec != 0) - return -EINVAL; - if ((tstimes[1].tv_nsec == UTIME_OMIT || - tstimes[1].tv_nsec == UTIME_NOW) && - tstimes[1].tv_sec != 0) - return -EINVAL; /* Nothing to do, we must not even check the path. */ if (tstimes[0].tv_nsec == UTIME_OMIT && -- cgit v1.2.3-70-g09d2 From 4cca92264e61a90b43fc4e076cd25b7f4e16dc61 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Mon, 9 Jun 2008 21:16:08 -0700 Subject: [patch for 2.6.26 3/4] vfs: utimensat(): fix error checking for {UTIME_NOW,UTIME_OMIT} case The POSIX.1 draft spec for utimensat() says: Only a process with the effective user ID equal to the user ID of the file or with appropriate privileges may use futimens() or utimensat() with a non-null times argument that does not have both tv_nsec fields set to UTIME_NOW and does not have both tv_nsec fields set to UTIME_OMIT. If this condition is violated, then the error EPERM should result. However, the current implementation does not generate EPERM if one tv_nsec field is UTIME_NOW while the other is UTIME_OMIT. It should give this error for that case. This patch: a) Repairs that problem. b) Removes the now unneeded nsec_special() helper function. c) Adds some comments to explain the checks that are being performed. 
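A compact sketch of the ownership rule spelled out above (plain C pseudologic, not the kernel code; owner_or_cap and writable stand in for is_owner_or_cap() and the write-permission check, and immutable/append-only handling is left out): only the both-OMIT and both-NOW combinations escape the owner-or-privileged requirement.

#include <stdbool.h>
#include <stdio.h>

enum ns_field { NS_NOW, NS_OMIT, NS_VALUE };	/* classification of tv_nsec */

static bool utimensat_allowed(enum ns_field atime, enum ns_field mtime,
			      bool owner_or_cap, bool writable)
{
	if (atime == NS_OMIT && mtime == NS_OMIT)
		return true;			/* nothing to change */
	if (atime == NS_NOW && mtime == NS_NOW)
		return owner_or_cap || writable; /* same rule as times == NULL */
	return owner_or_cap;	/* explicit times, or a NOW/OMIT mix */
}

int main(void)
{
	printf("NOW/OMIT by a non-owner with write access: %s\n",
	       utimensat_allowed(NS_NOW, NS_OMIT, false, true)
	       ? "allowed" : "EPERM");
	return 0;
}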
Thanks to Miklos, who provided comments on the previous iteration of this patch. As a result, this version is a little simpler and and its logic is better structured. Miklos suggested an alternative idea, migrating the is_owner_or_cap() checks into fs/attr.c:inode_change_ok() via the use of an ATTR_OWNER_CHECK flag. Maybe we could do that later, but for now I've gone with this version, which is IMO simpler, and can be more easily read as being correct. Acked-by: Miklos Szeredi Cc: Al Viro Cc: Ulrich Drepper Signed-off-by: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/utimes.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index d466bc587e6..118d1c3241b 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -40,14 +40,9 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times) #endif -static bool nsec_special(long nsec) -{ - return nsec == UTIME_OMIT || nsec == UTIME_NOW; -} - static bool nsec_valid(long nsec) { - if (nsec_special(nsec)) + if (nsec == UTIME_OMIT || nsec == UTIME_NOW) return true; return nsec >= 0 && nsec <= 999999999; @@ -106,7 +101,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags times[1].tv_nsec == UTIME_NOW) times = NULL; - /* Don't worry, the checks are done in inode_change_ok() */ + /* In most cases, the checks are done in inode_change_ok() */ newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { error = -EPERM; @@ -128,15 +123,26 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; newattrs.ia_valid |= ATTR_MTIME_SET; } - } - /* - * If times is NULL or both times are either UTIME_OMIT or - * UTIME_NOW, then need to check permissions, because - * inode_change_ok() won't do it. - */ - if (!times || (nsec_special(times[0].tv_nsec) && - nsec_special(times[1].tv_nsec))) { + /* + * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT + * cases, we need to make an extra check that is not done by + * inode_change_ok(). + */ + if (((times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_OMIT) + || + (times[0].tv_nsec == UTIME_OMIT && + times[1].tv_nsec == UTIME_NOW)) + && !is_owner_or_cap(inode)) + goto mnt_drop_write_and_out; + } else { + + /* + * If times is NULL (or both times are UTIME_NOW), + * then we need to check permissions, because + * inode_change_ok() won't do it. + */ error = -EACCES; if (IS_IMMUTABLE(inode)) goto mnt_drop_write_and_out; -- cgit v1.2.3-70-g09d2 From c70f84417429f41519be0197a1092a53c2201f47 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Mon, 9 Jun 2008 21:16:09 -0700 Subject: [patch for 2.6.26 4/4] vfs: utimensat(): fix write access check for futimens() The POSIX.1 draft spec for futimens()/utimensat() says: Only a process with the effective user ID equal to the user ID of the file, *or with write access to the file*, or with appropriate privileges may use futimens() or utimensat() with a null pointer as the times argument or with both tv_nsec fields set to the special value UTIME_NOW. The important piece here is "with write access to the file", and this matters for futimens(), which deals with an argument that is a file descriptor referring to the file whose timestamps are being updated, The standard is saying that the "writability" check is based on the file permissions, not the access mode with which the file is opened. 
(This behavior is consistent with the semantics of FreeBSD's futimes().) However, Linux is currently doing the latter -- futimens(fd, times) is a library function implemented as utimensat(fd, NULL, times, 0) and within the utimensat() implementation we have the code: f = fget(dfd); // dfd is 'fd' ... if (f) { if (!(f->f_mode & FMODE_WRITE)) goto mnt_drop_write_and_out; The check should instead be based on the file permissions. Thanks to Miklos for pointing out how to do this check. Miklos also pointed out a simplification that could be made to my first version of this patch, since the checks for the pathname and file descriptor cases can now be conflated. Acked-by: Miklos Szeredi Cc: Al Viro Cc: Ulrich Drepper Signed-off-by: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/utimes.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index 118d1c3241b..b6b664e7145 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -148,14 +148,9 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags goto mnt_drop_write_and_out; if (!is_owner_or_cap(inode)) { - if (f) { - if (!(f->f_mode & FMODE_WRITE)) - goto mnt_drop_write_and_out; - } else { - error = vfs_permission(&nd, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + error = permission(inode, MAY_WRITE, NULL); + if (error) + goto mnt_drop_write_and_out; } } mutex_lock(&inode->i_mutex); -- cgit v1.2.3-70-g09d2 From c8e7f449b225ee6c87454ac069f0a041035c5140 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Mon, 9 Jun 2008 16:40:35 -0700 Subject: [patch 1/4] vfs: path_{get,put}() cleanups Here are some more places where path_{get,put}() can be used instead of dput()/mntput() pair. Signed-off-by: Jan Blunck Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/namei.c | 11 +++++------ fs/pipe.c | 10 ++++------ 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index c7e43536c49..ee1544696e8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -581,15 +581,13 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd int result; /* make sure the stuff we saved doesn't go away */ - dget(save.dentry); - mntget(save.mnt); + path_get(&save); result = __link_path_walk(name, nd); if (result == -ESTALE) { /* nd->path had been dropped */ nd->path = save; - dget(nd->path.dentry); - mntget(nd->path.mnt); + path_get(&nd->path); nd->flags |= LOOKUP_REVAL; result = __link_path_walk(name, nd); } @@ -1216,8 +1214,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->flags = flags; nd->depth = 0; - nd->path.mnt = mntget(mnt); - nd->path.dentry = dget(dentry); + nd->path.dentry = dentry; + nd->path.mnt = mnt; + path_get(&nd->path); retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && diff --git a/fs/pipe.c b/fs/pipe.c index ec228bc9f88..700f4e0d957 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1003,8 +1003,7 @@ struct file *create_write_pipe(void) void free_write_pipe(struct file *f) { free_pipe_info(f->f_dentry->d_inode); - dput(f->f_path.dentry); - mntput(f->f_path.mnt); + path_put(&f->f_path); put_filp(f); } @@ -1015,8 +1014,8 @@ struct file *create_read_pipe(struct file *wrf) return ERR_PTR(-ENFILE); /* Grab pipe from the writer */ - f->f_path.mnt = mntget(wrf->f_path.mnt); - f->f_path.dentry = dget(wrf->f_path.dentry); + f->f_path = wrf->f_path; + path_get(&wrf->f_path); 
f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; f->f_pos = 0; @@ -1068,8 +1067,7 @@ int do_pipe(int *fd) err_fdr: put_unused_fd(fdr); err_read_pipe: - dput(fr->f_dentry); - mntput(fr->f_vfsmnt); + path_put(&fr->f_path); put_filp(fr); err_write_pipe: free_write_pipe(fw); -- cgit v1.2.3-70-g09d2 From 20d4fdc1a788e4ca0aaf2422772ba668e7e10839 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 9 Jun 2008 16:40:36 -0700 Subject: [patch 2/4] fs: make struct file arg to d_path const Signed-off-by: Jan Engelhardt Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 2 +- include/linux/dcache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 3ee588d5f58..c4c9072d810 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1847,7 +1847,7 @@ Elong: * * "buflen" should be positive. Caller holds the dcache_lock. */ -char *d_path(struct path *path, char *buf, int buflen) +char *d_path(const struct path *path, char *buf, int buflen) { char *res; struct path root; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2a6639407c8..d982eb89c77 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -300,7 +300,7 @@ extern int d_validate(struct dentry *, struct dentry *); extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *path, struct path *root, char *, int); -extern char *d_path(struct path *, char *, int); +extern char *d_path(const struct path *, char *, int); extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ -- cgit v1.2.3-70-g09d2 From 694a1764d657e0f7a9b139bc7269c8d5f5a2534b Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 9 Jun 2008 16:40:37 -0700 Subject: [patch 3/4] vfs: fix ERR_PTR abuse in generic_readlink generic_readlink calls ERR_PTR for negative and positive values (vfs_readlink returns length of "link"), but it should not (not an errno) and does not need to. Signed-off-by: Marcin Slusarz Cc: Al Viro Cc: Christoph Hellwig Acked-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/namei.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index ee1544696e8..01e67dddcc3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2856,16 +2856,17 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct nameidata nd; void *cookie; + int res; nd.depth = 0; cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (!IS_ERR(cookie)) { - int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); - cookie = ERR_PTR(res); - } - return PTR_ERR(cookie); + if (IS_ERR(cookie)) + return PTR_ERR(cookie); + + res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + if (dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + return res; } int vfs_follow_link(struct nameidata *nd, const char *link) -- cgit v1.2.3-70-g09d2 From f9f48ec72bfc9489a30bc6ddbfcf27d86a8bc651 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Mon, 9 Jun 2008 16:40:38 -0700 Subject: [patch 4/4] flock: remove unused fields from file_lock_operations fl_insert and fl_remove are not used right now in the kernel. Remove them. Signed-off-by: Denis V. Lunev Cc: Matthew Wilcox Cc: Alexander Viro Cc: "J. 
Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/locks.c | 6 ------ include/linux/fs.h | 2 -- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 11dbf08651b..dce8c747371 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) /* insert into file's list */ fl->fl_next = *pos; *pos = fl; - - if (fl->fl_ops && fl->fl_ops->fl_insert) - fl->fl_ops->fl_insert(fl); } /* @@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p) fl->fl_fasync = NULL; } - if (fl->fl_ops && fl->fl_ops->fl_remove) - fl->fl_ops->fl_remove(fl); - if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index d490779f18d..7c108082683 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -894,8 +894,6 @@ static inline int file_check_writeable(struct file *filp) typedef struct files_struct *fl_owner_t; struct file_lock_operations { - void (*fl_insert)(struct file_lock *); /* lock insertion callback */ - void (*fl_remove)(struct file_lock *); /* lock removal callback */ void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; -- cgit v1.2.3-70-g09d2 From 3b12cd9862d5f560031d90bda78952ab55e36b24 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 5 May 2008 17:17:44 -0400 Subject: nfsd: add dprintk of compound return We already print each operation of the compound when debugging is turned on; printing the result could also help with remote debugging. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c309c881bd4..313484380a9 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -952,6 +952,7 @@ encode_op: out: nfsd4_release_compoundargs(args); cstate_free(cstate); + dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } -- cgit v1.2.3-70-g09d2 From 13b1867cacbfe6d8203f432996bd8a2ee6b04e79 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Thu, 29 May 2008 12:56:08 +0300 Subject: nfsd: make nfs4xdr WRITEMEM safe against zero count WRITEMEM zeroes the last word in the destination buffer for padding purposes, but this must not be done if no bytes are to be copied, as it would result in zeroing of the word right before the array. The current implementation works since it's always called with non zero nbytes or it follows an encoding of the string (or opaque) length which, if equal to zero, can be overwritten with zero. Nevertheless, it seems safer to check for this case. Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c513bbdf2d3..9547ab63627 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1201,7 +1201,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) *p++ = htonl((u32)((n) >> 32)); \ *p++ = htonl((u32)(n)); \ } while (0) -#define WRITEMEM(ptr,nbytes) do { \ +#define WRITEMEM(ptr,nbytes) do if (nbytes > 0) { \ *(p + XDR_QUADLEN(nbytes) -1) = 0; \ memcpy(p, ptr, nbytes); \ p += XDR_QUADLEN(nbytes); \ -- cgit v1.2.3-70-g09d2 From bedbdd8bada194a690d2901801bf8451965086b3 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 10 Jun 2008 08:40:35 -0400 Subject: knfsd: Replace lock_kernel with a mutex for nfsd thread startup/shutdown locking. This removes the BKL from the RPC service creation codepath. The BKL really isn't adequate for this job since some of this info needs protection across sleeps. Also, add some comments to try and clarify how the locking should work and to make it clear that the BKL isn't necessary as long as there is adequate locking between tasks when touching the svc_serv fields. Signed-off-by: Neil Brown Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 37 +++++++++++++++++++++++-------------- fs/nfsd/nfssvc.c | 45 ++++++++++++++++++++++++++++++++------------- include/linux/nfsd/nfsd.h | 1 + net/sunrpc/svc.c | 15 +++++++++------ 4 files changed, 65 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5ac00c4fee9..049d2a9c771 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -450,22 +450,26 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) int i; int rv; int len; - int npools = nfsd_nrpools(); + int npools; int *nthreads; + mutex_lock(&nfsd_mutex); + npools = nfsd_nrpools(); if (npools == 0) { /* * NFS is shut down. The admin can start it by * writing to the threads file but NOT the pool_threads * file, sorry. Report zero threads. */ + mutex_unlock(&nfsd_mutex); strcpy(buf, "0\n"); return strlen(buf); } nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + rv = -ENOMEM; if (nthreads == NULL) - return -ENOMEM; + goto out_free; if (size > 0) { for (i = 0; i < npools; i++) { @@ -496,10 +500,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) mesg += len; } + mutex_unlock(&nfsd_mutex); return (mesg-buf); out_free: kfree(nthreads); + mutex_unlock(&nfsd_mutex); return rv; } @@ -566,14 +572,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) return len; } -static ssize_t write_ports(struct file *file, char *buf, size_t size) +static ssize_t __write_ports(struct file *file, char *buf, size_t size) { if (size == 0) { int len = 0; - lock_kernel(); + if (nfsd_serv) len = svc_xprt_names(nfsd_serv, buf, 0); - unlock_kernel(); return len; } /* Either a single 'fd' number is written, in which @@ -603,9 +608,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) /* Decrease the count, but don't shutdown the * the service */ - lock_kernel(); nfsd_serv->sv_nrthreads--; - unlock_kernel(); } return err < 0 ? 
err : 0; } @@ -614,10 +617,8 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int len = 0; if (!toclose) return -ENOMEM; - lock_kernel(); if (nfsd_serv) len = svc_sock_names(buf, nfsd_serv, toclose); - unlock_kernel(); if (len >= 0) lockd_down(); kfree(toclose); @@ -655,7 +656,6 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { if (port == 0) return -EINVAL; - lock_kernel(); if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); @@ -666,13 +666,22 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) } else err = -ENOTCONN; } - unlock_kernel(); return err < 0 ? err : 0; } } return -EINVAL; } +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + mutex_lock(&nfsd_mutex); + rv = __write_ports(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + + int nfsd_max_blksize; static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) @@ -691,13 +700,13 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) if (bsize > NFSSVC_MAXBLKSIZE) bsize = NFSSVC_MAXBLKSIZE; bsize &= ~(1024-1); - lock_kernel(); + mutex_lock(&nfsd_mutex); if (nfsd_serv && nfsd_serv->sv_nrthreads) { - unlock_kernel(); + mutex_unlock(&nfsd_mutex); return -EBUSY; } nfsd_max_blksize = bsize; - unlock_kernel(); + mutex_unlock(&nfsd_mutex); } return sprintf(buf, "%d\n", nfsd_max_blksize); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 941041f4b13..512bd04c6dd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -53,11 +53,27 @@ extern struct svc_program nfsd_program; static void nfsd(struct svc_rqst *rqstp); struct timeval nfssvc_boot; - struct svc_serv *nfsd_serv; static atomic_t nfsd_busy; static unsigned long nfsd_last_call; static DEFINE_SPINLOCK(nfsd_call_lock); +/* + * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members + * of the svc_serv struct. In particular, ->sv_nrthreads but also to some + * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt + * + * If (out side the lock) nfsd_serv is non-NULL, then it must point to a + * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number + * of nfsd threads must exist and each must listed in ->sp_all_threads in each + * entry of ->sv_pools[]. + * + * Transitions of the thread count between zero and non-zero are of particular + * interest since the svc_serv needs to be created and initialized at that + * point, or freed. 
+ */ +DEFINE_MUTEX(nfsd_mutex); +struct svc_serv *nfsd_serv; + #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -190,13 +206,14 @@ void nfsd_reset_versions(void) } } + int nfsd_create_serv(void) { int err = 0; - lock_kernel(); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { svc_get(nfsd_serv); - unlock_kernel(); return 0; } if (nfsd_max_blksize == 0) { @@ -223,7 +240,7 @@ int nfsd_create_serv(void) nfsd, SIG_NOCLEAN, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; - unlock_kernel(); + do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; } @@ -282,6 +299,8 @@ int nfsd_set_nrthreads(int n, int *nthreads) int tot = 0; int err = 0; + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + if (nfsd_serv == NULL || n <= 0) return 0; @@ -316,7 +335,6 @@ int nfsd_set_nrthreads(int n, int *nthreads) nthreads[0] = 1; /* apply the new numbers */ - lock_kernel(); svc_get(nfsd_serv); for (i = 0; i < n; i++) { err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], @@ -325,7 +343,6 @@ int nfsd_set_nrthreads(int n, int *nthreads) break; } svc_destroy(nfsd_serv); - unlock_kernel(); return err; } @@ -334,8 +351,8 @@ int nfsd_svc(unsigned short port, int nrservs) { int error; - - lock_kernel(); + + mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); error = -EINVAL; if (nrservs <= 0) @@ -363,7 +380,7 @@ nfsd_svc(unsigned short port, int nrservs) failure: svc_destroy(nfsd_serv); /* Release server */ out: - unlock_kernel(); + mutex_unlock(&nfsd_mutex); return error; } @@ -399,7 +416,7 @@ nfsd(struct svc_rqst *rqstp) sigset_t shutdown_mask, allowed_mask; /* Lock module and set up kernel thread */ - lock_kernel(); + mutex_lock(&nfsd_mutex); daemonize("nfsd"); /* After daemonize() this kernel thread shares current->fs @@ -417,11 +434,13 @@ nfsd(struct svc_rqst *rqstp) siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS); siginitsetinv(&allowed_mask, ALLOWED_SIGS); + nfsdstats.th_cnt++; rqstp->rq_task = current; - unlock_kernel(); + mutex_unlock(&nfsd_mutex); + /* * We want less throttling in balance_dirty_pages() so that nfs to @@ -477,7 +496,7 @@ nfsd(struct svc_rqst *rqstp) /* Clear signals before calling svc_exit_thread() */ flush_signals(current); - lock_kernel(); + mutex_lock(&nfsd_mutex); nfsdstats.th_cnt --; @@ -486,7 +505,7 @@ out: svc_exit_thread(rqstp); /* Release module */ - unlock_kernel(); + mutex_unlock(&nfsd_mutex); module_put_and_exit(0); } diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 41d30c9c9de..88d85b96442 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -54,6 +54,7 @@ typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); extern struct svc_program nfsd_program; extern struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; +extern struct mutex nfsd_mutex; extern struct svc_serv *nfsd_serv; extern struct seq_operations nfs_exports_op; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 01c7e311b90..7bffaff2a3a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -461,7 +461,8 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, EXPORT_SYMBOL(svc_create_pooled); /* - * Destroy an RPC service. Should be called with the BKL held + * Destroy an RPC service. Should be called with appropriate locking to + * protect the sv_nrthreads, sv_permsocks and sv_tempsocks. 
*/ void svc_destroy(struct svc_serv *serv) @@ -578,9 +579,10 @@ out_enomem: EXPORT_SYMBOL(svc_prepare_thread); /* - * Create a thread in the given pool. Caller must hold BKL. - * On a NUMA or SMP machine, with a multi-pool serv, the thread - * will be restricted to run on the cpus belonging to the pool. + * Create a thread in the given pool. Caller must hold BKL or another lock to + * serialize access to the svc_serv struct. On a NUMA or SMP machine, with a + * multi-pool serv, the thread will be restricted to run on the cpus belonging + * to the pool. */ static int __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, @@ -674,7 +676,7 @@ found_pool: * of threads the given number. If `pool' is non-NULL, applies * only to threads in that pool, otherwise round-robins between * all pools. Must be called with a svc_get() reference and - * the BKL held. + * the BKL or another lock to protect access to svc_serv fields. * * Destroying threads relies on the service threads filling in * rqstp->rq_task, which only the nfs ones do. Assumes the serv @@ -722,7 +724,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) EXPORT_SYMBOL(svc_set_num_threads); /* - * Called from a server thread as it's exiting. Caller must hold BKL. + * Called from a server thread as it's exiting. Caller must hold the BKL or + * the "service mutex", whichever is appropriate for the service. */ void svc_exit_thread(struct svc_rqst *rqstp) -- cgit v1.2.3-70-g09d2 From 3dd98a3bccb1bdd30b8a4a755e7bead1b64160ec Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 08:40:36 -0400 Subject: knfsd: clean up nfsd filesystem interfaces Several of the nfsd filesystem interfaces allow changes to parameters that don't have any effect on a running nfsd service. They are only ever checked when nfsd is started. This patch fixes it so that changes to those procfiles return -EBUSY if nfsd is already running to make it clear that changes on the fly don't work. The patch should also close some relatively harmless races between changing the info in those interfaces and starting nfsd, since these variables are being moved under the protection of the nfsd_mutex. Finally, the nfsv4recoverydir file always returns -EINVAL if read. This patch fixes it to return the recoverydir path as expected. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 17 ++++++++++---- fs/nfsd/nfsctl.c | 66 ++++++++++++++++++++++++++++++++++++++++++----------- fs/nfsd/nfssvc.c | 8 +++++++ 3 files changed, 74 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5c97d47676a..bf11d6879ab 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3249,12 +3249,14 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); } +/* + * user_recovery_dirname is protected by the nfsd_mutex since it's only + * accessed when nfsd is starting. + */ static void nfs4_set_recdir(char *recdir) { - nfs4_lock_state(); strcpy(user_recovery_dirname, recdir); - nfs4_unlock_state(); } /* @@ -3278,6 +3280,12 @@ nfs4_reset_recoverydir(char *recdir) return status; } +char * +nfs4_recoverydir(void) +{ + return user_recovery_dirname; +} + /* * Called when leasetime is changed. * @@ -3286,11 +3294,12 @@ nfs4_reset_recoverydir(char *recdir) * we start to register any changes in lease time. If the administrator * really wants to change the lease time *now*, they can go ahead and bring * nfsd down and then back up again after changing the lease time. 
+ * + * user_lease_time is protected by nfsd_mutex since it's only really accessed + * when nfsd is starting */ void nfs4_reset_lease(time_t leasetime) { - lock_kernel(); user_lease_time = leasetime; - unlock_kernel(); } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 049d2a9c771..2c2eb8796c1 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -509,7 +509,7 @@ out_free: return rv; } -static ssize_t write_versions(struct file *file, char *buf, size_t size) +static ssize_t __write_versions(struct file *file, char *buf, size_t size) { /* * Format: @@ -572,6 +572,16 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) return len; } +static ssize_t write_versions(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_versions(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + static ssize_t __write_ports(struct file *file, char *buf, size_t size) { if (size == 0) { @@ -675,6 +685,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; + mutex_lock(&nfsd_mutex); rv = __write_ports(file, buf, size); mutex_unlock(&nfsd_mutex); @@ -714,16 +725,17 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) #ifdef CONFIG_NFSD_V4 extern time_t nfs4_leasetime(void); -static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) { /* if size > 10 seconds, call * nfs4_reset_lease() then write out the new lease (seconds) as reply */ char *mesg = buf; - int rv; + int rv, lease; if (size > 0) { - int lease; + if (nfsd_serv) + return -EBUSY; rv = get_int(&mesg, &lease); if (rv) return rv; @@ -735,24 +747,52 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) return strlen(buf); } -static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +static ssize_t write_leasetime(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_leasetime(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +extern char *nfs4_recoverydir(void); + +static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) { char *mesg = buf; char *recdir; int len, status; - if (size == 0 || size > PATH_MAX || buf[size-1] != '\n') - return -EINVAL; - buf[size-1] = 0; + if (size > 0) { + if (nfsd_serv) + return -EBUSY; + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; - recdir = mesg; - len = qword_get(&mesg, recdir, size); - if (len <= 0) - return -EINVAL; + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; - status = nfs4_reset_recoverydir(recdir); + status = nfs4_reset_recoverydir(recdir); + } + sprintf(buf, "%s\n", nfs4_recoverydir()); return strlen(buf); } + +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_recoverydir(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + #endif /*----------------------------------------------------------------------------*/ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 512bd04c6dd..929af233510 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -70,6 +70,14 @@ static DEFINE_SPINLOCK(nfsd_call_lock); * Transitions of the thread count between zero and non-zero are of particular * interest since the svc_serv 
needs to be created and initialized at that * point, or freed. + * + * Finally, the nfsd_mutex also protects some of the global variables that are + * accessed when nfsd starts and that are settable via the write_* routines in + * nfsctl.c. In particular: + * + * user_recovery_dirname + * user_lease_time + * nfsd_versions */ DEFINE_MUTEX(nfsd_mutex); struct svc_serv *nfsd_serv; -- cgit v1.2.3-70-g09d2 From e096bbc6488d3e49d476bf986d33752709361277 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 08:40:37 -0400 Subject: knfsd: remove special handling for SIGHUP The special handling for SIGHUP in knfsd is a holdover from much earlier versions of Linux where reloading the export table was more expensive. That facility is not really needed anymore and to my knowledge, is seldom-used. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 929af233510..6339cb70a08 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -44,11 +44,6 @@ * when not handling a request. i.e. when waiting */ #define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT)) -/* if the last thread dies with SIGHUP, then the exports table is - * left unchanged ( like 2.4-{0-9} ). Any other signal will clear - * the exports table (like 2.2). - */ -#define SIG_NOCLEAN SIGHUP extern struct svc_program nfsd_program; static void nfsd(struct svc_rqst *rqstp); @@ -175,7 +170,6 @@ int nfsd_nrthreads(void) return nfsd_serv->sv_nrthreads; } -static int killsig; /* signal that was used to kill last nfsd */ static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ @@ -186,11 +180,9 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_racache_shutdown(); nfs4_state_shutdown(); - printk(KERN_WARNING "nfsd: last server has exited\n"); - if (killsig != SIG_NOCLEAN) { - printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); - nfsd_export_flush(); - } + printk(KERN_WARNING "nfsd: last server has exited, flushing export " + "cache\n"); + nfsd_export_flush(); } void nfsd_reset_versions(void) @@ -242,10 +234,9 @@ int nfsd_create_serv(void) } atomic_set(&nfsd_busy, 0); - nfsd_serv = svc_create_pooled(&nfsd_program, - nfsd_max_blksize, - nfsd_last_thread, - nfsd, SIG_NOCLEAN, THIS_MODULE); + nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + nfsd_last_thread, nfsd, SIGINT, + THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; @@ -490,17 +481,9 @@ nfsd(struct svc_rqst *rqstp) atomic_dec(&nfsd_busy); } - if (err != -EINTR) { + if (err != -EINTR) printk(KERN_WARNING "nfsd: terminating on error %d\n", -err); - } else { - unsigned int signo; - - for (signo = 1; signo <= _NSIG; signo++) - if (sigismember(¤t->pending.signal, signo) && - !sigismember(¤t->blocked, signo)) - break; - killsig = signo; - } + /* Clear signals before calling svc_exit_thread() */ flush_signals(current); -- cgit v1.2.3-70-g09d2 From 9867d76ca16b3f455f9ca83861f4ce5c94a25928 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 08:40:38 -0400 Subject: knfsd: convert knfsd to kthread API This patch is rather large, but I couldn't figure out a way to break it up that would remain bisectable. It does several things: - change svc_thread_fn typedef to better match what kthread_create expects - change svc_pool_map_set_cpumask to be more kthread friendly. 
Make it take a task arg and and get rid of the "oldmask" - have svc_set_num_threads call kthread_create directly - eliminate __svc_create_thread Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 45 ++++++++++++-------- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 100 +++++++++++++++------------------------------ 3 files changed, 64 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6339cb70a08..9e215681371 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -46,7 +47,7 @@ #define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT)) extern struct svc_program nfsd_program; -static void nfsd(struct svc_rqst *rqstp); +static int nfsd(void *vrqstp); struct timeval nfssvc_boot; static atomic_t nfsd_busy; static unsigned long nfsd_last_call; @@ -407,18 +408,19 @@ update_thread_usage(int busy_threads) /* * This is the NFS server kernel thread */ -static void -nfsd(struct svc_rqst *rqstp) +static int +nfsd(void *vrqstp) { + struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct fs_struct *fsp; - int err; sigset_t shutdown_mask, allowed_mask; + int err, preverr = 0; + unsigned int signo; /* Lock module and set up kernel thread */ mutex_lock(&nfsd_mutex); - daemonize("nfsd"); - /* After daemonize() this kernel thread shares current->fs + /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ fsp = copy_fs_struct(current->fs); @@ -433,14 +435,18 @@ nfsd(struct svc_rqst *rqstp) siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS); siginitsetinv(&allowed_mask, ALLOWED_SIGS); + /* + * thread is spawned with all signals set to SIG_IGN, re-enable + * the ones that matter + */ + for (signo = 1; signo <= _NSIG; signo++) { + if (!sigismember(&shutdown_mask, signo)) + allow_signal(signo); + } nfsdstats.th_cnt++; - - rqstp->rq_task = current; - mutex_unlock(&nfsd_mutex); - /* * We want less throttling in balance_dirty_pages() so that nfs to * localhost doesn't cause nfsd to lock up due to all the client's @@ -462,15 +468,25 @@ nfsd(struct svc_rqst *rqstp) */ while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) ; - if (err < 0) + if (err == -EINTR) break; + else if (err < 0) { + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, -err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; + } + update_thread_usage(atomic_read(&nfsd_busy)); atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); - /* Process request with signals blocked. */ + /* Process request with signals blocked. 
*/ sigprocmask(SIG_SETMASK, &allowed_mask, NULL); svc_process(rqstp); @@ -481,14 +497,10 @@ nfsd(struct svc_rqst *rqstp) atomic_dec(&nfsd_busy); } - if (err != -EINTR) - printk(KERN_WARNING "nfsd: terminating on error %d\n", -err); - /* Clear signals before calling svc_exit_thread() */ flush_signals(current); mutex_lock(&nfsd_mutex); - nfsdstats.th_cnt --; out: @@ -498,6 +510,7 @@ out: /* Release module */ mutex_unlock(&nfsd_mutex); module_put_and_exit(0); + return 0; } static __be32 map_new_errors(u32 vers, __be32 nfserr) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4b54c5fdcfd..011d6d8100d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -22,7 +22,7 @@ /* * This is the RPC server thread function prototype */ -typedef void (*svc_thread_fn)(struct svc_rqst *); +typedef int (*svc_thread_fn)(void *); /* * diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 7bffaff2a3a..03a9f1a9e75 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -291,15 +292,14 @@ svc_pool_map_put(void) /* - * Set the current thread's cpus_allowed mask so that it + * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. - * - * Returns 1 and fills in oldmask iff a cpumask was applied. */ -static inline int -svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) +static inline void +svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) { struct svc_pool_map *m = &svc_pool_map; + unsigned int node = m->pool_to[pidx]; /* * The caller checks for sv_nrpools > 1, which @@ -307,26 +307,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) */ BUG_ON(m->count == 0); - switch (m->mode) - { - default: - return 0; + switch (m->mode) { case SVC_POOL_PERCPU: { - unsigned int cpu = m->pool_to[pidx]; - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - return 1; + set_cpus_allowed_ptr(task, &cpumask_of_cpu(node)); + break; } case SVC_POOL_PERNODE: { - unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); - - *oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, nodecpumask); - return 1; + set_cpus_allowed_ptr(task, nodecpumask); + break; } } } @@ -578,47 +569,6 @@ out_enomem: } EXPORT_SYMBOL(svc_prepare_thread); -/* - * Create a thread in the given pool. Caller must hold BKL or another lock to - * serialize access to the svc_serv struct. On a NUMA or SMP machine, with a - * multi-pool serv, the thread will be restricted to run on the cpus belonging - * to the pool. 
- */ -static int -__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, - struct svc_pool *pool) -{ - struct svc_rqst *rqstp; - int error = -ENOMEM; - int have_oldmask = 0; - cpumask_t uninitialized_var(oldmask); - - rqstp = svc_prepare_thread(serv, pool); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); - goto out; - } - - if (serv->sv_nrpools > 1) - have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); - - error = kernel_thread((int (*)(void *)) func, rqstp, 0); - - if (have_oldmask) - set_cpus_allowed(current, oldmask); - - if (error < 0) - goto out_thread; - svc_sock_update_bufs(serv); - error = 0; -out: - return error; - -out_thread: - svc_exit_thread(rqstp); - goto out; -} - /* * Choose a pool in which to create a new thread, for svc_set_num_threads */ @@ -688,7 +638,9 @@ found_pool: int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { - struct task_struct *victim; + struct svc_rqst *rqstp; + struct task_struct *task; + struct svc_pool *chosen_pool; int error = 0; unsigned int state = serv->sv_nrthreads-1; @@ -704,18 +656,34 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) /* create new threads */ while (nrservs > 0) { nrservs--; + chosen_pool = choose_pool(serv, pool, &state); + + rqstp = svc_prepare_thread(serv, chosen_pool); + if (IS_ERR(rqstp)) { + error = PTR_ERR(rqstp); + break; + } + __module_get(serv->sv_module); - error = __svc_create_thread(serv->sv_function, serv, - choose_pool(serv, pool, &state)); - if (error < 0) { + task = kthread_create(serv->sv_function, rqstp, serv->sv_name); + if (IS_ERR(task)) { + error = PTR_ERR(task); module_put(serv->sv_module); + svc_exit_thread(rqstp); break; } + + rqstp->rq_task = task; + if (serv->sv_nrpools > 1) + svc_pool_map_set_cpumask(task, chosen_pool->sp_id); + + svc_sock_update_bufs(serv); + wake_up_process(task); } /* destroy old threads */ while (nrservs < 0 && - (victim = choose_victim(serv, pool, &state)) != NULL) { - send_sig(serv->sv_kill_signal, victim, 1); + (task = choose_victim(serv, pool, &state)) != NULL) { + send_sig(serv->sv_kill_signal, task, 1); nrservs++; } -- cgit v1.2.3-70-g09d2 From a75c5d01e4235a7dd785548ac756f248b1b40107 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 08:40:39 -0400 Subject: sunrpc: remove sv_kill_signal field from svc_serv struct Since we no longer make any distinction between shutdown signals with nfsd, then it becomes easier to just standardize on a particular signal to use to bring it down (SIGINT, in this case). Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfssvc.c | 3 +-- include/linux/sunrpc/svc.h | 5 ++--- net/sunrpc/svc.c | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9e215681371..26c81149d49 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -236,8 +236,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - nfsd_last_thread, nfsd, SIGINT, - THIS_MODULE); + nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 011d6d8100d..dc69068d94c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -80,7 +80,6 @@ struct svc_serv { struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ - int sv_kill_signal; /* signal to kill threads */ }; /* @@ -388,8 +387,8 @@ struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - void (*shutdown)(struct svc_serv*), - svc_thread_fn, int sig, struct module *); + void (*shutdown)(struct svc_serv*), svc_thread_fn, + struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 03a9f1a9e75..5a32cb7c4bb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -434,7 +434,7 @@ EXPORT_SYMBOL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, void (*shutdown)(struct svc_serv *serv), - svc_thread_fn func, int sig, struct module *mod) + svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); @@ -443,7 +443,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, if (serv != NULL) { serv->sv_function = func; - serv->sv_kill_signal = sig; serv->sv_module = mod; } @@ -683,7 +682,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) /* destroy old threads */ while (nrservs < 0 && (task = choose_victim(serv, pool, &state)) != NULL) { - send_sig(serv->sv_kill_signal, task, 1); + send_sig(SIGINT, task, 1); nrservs++; } -- cgit v1.2.3-70-g09d2 From abd1ec4efd82ca06127bce833ad8a4bbec8a0dcb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Jun 2008 10:03:12 -0400 Subject: lockd: close potential race with rapid lockd_up/lockd_down cycle If lockd_down is called very rapidly after lockd_up returns, then there is a slim chance that lockd() will never be called. kthread() will return before calling the function, so we'll end up never actually calling the cleanup functions for the thread. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/lockd/svc.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2169af4d545..5bd9bf0fa9d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -50,7 +50,7 @@ EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct task_struct *nlmsvc_task; -static struct svc_serv *nlmsvc_serv; +static struct svc_rqst *nlmsvc_rqst; int nlmsvc_grace_period; unsigned long nlmsvc_timeout; @@ -194,20 +194,11 @@ lockd(void *vrqstp) svc_process(rqstp); } - flush_signals(current); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); - unlock_kernel(); - - nlmsvc_task = NULL; - nlmsvc_serv = NULL; - - /* Exit the RPC thread */ - svc_exit_thread(rqstp); - return 0; } @@ -254,16 +245,15 @@ int lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ { struct svc_serv *serv; - struct svc_rqst *rqstp; int error = 0; mutex_lock(&nlmsvc_mutex); /* * Check whether we're already up and running. */ - if (nlmsvc_serv) { + if (nlmsvc_rqst) { if (proto) - error = make_socks(nlmsvc_serv, proto); + error = make_socks(nlmsvc_rqst->rq_server, proto); goto out; } @@ -288,9 +278,10 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ /* * Create the kernel thread and wait for it to start. */ - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nlmsvc_rqst)) { + error = PTR_ERR(nlmsvc_rqst); + nlmsvc_rqst = NULL; printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); @@ -298,16 +289,15 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ } svc_sock_update_bufs(serv); - nlmsvc_serv = rqstp->rq_server; - nlmsvc_task = kthread_run(lockd, rqstp, serv->sv_name); + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); nlmsvc_task = NULL; - nlmsvc_serv = NULL; + nlmsvc_rqst = NULL; printk(KERN_WARNING "lockd_up: kthread_run failed, error=%d\n", error); - svc_exit_thread(rqstp); goto destroy_and_out; } @@ -346,6 +336,9 @@ lockd_down(void) BUG(); } kthread_stop(nlmsvc_task); + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; + nlmsvc_rqst = NULL; out: mutex_unlock(&nlmsvc_mutex); } -- cgit v1.2.3-70-g09d2 From c7d106c90ec40a0e35a6960157b40f238627246e Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Thu, 12 Jun 2008 13:38:42 +1000 Subject: nfsd: fix race in nfsd_nrthreads() We need the nfsd_mutex before accessing nfsd_serv->sv_nrthreads or we can't even guarantee nfsd_serv will still be there. Signed-off-by: Neil Brown Acked-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfssvc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 26c81149d49..96fdbcab8d9 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -165,10 +165,12 @@ int nfsd_vers(int vers, enum vers_op change) int nfsd_nrthreads(void) { - if (nfsd_serv == NULL) - return 0; - else - return nfsd_serv->sv_nrthreads; + int rv = 0; + mutex_lock(&nfsd_mutex); + if (nfsd_serv) + rv = nfsd_serv->sv_nrthreads; + mutex_unlock(&nfsd_mutex); + return rv; } static void nfsd_last_thread(struct svc_serv *serv) -- cgit v1.2.3-70-g09d2 From 599eb3046a1380f31c65715f3940184c531c90cb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Jun 2008 10:11:09 +1000 Subject: knfsd: nfsd: Handle ERESTARTSYS from syscalls. OCFS2 can return -ERESTARTSYS from write requests (and possibly elsewhere) if there is a signal pending. If nfsd is shutdown (by sending a signal to each thread) while there is still an IO load from the client, each thread could handle one last request with a signal pending. This can result in -ERESTARTSYS which is not understood by nfserrno() and so is reflected back to the client as nfserr_io aka -EIO. This is wrong. Instead, interpret ERESTARTSYS to mean "try again later" by returning nfserr_jukebox. The client will resend and - if the server is restarted - the write will (hopefully) be successful and everyone will be happy. The symptom that I narrowed down to this was: copy a large file via NFS to an OCFS2 filesystem, and restart the nfs server during the copy. The 'cp' might get an -EIO, and the file will be corrupted - presumably holes in the middle where writes appeared to fail. Signed-off-by: Neil Brown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsproc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6cfc96a1248..b5a20c48671 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -614,6 +614,7 @@ nfserrno (int errno) #endif { nfserr_stale, -ESTALE }, { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, { nfserr_dropit, -EAGAIN }, { nfserr_dropit, -ENOMEM }, { nfserr_badname, -ESRCH }, -- cgit v1.2.3-70-g09d2 From 8837abcab3d16608bd2c7fac051a839d48f2f30c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 16 Jun 2008 13:20:29 +0200 Subject: nfsd: rename MAY_ flags Rename nfsd_permission() specific MAY_* flags to NFSD_MAY_* to make it clear, that these are not used outside nfsd, and to avoid name and number space conflicts with the VFS. [comment from hch: rename MAY_READ, MAY_WRITE and MAY_EXEC as well] Signed-off-by: Miklos Szeredi Signed-off-by: J. 
Bruce Fields --- fs/nfsd/lockd.c | 2 +- fs/nfsd/nfs2acl.c | 7 +-- fs/nfsd/nfs3acl.c | 5 +- fs/nfsd/nfs3proc.c | 8 ++-- fs/nfsd/nfs4proc.c | 23 ++++++---- fs/nfsd/nfs4state.c | 6 +-- fs/nfsd/nfsfh.c | 2 +- fs/nfsd/nfsproc.c | 8 ++-- fs/nfsd/vfs.c | 115 ++++++++++++++++++++++++---------------------- include/linux/nfsd/nfsd.h | 26 +++++------ 10 files changed, 105 insertions(+), 97 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 9e4a568a501..6b6225ac492 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -35,7 +35,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) fh.fh_export = NULL; exp_readlock(); - nfserr = nfsd_open(rqstp, &fh, S_IFREG, MAY_LOCK, filp); + nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); fh_put(&fh); rqstp->rq_client = NULL; exp_readunlock(); diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1c3b7654e96..4e3219e8411 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -40,7 +40,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) RETURN_STATUS(nfserr); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -107,7 +108,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); if (!nfserr) { nfserr = nfserrno( nfsd_set_posix_acl( @@ -134,7 +135,7 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - return fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); } /* diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index b647f2f872d..9981dbb377a 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -36,7 +36,8 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (nfserr) RETURN_STATUS(nfserr); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -101,7 +102,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); if (!nfserr) { nfserr = nfserrno( nfsd_set_posix_acl( diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index c721a1e6e9d..4d617ea28cf 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -63,7 +63,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); if (nfserr) RETURN_STATUS(nfserr); @@ -242,7 +242,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, attr = &argp->attrs; /* Get the directory inode */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_CREATE); + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE); if (nfserr) RETURN_STATUS(nfserr); @@ -558,7 +558,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * 
rqstp, struct nfsd_fhandle *argp, resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); /* Check special features of the file system. May request * different read/write sizes for file systems known to have @@ -597,7 +597,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->p_case_insensitive = 0; resp->p_case_preserving = 1; - nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); if (nfserr == 0) { struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 313484380a9..5c3683cfd59 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -71,11 +71,11 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs return nfserr_inval; if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - accmode |= MAY_READ; + accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - accmode |= (MAY_WRITE | MAY_TRUNC); + accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); if (open->op_share_deny & NFS4_SHARE_DENY_WRITE) - accmode |= MAY_WRITE; + accmode |= NFSD_MAY_WRITE; status = fh_verify(rqstp, current_fh, S_IFREG, accmode); @@ -126,7 +126,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); if (!created) - status = do_open_permission(rqstp, current_fh, open, MAY_NOP); + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_NOP); out: fh_put(&resfh); @@ -157,7 +158,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); - status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE); + status = do_open_permission(rqstp, current_fh, open, + NFSD_MAY_OWNER_OVERRIDE); return status; } @@ -186,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, rp->rp_openfh_len); - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" " restoring previous filehandle\n"); @@ -285,7 +287,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, putfh->pf_fhlen); - return fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); } static __be32 @@ -363,7 +365,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_init(&resfh, NFS4_FHSIZE); - status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, MAY_CREATE); + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, + NFSD_MAY_CREATE); if (status == nfserr_symlink) status = nfserr_notdir; if (status) @@ -445,7 +448,7 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -730,7 +733,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int 
count; __be32 status; - status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bf11d6879ab..eca8aaa450f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1722,9 +1722,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf /* Stateid was not found, this is a new OPEN */ int flags = 0; if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - flags |= MAY_READ; + flags |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - flags |= MAY_WRITE; + flags |= NFSD_MAY_WRITE; status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags); if (status) goto out; @@ -2610,7 +2610,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if ((status = fh_verify(rqstp, &cstate->current_fh, - S_IFREG, MAY_LOCK))) { + S_IFREG, NFSD_MAY_LOCK))) { dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 100ae564116..c7b0fdaeac9 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -279,7 +279,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - if (!(access & MAY_LOCK)) { + if (!(access & NFSD_MAY_LOCK)) { /* * pseudoflavor restrictions are not enforced on NLM, * which clients virtually always use auth_sys for, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b5a20c48671..0766f95d236 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -65,7 +65,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); return nfsd_return_attrs(nfserr, resp); } @@ -215,11 +215,11 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, SVCFH_fmt(dirfhp), argp->len, argp->name); /* First verify the parent file handle */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_EXEC); + nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); if (nfserr) goto done; /* must fh_put dirfhp even on error */ - /* Check for MAY_WRITE in nfsd_create if necessary */ + /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ nfserr = nfserr_acces; if (!argp->len) @@ -281,7 +281,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = nfsd_permission(rqstp, newfhp->fh_export, newfhp->fh_dentry, - MAY_WRITE|MAY_LOCAL_ACCESS); + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); if (nfserr && nfserr != nfserr_rofs) goto out_unlock; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a3a291f771f..5e05ddda456 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -144,7 +144,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); /* Obtain dentry and export. 
*/ - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); if (err) return err; @@ -262,14 +262,14 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, { struct dentry *dentry; struct inode *inode; - int accmode = MAY_SATTR; + int accmode = NFSD_MAY_SATTR; int ftype = 0; __be32 err; int host_err; int size_change = 0; if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) - accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; if (iap->ia_valid & ATTR_SIZE) ftype = S_IFREG; @@ -331,7 +331,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, */ if (iap->ia_valid & ATTR_SIZE) { if (iap->ia_size < inode->i_size) { - err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); + err = nfsd_permission(rqstp, fhp->fh_export, dentry, + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; } @@ -462,7 +463,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int flags = 0; /* Get inode */ - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); + error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); if (error) return error; @@ -563,20 +564,20 @@ struct accessmap { int how; }; static struct accessmap nfs3_regaccess[] = { - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_EXECUTE, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, - { NFS3_ACCESS_EXTEND, MAY_WRITE }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, { 0, 0 } }; static struct accessmap nfs3_diraccess[] = { - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_LOOKUP, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, - { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, - { NFS3_ACCESS_DELETE, MAY_REMOVE }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC}, + { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, + { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, { 0, 0 } }; @@ -589,10 +590,10 @@ static struct accessmap nfs3_anyaccess[] = { * mainly at mode bits, and we make sure to ignore read-only * filesystem checks */ - { NFS3_ACCESS_READ, MAY_READ }, - { NFS3_ACCESS_EXECUTE, MAY_EXEC }, - { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS }, - { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_READ, NFSD_MAY_READ }, + { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC }, + { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, + { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS }, { 0, 0 } }; @@ -606,7 +607,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor u32 query, result = 0, sresult = 0; __be32 error; - error = fh_verify(rqstp, fhp, 0, MAY_NOP); + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); if (error) goto out; @@ -678,7 +679,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * and (hopefully) checked permission - so allow OWNER_OVERRIDE * in case a chmod has now revoked permission. 
*/ - err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); + err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; @@ -689,7 +690,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * or any access when mandatory locking enabled */ err = nfserr_perm; - if (IS_APPEND(inode) && (access & MAY_WRITE)) + if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE)) goto out; /* * We must ignore files (but only files) which might have mandatory @@ -706,14 +707,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * Check to see if there are any leases on this file. * This may block while leases are broken. */ - host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0)); + host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); if (host_err == -EWOULDBLOCK) host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; - if (access & MAY_WRITE) { - if (access & MAY_READ) + if (access & NFSD_MAY_WRITE) { + if (access & NFSD_MAY_READ) flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; @@ -1069,12 +1070,12 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - MAY_READ|MAY_OWNER_OVERRIDE); + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); } else { - err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); @@ -1098,13 +1099,13 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - MAY_WRITE|MAY_OWNER_OVERRIDE); + NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); } else { - err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; @@ -1136,7 +1137,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if ((u64)count > ~(u64)offset) return nfserr_inval; - if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { @@ -1197,7 +1199,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1334,7 +1336,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1471,7 +1473,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) __be32 err; int host_err; - err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); + err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); if (err) goto out; @@ -1526,7 +1528,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; fh_lock(fhp); 
@@ -1591,10 +1593,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, __be32 err; int host_err; - err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; - err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP); + err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); if (err) goto out; @@ -1661,10 +1663,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, __be32 err; int host_err; - err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; - err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE); + err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1768,7 +1770,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, err = nfserr_acces; if (!flen || isdotent(fname, flen)) goto out; - err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; @@ -1834,7 +1836,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, struct file *file; loff_t offset = *offsetp; - err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file); if (err) goto out; @@ -1875,7 +1877,7 @@ out: __be32 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) { - __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP); + __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; @@ -1896,18 +1898,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct inode *inode = dentry->d_inode; int err; - if (acc == MAY_NOP) + if (acc == NFSD_MAY_NOP) return 0; #if 0 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", acc, - (acc & MAY_READ)? " read" : "", - (acc & MAY_WRITE)? " write" : "", - (acc & MAY_EXEC)? " exec" : "", - (acc & MAY_SATTR)? " sattr" : "", - (acc & MAY_TRUNC)? " trunc" : "", - (acc & MAY_LOCK)? " lock" : "", - (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "", + (acc & NFSD_MAY_READ)? " read" : "", + (acc & NFSD_MAY_WRITE)? " write" : "", + (acc & NFSD_MAY_EXEC)? " exec" : "", + (acc & NFSD_MAY_SATTR)? " sattr" : "", + (acc & NFSD_MAY_TRUNC)? " trunc" : "", + (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", inode->i_mode, IS_IMMUTABLE(inode)? " immut" : "", IS_APPEND(inode)? " append" : "", @@ -1920,18 +1922,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * system. But if it is IRIX doing check on write-access for a * device special file, we ignore rofs. 
*/ - if (!(acc & MAY_LOCAL_ACCESS)) - if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { + if (!(acc & NFSD_MAY_LOCAL_ACCESS)) + if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { if (exp_rdonly(rqstp, exp) || __mnt_is_readonly(exp->ex_path.mnt)) return nfserr_rofs; - if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) + if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) return nfserr_perm; } - if ((acc & MAY_TRUNC) && IS_APPEND(inode)) + if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) return nfserr_perm; - if (acc & MAY_LOCK) { + if (acc & NFSD_MAY_LOCK) { /* If we cannot rely on authentication in NLM requests, * just allow locks, otherwise require read permission, or * ownership @@ -1939,7 +1941,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if (exp->ex_flags & NFSEXP_NOAUTHNLM) return 0; else - acc = MAY_READ | MAY_OWNER_OVERRIDE; + acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; } /* * The file owner always gets access permission for accesses that @@ -1955,15 +1957,16 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, * We must trust the client to do permission checking - using "ACCESS" * with NFSv3. */ - if ((acc & MAY_OWNER_OVERRIDE) && + if ((acc & NFSD_MAY_OWNER_OVERRIDE) && inode->i_uid == current->fsuid) return 0; + /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && - acc == (MAY_READ | MAY_OWNER_OVERRIDE)) + acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) err = permission(inode, MAY_EXEC, NULL); return err? nfserrno(err) : 0; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 88d85b96442..a2861d95ecc 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -28,20 +28,20 @@ #define NFSD_SUPPORTED_MINOR_VERSION 0 /* - * Special flags for nfsd_permission. These must be different from MAY_READ, - * MAY_WRITE, and MAY_EXEC. + * Flags for nfsd_permission */ -#define MAY_NOP 0 -#define MAY_SATTR 8 -#define MAY_TRUNC 16 -#define MAY_LOCK 32 -#define MAY_OWNER_OVERRIDE 64 -#define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ -#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC) -# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_LOCAL_ACCESS or MAY_OWNER_OVERRIDE." 
-#endif -#define MAY_CREATE (MAY_EXEC|MAY_WRITE) -#define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC) +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ +#define NFSD_MAY_READ 4 /* == MAY_READ */ +#define NFSD_MAY_SATTR 8 +#define NFSD_MAY_TRUNC 16 +#define NFSD_MAY_LOCK 32 +#define NFSD_MAY_OWNER_OVERRIDE 64 +#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) /* * Callback function for readdir -- cgit v1.2.3-70-g09d2 From be285c712bbbe5db43e503782fbef2bfeaa345f9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 16 Jun 2008 13:28:07 +0200 Subject: [patch 3/3] vfs: make d_path() consistent across mount operations The path that __d_path() computes can become slightly inconsistent when it races with mount operations: it grabs the vfsmount_lock when traversing mount points but immediately drops it again, only to re-grab it when it reaches the next mount point. The result is that the filename computed is not always consisent, and the file may never have had that name. (This is unlikely, but still possible.) Fix this by grabbing the vfsmount_lock for the whole duration of __d_path(). Signed-off-by: Andreas Gruenbacher Signed-off-by: John Johansen Signed-off-by: Miklos Szeredi Acked-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/dcache.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index c4c9072d810..2b479de10a0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1782,6 +1782,7 @@ char *__d_path(const struct path *path, struct path *root, char * end = buffer+buflen; char * retval; + spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); if (!IS_ROOT(dentry) && d_unhashed(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) @@ -1800,14 +1801,11 @@ char *__d_path(const struct path *path, struct path *root, break; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - spin_lock(&vfsmount_lock); if (vfsmnt->mnt_parent == vfsmnt) { - spin_unlock(&vfsmount_lock); goto global_root; } dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; - spin_unlock(&vfsmount_lock); continue; } parent = dentry->d_parent; @@ -1820,6 +1818,8 @@ char *__d_path(const struct path *path, struct path *root, dentry = parent; } +out: + spin_unlock(&vfsmount_lock); return retval; global_root: @@ -1829,9 +1829,11 @@ global_root: goto Elong; root->mnt = vfsmnt; root->dentry = dentry; - return retval; + goto out; + Elong: - return ERR_PTR(-ENAMETOOLONG); + retval = ERR_PTR(-ENAMETOOLONG); + goto out; } /** -- cgit v1.2.3-70-g09d2 From 31f3e0b3a18c6d48196c40a82a3b8c01f4ff6b23 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 23 Jun 2008 18:11:52 +0200 Subject: [patch 1/3] vfs: dcache sparse fixes Fix the following sparse warnings: fs/dcache.c:2183:19: warning: symbol 'filp_cachep' was not declared. Should it be static? 
fs/dcache.c:115:3: warning: context imbalance in 'dentry_iput' - unexpected unlock fs/dcache.c:188:2: warning: context imbalance in 'dput' - different lock contexts for basic block fs/dcache.c:400:2: warning: context imbalance in 'prune_one_dentry' - different lock contexts for basic block fs/dcache.c:431:22: warning: context imbalance in 'prune_dcache' - different lock contexts for basic block fs/dcache.c:563:2: warning: context imbalance in 'shrink_dcache_sb' - different lock contexts for basic block fs/dcache.c:1385:6: warning: context imbalance in 'd_delete' - wrong count at exit fs/dcache.c:1636:2: warning: context imbalance in '__d_unalias' - unexpected unlock fs/dcache.c:1735:2: warning: context imbalance in 'd_materialise_unique' - different lock contexts for basic block Signed-off-by: Miklos Szeredi Reviewed-by: Matthew Wilcox Acked-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/dcache.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 2b479de10a0..e4b2b9436b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -106,9 +107,10 @@ static void dentry_lru_remove(struct dentry *dentry) /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. - * Called with dcache_lock and per dentry lock held, drops both. */ static void dentry_iput(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) { struct inode *inode = dentry->d_inode; if (inode) { @@ -132,12 +134,13 @@ static void dentry_iput(struct dentry * dentry) * d_kill - kill dentry and return parent * @dentry: dentry to kill * - * Called with dcache_lock and d_lock, releases both. The dentry must - * already be unhashed and removed from the LRU. + * The dentry must already be unhashed and removed from the LRU. * * If this is the root of the dentry tree, return NULL. */ static struct dentry *d_kill(struct dentry *dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) { struct dentry *parent; @@ -383,11 +386,11 @@ restart: * Try to prune ancestors as well. This is necessary to prevent * quadratic behavior of shrink_dcache_parent(), but is also expected * to be beneficial in reducing dentry cache fragmentation. - * - * Called with dcache_lock, drops it and then regains. - * Called with dentry->d_lock held, drops it. */ static void prune_one_dentry(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) + __acquires(dcache_lock) { __d_drop(dentry); dentry = d_kill(dentry); @@ -1604,10 +1607,9 @@ static int d_isparent(struct dentry *p1, struct dentry *p2) * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... - * - * On return, dcache_lock will have been unlocked. 
*/ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) + __releases(dcache_lock) { struct mutex *m1 = NULL, *m2 = NULL; struct dentry *ret; @@ -1743,7 +1745,6 @@ out_nolock: shouldnt_be_hashed: spin_unlock(&dcache_lock); BUG(); - goto shouldnt_be_hashed; } static int prepend(char **buffer, int *buflen, const char *str, @@ -1758,7 +1759,7 @@ static int prepend(char **buffer, int *buflen, const char *str, } /** - * d_path - return the path of a dentry + * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry (may be modified by this function) * @buffer: buffer to return value in @@ -1847,7 +1848,7 @@ Elong: * * Returns the buffer or an error code if the path was too long. * - * "buflen" should be positive. Caller holds the dcache_lock. + * "buflen" should be positive. */ char *d_path(const struct path *path, char *buf, int buflen) { -- cgit v1.2.3-70-g09d2 From cdd16d0265c9234228fd37fbbad844d7e894b278 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 23 Jun 2008 18:11:53 +0200 Subject: [patch 2/3] vfs: dcache cleanups Comment from Al Viro: add prepend_name() wrapper. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/dcache.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index e4b2b9436b3..6068c25b393 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1747,8 +1747,7 @@ shouldnt_be_hashed: BUG(); } -static int prepend(char **buffer, int *buflen, const char *str, - int namelen) +static int prepend(char **buffer, int *buflen, const char *str, int namelen) { *buflen -= namelen; if (*buflen < 0) @@ -1758,6 +1757,11 @@ static int prepend(char **buffer, int *buflen, const char *str, return 0; } +static int prepend_name(char **buffer, int *buflen, struct qstr *name) +{ + return prepend(buffer, buflen, name->name, name->len); +} + /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report @@ -1780,8 +1784,8 @@ char *__d_path(const struct path *path, struct path *root, { struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; - char * end = buffer+buflen; - char * retval; + char *end = buffer + buflen; + char *retval; spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); @@ -1811,8 +1815,7 @@ char *__d_path(const struct path *path, struct path *root, } parent = dentry->d_parent; prefetch(parent); - if ((prepend(&end, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) || + if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; retval = end; @@ -1825,8 +1828,7 @@ out: global_root: retval += 1; /* hit the slash */ - if (prepend(&retval, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) + if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) goto Elong; root->mnt = vfsmnt; root->dentry = dentry; @@ -1918,16 +1920,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) retval = end-1; *retval = '/'; - for (;;) { - struct dentry *parent; - if (IS_ROOT(dentry)) - break; + while (!IS_ROOT(dentry)) { + struct dentry *parent = dentry->d_parent; - parent = dentry->d_parent; prefetch(parent); - - if ((prepend(&end, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) || + if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; @@ -1978,7 +1975,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) error = -ENOENT; /* 
Has the current directory has been unlinked? */ spin_lock(&dcache_lock); - if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) { + if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; -- cgit v1.2.3-70-g09d2 From 33852a1f2bb014e4047a844556c0d76a2f790c37 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Jun 2008 14:20:11 -0400 Subject: NFS: Reduce the NFS mount code stack usage. This appears to fix the Oops reported in http://bugzilla.kernel.org/show_bug.cgi?id=10826 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 68 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7..dac663dc561 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options, { struct nfs_mount_data *data = (struct nfs_mount_data *)options; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type, { struct nfs_server *server = NULL; struct super_block *s; - struct nfs_fh mntfh; - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); + error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs_create_server(&data, &mntfh); + server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, &data); + nfs_fill_super(s, data); } - mntroot = nfs_get_root(s, &mntfh); + mntroot = nfs_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.nfs_server.hostname); - kfree(data.mount_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->nfs_server.hostname); + kfree(data->mount_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_err_nosb: @@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options, struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1959,26 +1963,31 @@ out_no_client_address: static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; struct super_block *s; struct nfs_server *server; - struct 
nfs_fh mntfh; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs4_validate_mount_data(raw_data, &data, dev_name); + error = nfs4_validate_mount_data(raw_data, data, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs4_create_server(&data, &mntfh); + server = nfs4_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, nfs4_fill_super(s); } - mntroot = nfs4_get_root(s, &mntfh); + mntroot = nfs4_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.client_address); - kfree(data.nfs_server.export_path); - kfree(data.nfs_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->client_address); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_free: -- cgit v1.2.3-70-g09d2 From b7e2445737ff69cef892b6fd9cd71cae2c9e9515 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Jun 2008 15:21:11 -0400 Subject: NFS: Fix filehandle size comparisons in the mount code Fix a sign issue in xdr_decode_fhstatus3() Fix incorrect comparison in nfs_validate_mount_data() Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 5 +++-- fs/nfs/super.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502c..779d2eb649c 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) { struct nfs_fh *fh = res->fh; + unsigned size; if ((res->status = ntohl(*p++)) == 0) { - int size = ntohl(*p++); - if (size <= NFS3_FHSIZE) { + size = ntohl(*p++); + if (size <= NFS3_FHSIZE && size != 0) { fh->size = size; memcpy(fh->data, p, size); } else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index dac663dc561..614efeed543 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1249,13 +1249,13 @@ static int nfs_validate_mount_data(void *options, case 5: memset(data->context, 0, sizeof(data->context)); case 6: - if (data->flags & NFS_MOUNT_VER3) + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; mntfh->size = data->root.size; - else + } else mntfh->size = NFS2_FHSIZE; - if (mntfh->size > sizeof(mntfh->data)) - goto out_invalid_fh; memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) -- cgit v1.2.3-70-g09d2 From 03fa9e84e5dc10aeacb0e4eb2f708cd9fc36a5b8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jun 2008 16:02:35 -0400 Subject: NFS: nfs_updatepage(): don't mark page as dirty if an error 
occurred Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e325..f333848fd3b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page, } status = nfs_writepage_setup(ctx, page, offset, count); - __set_page_dirty_nobuffers(page); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)i_size_read(inode)); - if (status < 0) - nfs_set_pageerror(page); return status; } -- cgit v1.2.3-70-g09d2 From e8183c2452041326c95258ecc7865b6fcd91c730 Mon Sep 17 00:00:00 2001 From: Tomas Janousek Date: Mon, 23 Jun 2008 15:12:35 +0200 Subject: udf: Fix regression in UDF anchor block detection In some cases it could happen that some block passed test in udf_check_anchor_block() even though udf_read_tagged() refused to read it later (e.g. because checksum was not correct). This patch makes udf_check_anchor_block() use udf_read_tagged() so that the checking is stricter. This fixes the regression (certain disks unmountable) caused by commit 423cf6dc04eb79d441bfda2b127bc4b57134b41d. Signed-off-by: Tomas Janousek Signed-off-by: Jan Kara --- fs/udf/super.c | 57 +++++++++++++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 7a5f69be6ac..44cc702f96c 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -682,38 +682,26 @@ static int udf_vrs(struct super_block *sb, int silent) /* * Check whether there is an anchor block in the given block */ -static int udf_check_anchor_block(struct super_block *sb, sector_t block, - bool varconv) +static int udf_check_anchor_block(struct super_block *sb, sector_t block) { - struct buffer_head *bh = NULL; - tag *t; + struct buffer_head *bh; uint16_t ident; - uint32_t location; - if (varconv) { - if (udf_fixed_to_variable(block) >= - sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) - return 0; - bh = sb_bread(sb, udf_fixed_to_variable(block)); - } - else - bh = sb_bread(sb, block); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return 0; - - t = (tag *)bh->b_data; - ident = le16_to_cpu(t->tagIdent); - location = le32_to_cpu(t->tagLocation); brelse(bh); - if (ident != TAG_IDENT_AVDP) - return 0; - return location == block; + + return ident == TAG_IDENT_AVDP; } /* Search for an anchor volume descriptor pointer */ -static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, - sector_t lastblock) +static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock) { sector_t last[6]; int i; @@ -739,7 +727,7 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, sb->s_blocksize_bits) continue; - if (udf_check_anchor_block(sb, last[i], varconv)) { + if (udf_check_anchor_block(sb, last[i])) { sbi->s_anchor[0] = last[i]; sbi->s_anchor[1] = last[i] - 256; return last[i]; @@ -748,17 +736,17 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, if (last[i] < 256) continue; - if (udf_check_anchor_block(sb, last[i] - 256, varconv)) { + if (udf_check_anchor_block(sb, last[i] - 256)) { sbi->s_anchor[1] = last[i] - 256; return last[i]; } } - if 
(udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) { + if (udf_check_anchor_block(sb, sbi->s_session + 256)) { sbi->s_anchor[0] = sbi->s_session + 256; return last[0]; } - if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) { + if (udf_check_anchor_block(sb, sbi->s_session + 512)) { sbi->s_anchor[0] = sbi->s_session + 512; return last[0]; } @@ -780,23 +768,24 @@ static void udf_find_anchor(struct super_block *sb) int i; struct udf_sb_info *sbi = UDF_SB(sb); - lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block); + lastblock = udf_scan_anchors(sb, sbi->s_last_block); if (lastblock) goto check_anchor; /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); /* Firstly, we try to not convert number of the last block */ - lastblock = udf_scan_anchors(sb, 1, + lastblock = udf_scan_anchors(sb, udf_variable_to_fixed(sbi->s_last_block)); - if (lastblock) { - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + if (lastblock) goto check_anchor; - } /* Secondly, we try with converted number of the last block */ - lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block); - if (lastblock) - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + lastblock = udf_scan_anchors(sb, sbi->s_last_block); + if (!lastblock) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + } check_anchor: /* -- cgit v1.2.3-70-g09d2 From 17c15da00c0e7289375ad57e8fea0c7892b74aa0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 18 Jun 2008 11:30:40 -0500 Subject: [GFS2] BUG: unable to handle kernel paging request at ffff81002690e000 This patch fixes bugzilla bug bz448866: gfs2: BUG: unable to handle kernel paging request at ffff81002690e000. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a315..3401628d742 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -195,7 +195,7 @@ ulong_aligned: depending on architecture. I've experimented with several ways of writing this section such as using an else before the goto but this one seems to be the fastest. */ - while ((unsigned char *)plong < end - 1) { + while ((unsigned char *)plong < end - sizeof(unsigned long)) { prefetch(plong + 1); if (((*plong) & LBITMASK) != lskipval) break; -- cgit v1.2.3-70-g09d2 From 5af4e7a0bea715f2dd7190859a43eb2258b1f388 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 24 Jun 2008 12:53:38 -0500 Subject: [GFS2] fix gfs2 block allocation (cleaned up) This patch fixes bz 450641. This patch changes the computation for zero_metapath_length(), which it renames to metapath_branch_start(). When you are extending the metadata tree, The indirect blocks that point to the new data block must either diverge from the existing tree either at the inode, or at the first indirect block. They can diverge at the first indirect block because the inode has room for 483 pointers while the indirect blocks have room for 509 pointers, so when the tree is grown, there is some free space in the first indirect block. What metapath_branch_start() now computes is the height where the first indirect block for the new data block is located. It can either be 1 (if the indirect block diverges from the inode) or 2 (if it diverges from the first indirect block). 
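As a rough illustration of the rule described above (an editorial sketch with a hypothetical helper name, not code taken from the patch): because the inode block holds 483 pointers while each indirect block holds 509, a grown metadata tree can only diverge at the inode itself or inside the first indirect block, so the branch start is always height 1 or 2.

	/* Sketch: at which height does the new chain of indirect blocks begin? */
	static inline unsigned int branch_start_sketch(const struct metapath *mp)
	{
		if (mp->mp_list[0] == 0)	/* path reuses slot 0, i.e. the existing first indirect block */
			return 2;
		return 1;			/* any other slot is a fresh pointer straight from the inode */
	}
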
Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c19184f2e70..bec76b1c2bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, } -static inline unsigned int zero_metapath_length(const struct metapath *mp, - unsigned height) +static inline unsigned int metapath_branch_start(const struct metapath *mp) { - unsigned int i; - for (i = 0; i < height - 1; i++) { - if (mp->mp_list[i] != 0) - return i; - } - return height; + if (mp->mp_list[0] == 0) + return 2; + return 1; } /** @@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; - unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; @@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* Building up tree height */ state = ALLOC_GROW_HEIGHT; iblks = height - ip->i_height; - zmpl = zero_metapath_length(mp, height); - iblks -= zmpl; - iblks += height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); } } @@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_meta_header)); *ptr = zero_bn; state = ALLOC_GROW_DEPTH; - for(i = zmpl; i < height; i++) { + for(i = branch_start; i < height; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); mp->mp_bh[i] = NULL; } - i = zmpl; + i = branch_start; } if (n == 0) break; -- cgit v1.2.3-70-g09d2 From 15c8b6c1aaaf1c4edd67e2f02e4d8e1bd1a51c0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2008 09:39:44 +0200 Subject: on_each_cpu(): kill unused 'retry' parameter It's not even passed on to smp_call_function() anymore, since that was removed. So kill it. Acked-by: Jeremy Fitzhardinge Reviewed-by: Paul E. 
McKenney Signed-off-by: Jens Axboe --- arch/alpha/kernel/process.c | 2 +- arch/alpha/kernel/smp.c | 4 ++-- arch/arm/kernel/smp.c | 6 +++--- arch/ia64/kernel/mca.c | 4 ++-- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/smp.c | 4 ++-- arch/mips/kernel/irq-rm9000.c | 4 ++-- arch/mips/kernel/smp.c | 4 ++-- arch/mips/oprofile/common.c | 6 +++--- arch/parisc/kernel/cache.c | 6 +++--- arch/parisc/kernel/smp.c | 2 +- arch/parisc/mm/init.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/tau_6xx.c | 4 ++-- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/mm/slice.c | 2 +- arch/powerpc/oprofile/common.c | 6 +++--- arch/s390/kernel/smp.c | 6 +++--- arch/s390/kernel/time.c | 2 +- arch/sh/kernel/smp.c | 4 ++-- arch/sparc64/mm/hugetlbpage.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++--- arch/x86/kernel/cpu/mcheck/non-fatal.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 ++-- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/nmi_32.c | 4 ++-- arch/x86/kernel/nmi_64.c | 4 ++-- arch/x86/kernel/tlb_32.c | 2 +- arch/x86/kernel/tlb_64.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 2 +- arch/x86/mm/pageattr.c | 4 ++-- arch/x86/oprofile/nmi_int.c | 10 +++++----- drivers/char/agp/generic.c | 2 +- drivers/lguest/x86/core.c | 4 ++-- fs/buffer.c | 2 +- include/linux/smp.h | 4 ++-- kernel/hrtimer.c | 2 +- kernel/profile.c | 6 +++--- kernel/rcupdate.c | 2 +- kernel/softirq.c | 2 +- mm/page_alloc.c | 2 +- mm/slab.c | 4 ++-- mm/slub.c | 2 +- net/iucv/iucv.c | 2 +- virt/kvm/kvm_main.c | 8 ++++---- 48 files changed, 84 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 96ed82fd9ee..351407e07e7 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -160,7 +160,7 @@ common_shutdown(int mode, char *restart_cmd) struct halt_info args; args.mode = mode; args.restart_cmd = restart_cmd; - on_each_cpu(common_shutdown_1, &args, 1, 0); + on_each_cpu(common_shutdown_1, &args, 0); } void diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 44114c8dbb2..83df541650f 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -657,7 +657,7 @@ void smp_imb(void) { /* Must wait other processors to flush their icache before continue. */ - if (on_each_cpu(ipi_imb, NULL, 1, 1)) + if (on_each_cpu(ipi_imb, NULL, 1)) printk(KERN_CRIT "smp_imb: timed out\n"); } EXPORT_SYMBOL(smp_imb); @@ -673,7 +673,7 @@ flush_tlb_all(void) { /* Although we don't have any data to pass, we do want to synchronize with the other processors. 
*/ - if (on_each_cpu(ipi_flush_tlb_all, NULL, 1, 1)) { + if (on_each_cpu(ipi_flush_tlb_all, NULL, 1)) { printk(KERN_CRIT "flush_tlb_all: timed out\n"); } } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 6344466b211..5a7c09564d1 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -604,7 +604,7 @@ static inline void ipi_flush_tlb_kernel_range(void *arg) void flush_tlb_all(void) { - on_each_cpu(ipi_flush_tlb_all, NULL, 1, 1); + on_each_cpu(ipi_flush_tlb_all, NULL, 1); } void flush_tlb_mm(struct mm_struct *mm) @@ -631,7 +631,7 @@ void flush_tlb_kernel_page(unsigned long kaddr) ta.ta_start = kaddr; - on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1, 1); + on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1); } void flush_tlb_range(struct vm_area_struct *vma, @@ -654,5 +654,5 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) ta.ta_start = start; ta.ta_end = end; - on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1, 1); + on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1); } diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 9cd818cc700..7dd96c12717 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -707,7 +707,7 @@ ia64_mca_cmc_vector_enable (void *dummy) static void ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0); } /* @@ -719,7 +719,7 @@ ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) static void ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0); } /* diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9baa48255c1..19d4493c619 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -6508,7 +6508,7 @@ pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) } /* save the current system wide pmu states */ - ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); goto cleanup_reserve; @@ -6553,7 +6553,7 @@ pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) pfm_alt_intr_handler = NULL; - ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 19152dcbf6e..3676468612b 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -285,7 +285,7 @@ smp_flush_tlb_cpumask(cpumask_t xcpumask) void smp_flush_tlb_all (void) { - on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1); + on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1); } void @@ -308,7 +308,7 @@ smp_flush_tlb_mm (struct mm_struct *mm) * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is * rather trivial. 
*/ - on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1); + on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1); } void arch_send_call_function_single_ipi(int cpu) diff --git a/arch/mips/kernel/irq-rm9000.c b/arch/mips/kernel/irq-rm9000.c index ed9febe63d7..b47e4615ec1 100644 --- a/arch/mips/kernel/irq-rm9000.c +++ b/arch/mips/kernel/irq-rm9000.c @@ -49,7 +49,7 @@ static void local_rm9k_perfcounter_irq_startup(void *args) static unsigned int rm9k_perfcounter_irq_startup(unsigned int irq) { - on_each_cpu(local_rm9k_perfcounter_irq_startup, (void *) irq, 0, 1); + on_each_cpu(local_rm9k_perfcounter_irq_startup, (void *) irq, 1); return 0; } @@ -66,7 +66,7 @@ static void local_rm9k_perfcounter_irq_shutdown(void *args) static void rm9k_perfcounter_irq_shutdown(unsigned int irq) { - on_each_cpu(local_rm9k_perfcounter_irq_shutdown, (void *) irq, 0, 1); + on_each_cpu(local_rm9k_perfcounter_irq_shutdown, (void *) irq, 1); } static struct irq_chip rm9k_irq_controller = { diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 7a9ae830be8..4410f172b8a 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -246,7 +246,7 @@ static void flush_tlb_all_ipi(void *info) void flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_ipi, NULL, 1, 1); + on_each_cpu(flush_tlb_all_ipi, NULL, 1); } static void flush_tlb_mm_ipi(void *mm) @@ -366,7 +366,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) .addr2 = end, }; - on_each_cpu(flush_tlb_kernel_range_ipi, &fd, 1, 1); + on_each_cpu(flush_tlb_kernel_range_ipi, &fd, 1); } static void flush_tlb_page_ipi(void *info) diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index b5f6f71b27b..dd2fbd6645c 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -27,7 +27,7 @@ static int op_mips_setup(void) model->reg_setup(ctr); /* Configure the registers on all cpus. */ - on_each_cpu(model->cpu_setup, NULL, 0, 1); + on_each_cpu(model->cpu_setup, NULL, 1); return 0; } @@ -58,7 +58,7 @@ static int op_mips_create_files(struct super_block * sb, struct dentry * root) static int op_mips_start(void) { - on_each_cpu(model->cpu_start, NULL, 0, 1); + on_each_cpu(model->cpu_start, NULL, 1); return 0; } @@ -66,7 +66,7 @@ static int op_mips_start(void) static void op_mips_stop(void) { /* Disable performance monitoring for all counters. 
*/ - on_each_cpu(model->cpu_stop, NULL, 0, 1); + on_each_cpu(model->cpu_stop, NULL, 1); } int __init oprofile_arch_init(struct oprofile_operations *ops) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index e10d25d2d9c..5259d8c2067 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -51,12 +51,12 @@ static struct pdc_btlb_info btlb_info __read_mostly; void flush_data_cache(void) { - on_each_cpu(flush_data_cache_local, NULL, 1, 1); + on_each_cpu(flush_data_cache_local, NULL, 1); } void flush_instruction_cache(void) { - on_each_cpu(flush_instruction_cache_local, NULL, 1, 1); + on_each_cpu(flush_instruction_cache_local, NULL, 1); } #endif @@ -515,7 +515,7 @@ static void cacheflush_h_tmp_function(void *dummy) void flush_cache_all(void) { - on_each_cpu(cacheflush_h_tmp_function, NULL, 1, 1); + on_each_cpu(cacheflush_h_tmp_function, NULL, 1); } void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 126105c76a4..d47f3975c9c 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -292,7 +292,7 @@ void arch_send_call_function_single_ipi(int cpu) void smp_flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_local, NULL, 1, 1); + on_each_cpu(flush_tlb_all_local, NULL, 1); } /* diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index ce0da689a89..b4d6c8777ed 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -1053,7 +1053,7 @@ void flush_tlb_all(void) do_recycle++; } spin_unlock(&sid_lock); - on_each_cpu(flush_tlb_all_local, NULL, 1, 1); + on_each_cpu(flush_tlb_all_local, NULL, 1); if (do_recycle) { spin_lock(&sid_lock); recycle_sids(recycle_ndirty,recycle_dirty_array); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 34843c31841..647f3e8677d 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -747,7 +747,7 @@ static int rtas_ibm_suspend_me(struct rtas_args *args) /* Call function on all CPUs. 
One of us will make the * rtas call */ - if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0)) + if (on_each_cpu(rtas_percpu_suspend_me, &data, 0)) data.error = -EINVAL; wait_for_completion(&done); diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 368a4934f7e..c3a56d65c5a 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -192,7 +192,7 @@ static void tau_timeout_smp(unsigned long unused) /* schedule ourselves to be run again */ mod_timer(&tau_timer, jiffies + shrink_timer) ; - on_each_cpu(tau_timeout, NULL, 1, 0); + on_each_cpu(tau_timeout, NULL, 0); } /* @@ -234,7 +234,7 @@ int __init TAU_init(void) tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); - on_each_cpu(TAU_init_smp, NULL, 1, 0); + on_each_cpu(TAU_init_smp, NULL, 0); printk("Thermal assist unit "); #ifdef CONFIG_TAU_INT diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 73401e83739..f1a38a6c1e2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -322,7 +322,7 @@ void snapshot_timebases(void) { if (!cpu_has_feature(CPU_FTR_PURR)) return; - on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1); + on_each_cpu(snapshot_tb_and_purr, NULL, 1); } /* diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ad928edafb0..2bd12d965db 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -218,7 +218,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz mb(); /* XXX this is sub-optimal but will do for now */ - on_each_cpu(slice_flush_segments, mm, 0, 1); + on_each_cpu(slice_flush_segments, mm, 1); #ifdef CONFIG_SPU_BASE spu_flush_all_slbs(mm); #endif diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 4908dc98f9c..17807acb05d 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -65,7 +65,7 @@ static int op_powerpc_setup(void) /* Configure the registers on all cpus. If an error occurs on one * of the cpus, op_per_cpu_rc will be set to the error */ - on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_setup, NULL, 1); out: if (op_per_cpu_rc) { /* error on setup release the performance counter hardware */ @@ -100,7 +100,7 @@ static int op_powerpc_start(void) if (model->global_start) return model->global_start(ctr); if (model->start) { - on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_start, NULL, 1); return op_per_cpu_rc; } return -EIO; /* No start function is defined for this @@ -115,7 +115,7 @@ static inline void op_powerpc_cpu_stop(void *dummy) static void op_powerpc_stop(void) { if (model->stop) - on_each_cpu(op_powerpc_cpu_stop, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_stop, NULL, 1); if (model->global_stop) model->global_stop(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 276b105fb2a..b6781030cfb 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -299,7 +299,7 @@ static void smp_ptlb_callback(void *info) void smp_ptlb_all(void) { - on_each_cpu(smp_ptlb_callback, NULL, 0, 1); + on_each_cpu(smp_ptlb_callback, NULL, 1); } EXPORT_SYMBOL(smp_ptlb_all); #endif /* ! 
CONFIG_64BIT */ @@ -347,7 +347,7 @@ void smp_ctl_set_bit(int cr, int bit) memset(&parms.orvals, 0, sizeof(parms.orvals)); memset(&parms.andvals, 0xff, sizeof(parms.andvals)); parms.orvals[cr] = 1 << bit; - on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1); + on_each_cpu(smp_ctl_bit_callback, &parms, 1); } EXPORT_SYMBOL(smp_ctl_set_bit); @@ -361,7 +361,7 @@ void smp_ctl_clear_bit(int cr, int bit) memset(&parms.orvals, 0, sizeof(parms.orvals)); memset(&parms.andvals, 0xff, sizeof(parms.andvals)); parms.andvals[cr] = ~(1L << bit); - on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1); + on_each_cpu(smp_ctl_bit_callback, &parms, 1); } EXPORT_SYMBOL(smp_ctl_clear_bit); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index bf7bf2c2236..6037ed2b747 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -909,7 +909,7 @@ static void etr_work_fn(struct work_struct *work) if (!eacr.ea) { /* Both ports offline. Reset everything. */ eacr.dp = eacr.es = eacr.sl = 0; - on_each_cpu(etr_disable_sync_clock, NULL, 0, 1); + on_each_cpu(etr_disable_sync_clock, NULL, 1); del_timer_sync(&etr_timer); etr_update_eacr(eacr); set_bit(ETR_FLAG_EACCES, &etr_flags); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 71781ba2675..60c50841143 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -197,7 +197,7 @@ static void flush_tlb_all_ipi(void *info) void flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_ipi, 0, 1, 1); + on_each_cpu(flush_tlb_all_ipi, 0, 1); } static void flush_tlb_mm_ipi(void *mm) @@ -284,7 +284,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) fd.addr1 = start; fd.addr2 = end; - on_each_cpu(flush_tlb_kernel_range_ipi, (void *)&fd, 1, 1); + on_each_cpu(flush_tlb_kernel_range_ipi, (void *)&fd, 1); } static void flush_tlb_page_ipi(void *info) diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index 6cfab2e4d34..ebefd2a1437 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -344,7 +344,7 @@ void hugetlb_prefault_arch_hook(struct mm_struct *mm) * also executing in this address space. */ mm->context.sparc64_ctx_val = ctx; - on_each_cpu(context_reload, mm, 0, 0); + on_each_cpu(context_reload, mm, 0); } spin_unlock(&ctx_alloc_lock); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c068ae..43b7cb59491 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -363,7 +363,7 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -612,7 +612,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, * Collect entries that were still getting written before the * synchronize. 
*/ - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -737,7 +737,7 @@ static void mce_restart(void) if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); + on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; if (next_interval) schedule_delayed_work(&mcheck_work, diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 00ccb6c14ec..cc1fccdd31e 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { - on_each_cpu(mce_checkregs, NULL, 1, 1); + on_each_cpu(mce_checkregs, NULL, 1); schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f9ae93adffe..58043f06d7e 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -180,7 +180,7 @@ void disable_lapic_nmi_watchdog(void) if (atomic_read(&nmi_active) <= 0) return; - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); wd_ops->unreserve(); BUG_ON(atomic_read(&nmi_active) != 0); @@ -202,7 +202,7 @@ void enable_lapic_nmi_watchdog(void) return; } - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); touch_nmi_watchdog(); } diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 4dc8600d9d2..720640ff36c 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1565,7 +1565,7 @@ void /*__init*/ print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void /*__init*/ print_PIC(void) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc52..4504c7f5001 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1146,7 +1146,7 @@ void __apicdebuginit print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void __apicdebuginit print_PIC(void) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 5562dab0bd2..11008e0857c 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -218,7 +218,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -232,7 +232,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 2f1e4f503c9..bbdcb17b3df 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -225,7 +225,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if (atomic_read(&nmi_active) && 
nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -239,7 +239,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index 9bb2363851a..fec1ecedc9b 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index a1f07d79320..184a367516d 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -270,5 +270,5 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0a03d57f9b3..0dcae19ed62 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,7 @@ static int __init vsyscall_init(void) #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2); #endif - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); + on_each_cpu(cpu_vsyscall_init, NULL, 1); hotcpu_notifier(cpu_vsyscall_notifier, 0); return 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5534fe59b5f..10ce6ee4c49 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2968,7 +2968,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 0, 1); + on_each_cpu(__vcpu_clear, vmx, 1); free_vmcs(vmx->vmcs); vmx->vmcs = NULL; } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 04f596eab74..abea08459a7 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1072,7 +1072,7 @@ static void do_flush_tlb_all(void *info) /* flush the TLB of every active CPU in the system */ void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, 0, 1, 1); + on_each_cpu(do_flush_tlb_all, 0, 1); } /* used to set up the trampoline for other CPUs when the memory manager diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b6a37..9b836ba9ded 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,7 +106,7 @@ static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); + on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_range(void *arg) @@ -127,7 +127,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled()); WARN_ON(PAGE_ALIGN(start) != start); - on_each_cpu(__cpa_flush_range, NULL, 1, 1); + on_each_cpu(__cpa_flush_range, NULL, 1); if (!cache) return; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cc48d3fde54..3238ad32ffd 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -218,8 +218,8 @@ static int nmi_setup(void) } } - on_each_cpu(nmi_save_registers, NULL, 0, 1); - on_each_cpu(nmi_cpu_setup, NULL, 0, 1); + on_each_cpu(nmi_save_registers, NULL, 1); + on_each_cpu(nmi_cpu_setup, 
NULL, 1); nmi_enabled = 1; return 0; } @@ -271,7 +271,7 @@ static void nmi_shutdown(void) { struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); nmi_enabled = 0; - on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); model->shutdown(msrs); free_msrs(); @@ -285,7 +285,7 @@ static void nmi_cpu_start(void *dummy) static int nmi_start(void) { - on_each_cpu(nmi_cpu_start, NULL, 0, 1); + on_each_cpu(nmi_cpu_start, NULL, 1); return 0; } @@ -297,7 +297,7 @@ static void nmi_cpu_stop(void *dummy) static void nmi_stop(void) { - on_each_cpu(nmi_cpu_stop, NULL, 0, 1); + on_each_cpu(nmi_cpu_stop, NULL, 1); } struct op_counter_config counter_config[OP_MAX_COUNTER]; diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 564daaa6c7d..eaa1a355bb3 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -1249,7 +1249,7 @@ static void ipi_handler(void *null) void global_cache_flush(void) { - if (on_each_cpu(ipi_handler, NULL, 1, 1) != 0) + if (on_each_cpu(ipi_handler, NULL, 1) != 0) panic(PFX "timed out waiting for the other CPUs!\n"); } EXPORT_SYMBOL(global_cache_flush); diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 2e554a4ab33..95dfda52b4f 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -478,7 +478,7 @@ void __init lguest_arch_host_init(void) cpu_had_pge = 1; /* adjust_pge is a helper function which sets or unsets the PGE * bit on its CPU, depending on the argument (0 == unset). */ - on_each_cpu(adjust_pge, (void *)0, 0, 1); + on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); } @@ -493,7 +493,7 @@ void __exit lguest_arch_host_fini(void) if (cpu_had_pge) { set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); /* adjust_pge's argument "1" means set PGE. 
*/ - on_each_cpu(adjust_pge, (void *)1, 0, 1); + on_each_cpu(adjust_pge, (void *)1, 1); } put_online_cpus(); } diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f01..5c23ef560d0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg) void invalidate_bh_lrus(void) { - on_each_cpu(invalidate_bh_lru, NULL, 1, 1); + on_each_cpu(invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); diff --git a/include/linux/smp.h b/include/linux/smp.h index 338cad1b954..55261101d09 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -89,7 +89,7 @@ static inline void init_call_single_data(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait); +int on_each_cpu(void (*func) (void *info), void *info, int wait); #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ #define MSG_ALL 0x8001 @@ -121,7 +121,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -#define on_each_cpu(func,info,retry,wait) \ +#define on_each_cpu(func,info,wait) \ ({ \ local_irq_disable(); \ func(info); \ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..50e8616d795 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -623,7 +623,7 @@ static void retrigger_next_event(void *arg) void clock_was_set(void) { /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 0, 1); + on_each_cpu(retrigger_next_event, NULL, 1); } /* diff --git a/kernel/profile.c b/kernel/profile.c index ae7ead82cbc..58926411eb2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -252,7 +252,7 @@ static void profile_flip_buffers(void) mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; for (i = 0; i < NR_PROFILE_HIT; ++i) { @@ -275,7 +275,7 @@ static void profile_discard_flip_buffers(void) mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); @@ -558,7 +558,7 @@ static int __init create_hash_tables(void) out_cleanup: prof_on = 0; smp_mb(); - on_each_cpu(profile_nop, NULL, 0, 1); + on_each_cpu(profile_nop, NULL, 1); for_each_online_cpu(cpu) { struct page *page; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16..6addab5e6d8 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -127,7 +127,7 @@ void rcu_barrier(void) * until all the callbacks are queued. 
*/ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, NULL, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); diff --git a/kernel/softirq.c b/kernel/softirq.c index d73afb4764e..c159fd09477 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -674,7 +674,7 @@ __init int spawn_ksoftirqd(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +int on_each_cpu(void (*func) (void *info), void *info, int wait) { int ret = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f552955a02..53242344a77 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -918,7 +918,7 @@ void drain_local_pages(void *arg) */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 0, 1); + on_each_cpu(drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION diff --git a/mm/slab.c b/mm/slab.c index 046607f05f3..0772abb412b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2454,7 +2454,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -3939,7 +3939,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + on_each_cpu(do_ccupdate_local, (void *)new, 1); check_irq_on(); cachep->batchcount = batchcount; diff --git a/mm/slub.c b/mm/slub.c index 0987d1cd943..44715eb70c0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1497,7 +1497,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { #ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); + on_each_cpu(flush_cpu_slab, s, 1); #else unsigned long flags; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 94d5a45c3a5..a178e27e7b1 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -545,7 +545,7 @@ out: */ static void iucv_disable(void) { - on_each_cpu(iucv_retrieve_cpu, NULL, 0, 1); + on_each_cpu(iucv_retrieve_cpu, NULL, 1); kfree(iucv_path_table); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ea1f595f8a8..d4eae6af073 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1286,7 +1286,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, * in vmx root mode. 
*/ printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); } return NOTIFY_OK; } @@ -1479,7 +1479,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, goto out_free_1; } - on_each_cpu(hardware_enable, NULL, 0, 1); + on_each_cpu(hardware_enable, NULL, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_2; @@ -1525,7 +1525,7 @@ out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); out_free_2: - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); out_free_1: kvm_arch_hardware_unsetup(); out_free_0: @@ -1547,7 +1547,7 @@ void kvm_exit(void) sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); kvm_exit_debug(); -- cgit v1.2.3-70-g09d2 From 6802e3400ff4549525930ee744030c36fce9cc73 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 21 May 2008 17:03:22 +0100 Subject: [GFS2] Clean up the glock core This patch implements a number of cleanups to the core of the GFS2 glock code. As a result a lot of code is removed. It looks like a really big change, but actually a large part of this patch is either removing or moving existing code. There are some new bits too though, such as the new run_queue() function which is considerably streamlined. Highlights of this patch include: o Fixes a cluster coherency bug during SH -> EX lock conversions o Removes the "glmutex" code in favour of a single bit lock o Removes the ->go_xmote_bh() for inodes since it was duplicating ->go_lock() o We now only use the ->lm_lock() function for both locks and unlocks (i.e. unlock is a lock with target mode LM_ST_UNLOCKED) o The fast path is considerably shortly, giving performance gains especially with lock_nolock o The glock_workqueue is now used for all the callbacks from the DLM which allows us to simplify the lock_dlm module (see following patch) o The way is now open to make further changes such as eliminating the two threads (gfs2_glockd and gfs2_scand) in favour of a more efficient scheme. This patch has undergone extensive testing with various test suites so it should be pretty stable by now. 
Signed-off-by: Steven Whitehouse Cc: Bob Peterson --- fs/gfs2/glock.c | 1655 +++++++++++++++++------------------------ fs/gfs2/glock.h | 9 +- fs/gfs2/glops.c | 70 +- fs/gfs2/incore.h | 35 +- fs/gfs2/locking/dlm/lock.c | 3 + fs/gfs2/locking/nolock/main.c | 2 + fs/gfs2/main.c | 2 - fs/gfs2/meta_io.c | 14 +- fs/gfs2/meta_io.h | 1 + fs/gfs2/ops_address.c | 25 +- fs/gfs2/ops_file.c | 8 +- fs/gfs2/recovery.c | 2 +- fs/gfs2/super.c | 3 +- 13 files changed, 758 insertions(+), 1071 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d636b3e80f5..519a54cc0b7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket { struct hlist_head hb_list; }; -struct glock_iter { - int hash; /* hash bucket index */ - struct gfs2_sbd *sdp; /* incore superblock */ - struct gfs2_glock *gl; /* current glock struct */ - struct seq_file *seq; /* sequence file for debugfs */ - char string[512]; /* scratch space */ +struct gfs2_glock_iter { + int hash; /* hash bucket index */ + struct gfs2_sbd *sdp; /* incore superblock */ + struct gfs2_glock *gl; /* current glock struct */ + char string[512]; /* scratch space */ }; typedef void (*glock_examiner) (struct gfs2_glock * gl); static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); -static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); -static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); -static void gfs2_glock_drop_th(struct gfs2_glock *gl); -static void run_queue(struct gfs2_glock *gl); +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; @@ -122,33 +120,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x) } #endif -/** - * relaxed_state_ok - is a requested lock compatible with the current lock mode? 
- * @actual: the current state of the lock - * @requested: the lock state that was requested by the caller - * @flags: the modifier flags passed in by the caller - * - * Returns: 1 if the locks are compatible, 0 otherwise - */ - -static inline int relaxed_state_ok(unsigned int actual, unsigned requested, - int flags) -{ - if (actual == requested) - return 1; - - if (flags & GL_EXACT) - return 0; - - if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) - return 1; - - if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY)) - return 1; - - return 0; -} - /** * gl_hash() - Turn glock number into hash bucket number * @lock: The glock number @@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) int gfs2_glock_put(struct gfs2_glock *gl) { int rv = 0; - struct gfs2_sbd *sdp = gl->gl_sbd; write_lock(gl_lock_addr(gl->gl_hash)); if (atomic_dec_and_test(&gl->gl_ref)) { hlist_del(&gl->gl_list); write_unlock(gl_lock_addr(gl->gl_hash)); - gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); - gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); - gfs2_assert(sdp, list_empty(&gl->gl_holders)); - gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); - gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); rv = 1; goto out; @@ -281,16 +249,382 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp, return gl; } +/** + * may_grant - check if its ok to grant a new lock + * @gl: The glock + * @gh: The lock request which we wish to grant + * + * Returns: true if its ok to grant the lock + */ + +static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) +{ + const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list); + if ((gh->gh_state == LM_ST_EXCLUSIVE || + gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) + return 0; + if (gl->gl_state == gh->gh_state) + return 1; + if (gh->gh_flags & GL_EXACT) + return 0; + if (gh->gh_state == LM_ST_SHARED && gl->gl_state == LM_ST_EXCLUSIVE) + return 1; + if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) + return 1; + return 0; +} + +static void gfs2_holder_wake(struct gfs2_holder *gh) +{ + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb__after_clear_bit(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); +} + +/** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock + * + * Returns: true if there is a blocked holder at the head of the list + */ + +static int do_promote(struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh, *tmp; + int ret; + +restart: + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (may_grant(gl, gh)) { + if (gh->gh_list.prev == &gl->gl_holders && + glops->go_lock) { + spin_unlock(&gl->gl_spin); + /* FIXME: eliminate this eventually */ + ret = glops->go_lock(gh); + spin_lock(&gl->gl_spin); + if (ret) { + gh->gh_error = ret; + list_del_init(&gh->gh_list); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + gfs2_holder_wake(gh); + goto restart; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + gfs2_holder_wake(gh); + continue; + } + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + break; + } + return 0; +} + +/** + * do_error - Something unexpected has happened during a lock request + 
* + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + gfs2_holder_wake(gh); + } +} + +/** + * find_first_waiter - find the first gh that's waiting for the glock + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state + * + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + if (held2) + gfs2_glock_hold(gl); + else + gfs2_glock_put(gl); + } + + gl->gl_state = new_state; + gl->gl_tchange = jiffies; +} + +static void gfs2_demote_wake(struct gfs2_glock *gl) +{ + gl->gl_demote_state = LM_ST_EXCLUSIVE; + clear_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); +} + +/** + * finish_xmote - The DLM has replied to one of our lock requests + * @gl: The glock + * @ret: The status from the DLM + * + */ + +static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh; + unsigned state = ret & LM_OUT_ST_MASK; + + spin_lock(&gl->gl_spin); + state_change(gl, state); + gh = find_first_waiter(gl); + + /* Demote to UN request arrived during demote to SH or DF */ + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_target = LM_ST_UNLOCKED; + + /* Check for state != intended state */ + if (unlikely(state != gl->gl_target)) { + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + /* move to back of queue and try next entry */ + if (ret & LM_OUT_CANCELED) { + if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + goto retry; + } + /* Some error or failed "try lock" - report it */ + if ((ret & LM_OUT_ERROR) || + (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { + gl->gl_target = gl->gl_state; + do_error(gl, ret); + goto out; + } + } + switch(state) { + /* Unlocked due to conversion deadlock, try again */ + case LM_ST_UNLOCKED: +retry: + do_xmote(gl, gh, gl->gl_target); + break; + /* Conversion fails, unlock and try again */ + case LM_ST_SHARED: + case LM_ST_DEFERRED: + do_xmote(gl, gh, LM_ST_UNLOCKED); + break; + default: /* Everything else */ + printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); + GLOCK_BUG_ON(gl, 1); + } + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); + return; + } + + /* Fast path - we got what we asked for */ + if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (state != LM_ST_UNLOCKED) { + if (glops->go_xmote_bh) { + int rv; + spin_unlock(&gl->gl_spin); + rv = glops->go_xmote_bh(gl, gh); + if (rv == -EAGAIN) + return; + 
spin_lock(&gl->gl_spin); + if (rv) { + do_error(gl, rv); + goto out; + } + } + do_promote(gl); + } +out: + clear_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); +} + +static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state, unsigned int req_state, + unsigned int flags) +{ + int ret = LM_OUT_ERROR; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, + req_state, flags); + return ret; +} + +/** + * do_xmote - Calls the DLM to change the state of a lock + * @gl: The lock state + * @gh: The holder (only for promotes) + * @target: The target lock state + * + */ + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int lck_flags = gh ? gh->gh_flags : 0; + int ret; + + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | + LM_FLAG_PRIORITY); + BUG_ON(gl->gl_state == target); + BUG_ON(gl->gl_state == gl->gl_target); + if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && + glops->go_inval) { + set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + do_error(gl, 0); /* Fail queued try locks */ + } + spin_unlock(&gl->gl_spin); + if (glops->go_xmote_th) + glops->go_xmote_th(gl); + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + + gfs2_glock_hold(gl); + if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED || + gl->gl_state == LM_ST_DEFERRED) && + !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + lck_flags |= LM_FLAG_TRY_1CB; + ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags); + + if (!(ret & LM_OUT_ASYNC)) { + finish_xmote(gl, ret); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } else { + GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC); + } + spin_lock(&gl->gl_spin); +} + +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question + * @nonblock: True if we must not block in run_queue + * + */ + +static void run_queue(struct gfs2_glock *gl, const int nonblock) +{ + struct gfs2_holder *gh = NULL; + + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + return; + + GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + + if (test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_demote_state != gl->gl_state) { + if (find_first_holder(gl)) + goto out; + if (nonblock) + goto out_sched; + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + gl->gl_target = gl->gl_demote_state; + } else { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (do_promote(gl) == 0) + goto out; + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + } + do_xmote(gl, gh, gl->gl_target); + return; + +out_sched: + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, 
&gl->gl_work, 0) == 0) + gfs2_glock_put(gl); +out: + clear_bit(GLF_LOCK, &gl->gl_flags); +} + static void glock_work_func(struct work_struct *work) { + unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + finish_xmote(gl, gl->gl_reply); spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) - set_bit(GLF_DEMOTE, &gl->gl_flags); - run_queue(gl); + if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) { + unsigned long holdtime, now = jiffies; + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; + set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); + } + run_queue(gl, 0); spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); + if (!delay || + queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) + gfs2_glock_put(gl); } static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, @@ -342,12 +676,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_name = name; atomic_set(&gl->gl_ref, 1); gl->gl_state = LM_ST_UNLOCKED; + gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_hash = hash; - gl->gl_owner_pid = NULL; - gl->gl_ip = 0; gl->gl_ops = glops; - gl->gl_req_gh = NULL; gl->gl_stamp = jiffies; gl->gl_tchange = jiffies; gl->gl_object = NULL; @@ -447,656 +779,77 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) gh->gh_ip = 0; } -static void gfs2_holder_wake(struct gfs2_holder *gh) -{ - clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb__after_clear_bit(); - wake_up_bit(&gh->gh_iflags, HIF_WAIT); -} - static int just_schedule(void *word) { schedule(); return 0; } -static void wait_on_holder(struct gfs2_holder *gh) -{ - might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); -} - -static void gfs2_demote_wake(struct gfs2_glock *gl) -{ - gl->gl_demote_state = LM_ST_EXCLUSIVE; - clear_bit(GLF_DEMOTE, &gl->gl_flags); - smp_mb__after_clear_bit(); - wake_up_bit(&gl->gl_flags, GLF_DEMOTE); -} - -static void wait_on_demote(struct gfs2_glock *gl) -{ - might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); -} - -/** - * rq_mutex - process a mutex request in the queue - * @gh: the glock holder - * - * Returns: 1 if the queue is blocked - */ - -static int rq_mutex(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - list_del_init(&gh->gh_list); - /* gh->gh_error never examined. */ - set_bit(GLF_LOCK, &gl->gl_flags); - clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb(); - wake_up_bit(&gh->gh_iflags, HIF_WAIT); - - return 1; -} - -/** - * rq_promote - process a promote request in the queue - * @gh: the glock holder - * - * Acquire a new inter-node lock, or change a lock state to more restrictive. 
- * - * Returns: 1 if the queue is blocked - */ - -static int rq_promote(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - - if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { - if (list_empty(&gl->gl_holders)) { - gl->gl_req_gh = gh; - set_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gh->gh_gl, gh); - spin_lock(&gl->gl_spin); - } - return 1; - } - - if (list_empty(&gl->gl_holders)) { - set_bit(HIF_FIRST, &gh->gh_iflags); - set_bit(GLF_LOCK, &gl->gl_flags); - } else { - struct gfs2_holder *next_gh; - if (gh->gh_state == LM_ST_EXCLUSIVE) - return 1; - next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, - gh_list); - if (next_gh->gh_state == LM_ST_EXCLUSIVE) - return 1; - } - - list_move_tail(&gh->gh_list, &gl->gl_holders); - gh->gh_error = 0; - set_bit(HIF_HOLDER, &gh->gh_iflags); - - gfs2_holder_wake(gh); - - return 0; -} - -/** - * rq_demote - process a demote request in the queue - * @gh: the glock holder - * - * Returns: 1 if the queue is blocked - */ - -static int rq_demote(struct gfs2_glock *gl) -{ - if (!list_empty(&gl->gl_holders)) - return 1; - - if (gl->gl_state == gl->gl_demote_state || - gl->gl_state == LM_ST_UNLOCKED) { - gfs2_demote_wake(gl); - return 0; - } - - set_bit(GLF_LOCK, &gl->gl_flags); - set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); - - if (gl->gl_demote_state == LM_ST_UNLOCKED || - gl->gl_state != LM_ST_EXCLUSIVE) { - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - } else { - spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gl, NULL); - } - - spin_lock(&gl->gl_spin); - clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); - - return 0; -} - -/** - * run_queue - process holder structures on a glock - * @gl: the glock - * - */ -static void run_queue(struct gfs2_glock *gl) -{ - struct gfs2_holder *gh; - int blocked = 1; - - for (;;) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) - break; - - if (!list_empty(&gl->gl_waiters1)) { - gh = list_entry(gl->gl_waiters1.next, - struct gfs2_holder, gh_list); - blocked = rq_mutex(gh); - } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { - blocked = rq_demote(gl); - if (test_bit(GLF_WAITERS2, &gl->gl_flags) && - !blocked) { - set_bit(GLF_DEMOTE, &gl->gl_flags); - gl->gl_demote_state = LM_ST_UNLOCKED; - } - clear_bit(GLF_WAITERS2, &gl->gl_flags); - } else if (!list_empty(&gl->gl_waiters3)) { - gh = list_entry(gl->gl_waiters3.next, - struct gfs2_holder, gh_list); - blocked = rq_promote(gh); - } else - break; - - if (blocked) - break; - } -} - -/** - * gfs2_glmutex_lock - acquire a local lock on a glock - * @gl: the glock - * - * Gives caller exclusive access to manipulate a glock structure. 
- */ - -static void gfs2_glmutex_lock(struct gfs2_glock *gl) -{ - spin_lock(&gl->gl_spin); - if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - struct gfs2_holder gh; - - gfs2_holder_init(gl, 0, 0, &gh); - set_bit(HIF_WAIT, &gh.gh_iflags); - list_add_tail(&gh.gh_list, &gl->gl_waiters1); - spin_unlock(&gl->gl_spin); - wait_on_holder(&gh); - gfs2_holder_uninit(&gh); - } else { - gl->gl_owner_pid = get_pid(task_pid(current)); - gl->gl_ip = (unsigned long)__builtin_return_address(0); - spin_unlock(&gl->gl_spin); - } -} - -/** - * gfs2_glmutex_trylock - try to acquire a local lock on a glock - * @gl: the glock - * - * Returns: 1 if the glock is acquired - */ - -static int gfs2_glmutex_trylock(struct gfs2_glock *gl) -{ - int acquired = 1; - - spin_lock(&gl->gl_spin); - if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - acquired = 0; - } else { - gl->gl_owner_pid = get_pid(task_pid(current)); - gl->gl_ip = (unsigned long)__builtin_return_address(0); - } - spin_unlock(&gl->gl_spin); - - return acquired; -} - -/** - * gfs2_glmutex_unlock - release a local lock on a glock - * @gl: the glock - * - */ - -static void gfs2_glmutex_unlock(struct gfs2_glock *gl) -{ - struct pid *pid; - - spin_lock(&gl->gl_spin); - clear_bit(GLF_LOCK, &gl->gl_flags); - pid = gl->gl_owner_pid; - gl->gl_owner_pid = NULL; - gl->gl_ip = 0; - run_queue(gl); - spin_unlock(&gl->gl_spin); - - put_pid(pid); -} - -/** - * handle_callback - process a demote request - * @gl: the glock - * @state: the state the caller wants us to change to - * - * There are only two requests that we are going to see in actual - * practise: LM_ST_SHARED and LM_ST_UNLOCKED - */ - -static void handle_callback(struct gfs2_glock *gl, unsigned int state, - int remote, unsigned long delay) -{ - int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; - - spin_lock(&gl->gl_spin); - set_bit(bit, &gl->gl_flags); - if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { - gl->gl_demote_state = state; - gl->gl_demote_time = jiffies; - if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && - gl->gl_object) { - gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); - return; - } - } else if (gl->gl_demote_state != LM_ST_UNLOCKED && - gl->gl_demote_state != state) { - if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) - set_bit(GLF_WAITERS2, &gl->gl_flags); - else - gl->gl_demote_state = LM_ST_UNLOCKED; - } - spin_unlock(&gl->gl_spin); -} - -/** - * state_change - record that the glock is now in a different state - * @gl: the glock - * @new_state the new state - * - */ - -static void state_change(struct gfs2_glock *gl, unsigned int new_state) -{ - int held1, held2; - - held1 = (gl->gl_state != LM_ST_UNLOCKED); - held2 = (new_state != LM_ST_UNLOCKED); - - if (held1 != held2) { - if (held2) - gfs2_glock_hold(gl); - else - gfs2_glock_put(gl); - } - - gl->gl_state = new_state; - gl->gl_tchange = jiffies; -} - -/** - * drop_bh - Called after a lock module unlock completes - * @gl: the glock - * @ret: the return status - * - * Doesn't wake up the process waiting on the struct gfs2_holder (if any) - * Doesn't drop the reference on the glock the top half took out - * - */ - -static void drop_bh(struct gfs2_glock *gl, unsigned int ret) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_holder *gh = gl->gl_req_gh; - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, !ret); - - state_change(gl, LM_ST_UNLOCKED); - - if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) { - 
spin_lock(&gl->gl_spin); - gh->gh_error = 0; - spin_unlock(&gl->gl_spin); - gfs2_glock_xmote_th(gl, gl->gl_req_gh); - gfs2_glock_put(gl); - return; - } - - spin_lock(&gl->gl_spin); - gfs2_demote_wake(gl); - clear_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_glock_put(gl); -} - -/** - * xmote_bh - Called after the lock module is done acquiring a lock - * @gl: The glock in question - * @ret: the int returned from the lock module - * - */ - -static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh = gl->gl_req_gh; - int op_done = 1; - - if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) { - drop_bh(gl, ret); - return; - } - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); - - state_change(gl, ret & LM_OUT_ST_MASK); - - /* Deal with each possible exit condition */ - - if (!gh) { - gl->gl_stamp = jiffies; - if (ret & LM_OUT_CANCELED) { - op_done = 0; - } else { - spin_lock(&gl->gl_spin); - if (gl->gl_state != gl->gl_demote_state) { - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - gfs2_glock_put(gl); - return; - } - gfs2_demote_wake(gl); - spin_unlock(&gl->gl_spin); - } - } else { - spin_lock(&gl->gl_spin); - if (ret & LM_OUT_CONV_DEADLK) { - gh->gh_error = 0; - set_bit(GLF_CONV_DEADLK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - gfs2_glock_drop_th(gl); - gfs2_glock_put(gl); - return; - } - list_del_init(&gh->gh_list); - gh->gh_error = -EIO; - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - goto out; - gh->gh_error = GLR_CANCELED; - if (ret & LM_OUT_CANCELED) - goto out; - if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { - list_add_tail(&gh->gh_list, &gl->gl_holders); - gh->gh_error = 0; - set_bit(HIF_HOLDER, &gh->gh_iflags); - set_bit(HIF_FIRST, &gh->gh_iflags); - op_done = 0; - goto out; - } - gh->gh_error = GLR_TRYFAILED; - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) - goto out; - gh->gh_error = -EINVAL; - if (gfs2_assert_withdraw(sdp, 0) == -1) - fs_err(sdp, "ret = 0x%.8X\n", ret); -out: - spin_unlock(&gl->gl_spin); - } - - if (glops->go_xmote_bh) - glops->go_xmote_bh(gl); - - if (op_done) { - spin_lock(&gl->gl_spin); - gl->gl_req_gh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - } - - gfs2_glock_put(gl); - - if (gh) - gfs2_holder_wake(gh); -} - -static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state, unsigned int req_state, - unsigned int flags) -{ - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, - req_state, flags); - return ret; -} - -/** - * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock - * @gl: The glock in question - * @state: the requested state - * @flags: modifier flags to the lock call - * - */ - -static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - int flags = gh ? gh->gh_flags : 0; - unsigned state = gh ? 
gh->gh_state : gl->gl_demote_state; - const struct gfs2_glock_operations *glops = gl->gl_ops; - int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | - LM_FLAG_NOEXP | LM_FLAG_ANY | - LM_FLAG_PRIORITY); - unsigned int lck_ret; - - if (glops->go_xmote_th) - glops->go_xmote_th(gl); - if (state == LM_ST_DEFERRED && glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); - gfs2_assert_warn(sdp, state != gl->gl_state); - - gfs2_glock_hold(gl); - - lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); - - if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR))) - return; - - if (lck_ret & LM_OUT_ASYNC) - gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC); - else - xmote_bh(gl, lck_ret); -} - -static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, - unsigned int cur_state) +static void wait_on_holder(struct gfs2_holder *gh) { - int ret = 0; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); - return ret; + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); } -/** - * gfs2_glock_drop_th - call into the lock module to unlock a lock - * @gl: the glock - * - */ - -static void gfs2_glock_drop_th(struct gfs2_glock *gl) +static void wait_on_demote(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - unsigned int ret; - - if (glops->go_xmote_th) - glops->go_xmote_th(gl); - if (glops->go_inval) - glops->go_inval(gl, DIO_METADATA); - - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); - gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); - - gfs2_glock_hold(gl); - - ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); - - if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR))) - return; - - if (!ret) - drop_bh(gl, ret); - else - gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC); + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); } /** - * do_cancels - cancel requests for locks stuck waiting on an expire flag - * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock + * handle_callback - process a demote request + * @gl: the glock + * @state: the state the caller wants us to change to * - * Don't cancel GL_NOCANCEL requests. + * There are only two requests that we are going to see in actual + * practise: LM_ST_SHARED and LM_ST_UNLOCKED */ -static void do_cancels(struct gfs2_holder *gh) +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + int remote, unsigned long delay) { - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - - spin_lock(&gl->gl_spin); + int bit = delay ? 
GLF_PENDING_DEMOTE : GLF_DEMOTE; - while (gl->gl_req_gh != gh && - !test_bit(HIF_HOLDER, &gh->gh_iflags) && - !list_empty(&gh->gh_list)) { - if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { - spin_unlock(&gl->gl_spin); - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); - msleep(100); - spin_lock(&gl->gl_spin); - } else { - spin_unlock(&gl->gl_spin); - msleep(100); - spin_lock(&gl->gl_spin); - } + set_bit(bit, &gl->gl_flags); + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { + gl->gl_demote_state = state; + gl->gl_demote_time = jiffies; + if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && + gl->gl_object) + gfs2_glock_schedule_for_reclaim(gl); + } else if (gl->gl_demote_state != LM_ST_UNLOCKED && + gl->gl_demote_state != state) { + gl->gl_demote_state = LM_ST_UNLOCKED; } - - spin_unlock(&gl->gl_spin); } /** - * glock_wait_internal - wait on a glock acquisition + * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * * Returns: 0 on success */ -static int glock_wait_internal(struct gfs2_holder *gh) +int gfs2_glock_wait(struct gfs2_holder *gh) { - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - const struct gfs2_glock_operations *glops = gl->gl_ops; - - if (test_bit(HIF_ABORTED, &gh->gh_iflags)) - return -EIO; - - if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - spin_lock(&gl->gl_spin); - if (gl->gl_req_gh != gh && - !test_bit(HIF_HOLDER, &gh->gh_iflags) && - !list_empty(&gh->gh_list)) { - list_del_init(&gh->gh_list); - gh->gh_error = GLR_TRYFAILED; - run_queue(gl); - spin_unlock(&gl->gl_spin); - return gh->gh_error; - } - spin_unlock(&gl->gl_spin); - } - - if (gh->gh_flags & LM_FLAG_PRIORITY) - do_cancels(gh); - wait_on_holder(gh); - if (gh->gh_error) - return gh->gh_error; - - gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags)); - gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state, - gh->gh_flags)); - - if (test_bit(HIF_FIRST, &gh->gh_iflags)) { - gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); - - if (glops->go_lock) { - gh->gh_error = glops->go_lock(gh); - if (gh->gh_error) { - spin_lock(&gl->gl_spin); - list_del_init(&gh->gh_list); - spin_unlock(&gl->gl_spin); - } - } - - spin_lock(&gl->gl_spin); - gl->gl_req_gh = NULL; - clear_bit(GLF_LOCK, &gl->gl_flags); - run_queue(gl); - spin_unlock(&gl->gl_spin); - } - return gh->gh_error; } -static inline struct gfs2_holder * -find_holder_by_owner(struct list_head *head, struct pid *pid) -{ - struct gfs2_holder *gh; - - list_for_each_entry(gh, head, gh_list) { - if (gh->gh_owner_pid == pid) - return gh; - } - - return NULL; -} - -static void print_dbg(struct glock_iter *gi, const char *fmt, ...) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { va_list args; va_start(args, fmt); - if (gi) { + if (seq) { + struct gfs2_glock_iter *gi = seq->private; vsprintf(gi->string, fmt, args); - seq_printf(gi->seq, gi->string); - } - else + seq_printf(seq, gi->string); + } else { + printk(KERN_ERR " "); vprintk(fmt, args); + } va_end(args); } @@ -1104,50 +857,75 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...) * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add * + * Eventually we should move the recursive locking trap to a + * debugging option or something like that. This is the fast + * path and needs to have the minimum number of distractions. 
+ * */ -static void add_to_queue(struct gfs2_holder *gh) +static inline void add_to_queue(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_holder *existing; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct list_head *insert_pt = NULL; + struct gfs2_holder *gh2; + int try_lock = 0; BUG_ON(gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) BUG(); - if (!(gh->gh_flags & GL_FLOCK)) { - existing = find_holder_by_owner(&gl->gl_holders, - gh->gh_owner_pid); - if (existing) { - print_symbol(KERN_WARNING "original: %s\n", - existing->gh_ip); - printk(KERN_INFO "pid : %d\n", - pid_nr(existing->gh_owner_pid)); - printk(KERN_INFO "lock type : %d lock state : %d\n", - existing->gh_gl->gl_name.ln_type, - existing->gh_gl->gl_state); - print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); - printk(KERN_INFO "pid : %d\n", - pid_nr(gh->gh_owner_pid)); - printk(KERN_INFO "lock type : %d lock state : %d\n", - gl->gl_name.ln_type, gl->gl_state); - BUG(); - } - - existing = find_holder_by_owner(&gl->gl_waiters3, - gh->gh_owner_pid); - if (existing) { - print_symbol(KERN_WARNING "original: %s\n", - existing->gh_ip); - print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); - BUG(); + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + try_lock = 1; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + goto trap_recursive; + if (try_lock && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) && + !may_grant(gl, gh)) { +fail: + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; } + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) + insert_pt = &gh2->gh_list; + } + if (likely(insert_pt == NULL)) { + list_add_tail(&gh->gh_list, &gl->gl_holders); + if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) + goto do_cancel; + return; + } + list_add_tail(&gh->gh_list, insert_pt); +do_cancel: + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { + spin_unlock(&gl->gl_spin); + sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); + spin_lock(&gl->gl_spin); } + return; - if (gh->gh_flags & LM_FLAG_PRIORITY) - list_add(&gh->gh_list, &gl->gl_waiters3); - else - list_add_tail(&gh->gh_list, &gl->gl_waiters3); +trap_recursive: + print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh2->gh_gl->gl_name.ln_type, gh2->gh_state); + print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); + printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); + printk(KERN_ERR "lock type: %d req lock state : %d\n", + gh->gh_gl->gl_name.ln_type, gh->gh_state); + __dump_glock(NULL, gl); + BUG(); } /** @@ -1165,24 +943,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_sbd; int error = 0; -restart: - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { - set_bit(HIF_ABORTED, &gh->gh_iflags); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) return -EIO; - } spin_lock(&gl->gl_spin); add_to_queue(gh); - run_queue(gl); + run_queue(gl, 1); spin_unlock(&gl->gl_spin); - if (!(gh->gh_flags & GL_ASYNC)) { - error = glock_wait_internal(gh); - if (error == GLR_CANCELED) { - msleep(100); - goto restart; - } - } + if 
(!(gh->gh_flags & GL_ASYNC)) + error = gfs2_glock_wait(gh); return error; } @@ -1196,48 +966,7 @@ restart: int gfs2_glock_poll(struct gfs2_holder *gh) { - struct gfs2_glock *gl = gh->gh_gl; - int ready = 0; - - spin_lock(&gl->gl_spin); - - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - ready = 1; - else if (list_empty(&gh->gh_list)) { - if (gh->gh_error == GLR_CANCELED) { - spin_unlock(&gl->gl_spin); - msleep(100); - if (gfs2_glock_nq(gh)) - return 1; - return 0; - } else - ready = 1; - } - - spin_unlock(&gl->gl_spin); - - return ready; -} - -/** - * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC - * @gh: the holder structure - * - * Returns: 0, GLR_TRYFAILED, or errno on failure - */ - -int gfs2_glock_wait(struct gfs2_holder *gh) -{ - int error; - - error = glock_wait_internal(gh); - if (error == GLR_CANCELED) { - msleep(100); - gh->gh_flags &= ~GL_ASYNC; - error = gfs2_glock_nq(gh); - } - - return error; + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } /** @@ -1251,26 +980,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned delay = 0; + int fast_path = 0; + spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_lock(gl); - - spin_lock(&gl->gl_spin); list_del_init(&gh->gh_list); - - if (list_empty(&gl->gl_holders)) { + if (find_first_holder(gl) == NULL) { if (glops->go_unlock) { + GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); spin_unlock(&gl->gl_spin); glops->go_unlock(gh); spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); } gl->gl_stamp = jiffies; + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; } - - clear_bit(GLF_LOCK, &gl->gl_flags); spin_unlock(&gl->gl_spin); + if (likely(fast_path)) + return; gfs2_glock_hold(gl); if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -1469,20 +1202,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl) { int error; - gfs2_glmutex_lock(gl); - if (!atomic_read(&gl->gl_lvb_count)) { error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); - if (error) { - gfs2_glmutex_unlock(gl); + if (error) return error; - } gfs2_glock_hold(gl); } atomic_inc(&gl->gl_lvb_count); - gfs2_glmutex_unlock(gl); - return 0; } @@ -1497,8 +1224,6 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; gfs2_glock_hold(gl); - gfs2_glmutex_lock(gl); - gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); if (atomic_dec_and_test(&gl->gl_lvb_count)) { if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) @@ -1506,8 +1231,6 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl) gl->gl_lvb = NULL; gfs2_glock_put(gl); } - - gfs2_glmutex_unlock(gl); gfs2_glock_put(gl); } @@ -1527,7 +1250,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, if (time_before(now, holdtime)) delay = holdtime - now; + spin_lock(&gl->gl_spin); handle_callback(gl, state, 1, delay); + spin_unlock(&gl->gl_spin); if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } @@ -1568,7 +1293,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) gl = gfs2_glock_find(sdp, &async->lc_name); if (gfs2_assert_warn(sdp, gl)) return; - xmote_bh(gl, async->lc_ret); + gl->gl_reply = async->lc_ret; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); 
up_read(&gfs2_umount_flush_sem); @@ -1646,6 +1372,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) void gfs2_reclaim_glock(struct gfs2_sbd *sdp) { struct gfs2_glock *gl; + int done_callback = 0; spin_lock(&sdp->sd_reclaim_lock); if (list_empty(&sdp->sd_reclaim_list)) { @@ -1660,14 +1387,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp) atomic_dec(&sdp->sd_reclaim_count); atomic_inc(&sdp->sd_reclaimed); - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_unlock(gl); + spin_lock(&gl->gl_spin); + if (find_first_holder(gl) == NULL && + gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { + handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + done_callback = 1; } - - gfs2_glock_put(gl); + spin_unlock(&gl->gl_spin); + if (!done_callback || + queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); } /** @@ -1724,18 +1453,14 @@ static void scan_glock(struct gfs2_glock *gl) { if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) return; + if (test_bit(GLF_LOCK, &gl->gl_flags)) + return; - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - goto out_schedule; - gfs2_glmutex_unlock(gl); - } - return; - -out_schedule: - gfs2_glmutex_unlock(gl); - gfs2_glock_schedule_for_reclaim(gl); + spin_lock(&gl->gl_spin); + if (find_first_holder(gl) == NULL && + gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) + gfs2_glock_schedule_for_reclaim(gl); + spin_unlock(&gl->gl_spin); } /** @@ -1760,12 +1485,13 @@ static void clear_glock(struct gfs2_glock *gl) spin_unlock(&sdp->sd_reclaim_lock); } - if (gfs2_glmutex_trylock(gl)) { - if (list_empty(&gl->gl_holders) && - gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - gfs2_glmutex_unlock(gl); - } + spin_lock(&gl->gl_spin); + if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + spin_unlock(&gl->gl_spin); + gfs2_glock_hold(gl); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); } /** @@ -1810,180 +1536,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) } } -/* - * Diagnostic routines to help debug distributed deadlock - */ - -static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt, - unsigned long address) +static const char *state2str(unsigned state) { - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - print_dbg(gi, fmt, buffer); + switch(state) { + case LM_ST_UNLOCKED: + return "UN"; + case LM_ST_SHARED: + return "SH"; + case LM_ST_DEFERRED: + return "DF"; + case LM_ST_EXCLUSIVE: + return "EX"; + } + return "??"; +} + +static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +{ + char *p = buf; + if (flags & LM_FLAG_TRY) + *p++ = 't'; + if (flags & LM_FLAG_TRY_1CB) + *p++ = 'T'; + if (flags & LM_FLAG_NOEXP) + *p++ = 'e'; + if (flags & LM_FLAG_ANY) + *p++ = 'a'; + if (flags & LM_FLAG_PRIORITY) + *p++ = 'p'; + if (flags & GL_ASYNC) + *p++ = 'a'; + if (flags & GL_EXACT) + *p++ = 'E'; + if (flags & GL_ATIME) + *p++ = 'a'; + if (flags & GL_NOCACHE) + *p++ = 'c'; + if (test_bit(HIF_HOLDER, &iflags)) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; + if (test_bit(HIF_FIRST, &iflags)) + *p++ = 'F'; + *p = 0; + return buf; } /** * dump_holder - print information about a glock holder - * @str: a string naming the type of holder + * @seq: the 
seq_file struct * @gh: the glock holder * * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct glock_iter *gi, char *str, - struct gfs2_holder *gh) +static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { - unsigned int x; - struct task_struct *gh_owner; + struct task_struct *gh_owner = NULL; + char buffer[KSYM_SYMBOL_LEN]; + char flags_buf[32]; - print_dbg(gi, " %s\n", str); - if (gh->gh_owner_pid) { - print_dbg(gi, " owner = %ld ", - (long)pid_nr(gh->gh_owner_pid)); + sprint_symbol(buffer, gh->gh_ip); + if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); - if (gh_owner) - print_dbg(gi, "(%s)\n", gh_owner->comm); - else - print_dbg(gi, "(ended)\n"); - } else - print_dbg(gi, " owner = -1\n"); - print_dbg(gi, " gh_state = %u\n", gh->gh_state); - print_dbg(gi, " gh_flags ="); - for (x = 0; x < 32; x++) - if (gh->gh_flags & (1 << x)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - print_dbg(gi, " error = %d\n", gh->gh_error); - print_dbg(gi, " gh_iflags ="); - for (x = 0; x < 32; x++) - if (test_bit(x, &gh->gh_iflags)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip); - + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? gh_owner->comm : "(ended)", buffer); return 0; } -/** - * dump_inode - print information about an inode - * @ip: the inode - * - * Returns: 0 on success, -ENOBUFS when we run out of space - */ - -static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) -{ - unsigned int x; - - print_dbg(gi, " Inode:\n"); - print_dbg(gi, " num = %llu/%llu\n", - (unsigned long long)ip->i_no_formal_ino, - (unsigned long long)ip->i_no_addr); - print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); - print_dbg(gi, " i_flags ="); - for (x = 0; x < 32; x++) - if (test_bit(x, &ip->i_flags)) - print_dbg(gi, " %u", x); - print_dbg(gi, " \n"); - return 0; +static const char *gflags2str(char *buf, const unsigned long *gflags) +{ + char *p = buf; + if (test_bit(GLF_LOCK, gflags)) + *p++ = 'l'; + if (test_bit(GLF_STICKY, gflags)) + *p++ = 's'; + if (test_bit(GLF_DEMOTE, gflags)) + *p++ = 'D'; + if (test_bit(GLF_PENDING_DEMOTE, gflags)) + *p++ = 'd'; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) + *p++ = 'p'; + if (test_bit(GLF_DIRTY, gflags)) + *p++ = 'y'; + if (test_bit(GLF_LFLUSH, gflags)) + *p++ = 'f'; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) + *p++ = 'i'; + if (test_bit(GLF_REPLY_PENDING, gflags)) + *p++ = 'r'; + *p = 0; + return buf; } /** - * dump_glock - print information about a glock + * __dump_glock - print information about a glock + * @seq: The seq_file struct * @gl: the glock - * @count: where we are in the buffer + * + * The file format is as follows: + * One line per object, capital letters are used to indicate objects + * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, + * other objects are indented by a single space and follow the glock to + * which they are related. Fields are indicated by lower case letters + * followed by a colon and the field value, except for strings which are in + * [] so that its possible to see if they are composed of spaces for + * example. The field's are n = number (id of the object), f = flags, + * t = type, s = state, r = refcount, e = error, p = pid. 
* * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) +static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { - struct gfs2_holder *gh; - unsigned int x; - int error = -ENOBUFS; - struct task_struct *gl_owner; + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned long long dtime; + const struct gfs2_holder *gh; + char gflags_buf[32]; + int error = 0; - spin_lock(&gl->gl_spin); + dtime = jiffies - gl->gl_demote_time; + dtime *= 1000000/HZ; /* demote time in uSec */ + if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) + dtime = 0; + gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n", + state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, &gl->gl_flags), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_lvb_count), + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_ref)); - print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); - print_dbg(gi, " gl_flags ="); - for (x = 0; x < 32; x++) { - if (test_bit(x, &gl->gl_flags)) - print_dbg(gi, " %u", x); - } - if (!test_bit(GLF_LOCK, &gl->gl_flags)) - print_dbg(gi, " (unlocked)"); - print_dbg(gi, " \n"); - print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref)); - print_dbg(gi, " gl_state = %u\n", gl->gl_state); - if (gl->gl_owner_pid) { - gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID); - if (gl_owner) - print_dbg(gi, " gl_owner = pid %d (%s)\n", - pid_nr(gl->gl_owner_pid), gl_owner->comm); - else - print_dbg(gi, " gl_owner = %d (ended)\n", - pid_nr(gl->gl_owner_pid)); - } else - print_dbg(gi, " gl_owner = -1\n"); - print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); - print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); - print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); - print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); - print_dbg(gi, " reclaim = %s\n", - (list_empty(&gl->gl_reclaim)) ? 
"no" : "yes"); - if (gl->gl_aspace) - print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace, - gl->gl_aspace->i_mapping->nrpages); - else - print_dbg(gi, " aspace = no\n"); - print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count)); - if (gl->gl_req_gh) { - error = dump_holder(gi, "Request", gl->gl_req_gh); - if (error) - goto out; - } list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(gi, "Holder", gh); + error = dump_holder(seq, gh); if (error) goto out; } - list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { - error = dump_holder(gi, "Waiter1", gh); - if (error) - goto out; - } - list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { - error = dump_holder(gi, "Waiter3", gh); - if (error) - goto out; - } - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { - print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n", - gl->gl_demote_state, (unsigned long long) - (jiffies - gl->gl_demote_time)*(1000000/HZ)); - } - if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { - if (!test_bit(GLF_LOCK, &gl->gl_flags) && - list_empty(&gl->gl_holders)) { - error = dump_inode(gi, gl->gl_object); - if (error) - goto out; - } else { - error = -ENOBUFS; - print_dbg(gi, " Inode: busy\n"); - } - } - - error = 0; - + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) + error = glops->go_dump(seq, gl); out: - spin_unlock(&gl->gl_spin); return error; } +static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = __dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); + return ret; +} + /** * gfs2_dump_lockstate - print out the current lockstate * @sdp: the filesystem @@ -2086,7 +1796,7 @@ void gfs2_glock_exit(void) module_param(scand_secs, uint, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); -static int gfs2_glock_iter_next(struct glock_iter *gi) +static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { struct gfs2_glock *gl; @@ -2104,7 +1814,7 @@ restart: gfs2_glock_put(gl); if (gl && gi->gl == NULL) gi->hash++; - while(gi->gl == NULL) { + while (gi->gl == NULL) { if (gi->hash >= GFS2_GL_HASH_SIZE) return 1; read_lock(gl_lock_addr(gi->hash)); @@ -2122,58 +1832,34 @@ restart: return 0; } -static void gfs2_glock_iter_free(struct glock_iter *gi) +static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi) { if (gi->gl) gfs2_glock_put(gi->gl); - kfree(gi); -} - -static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp) -{ - struct glock_iter *gi; - - gi = kmalloc(sizeof (*gi), GFP_KERNEL); - if (!gi) - return NULL; - - gi->sdp = sdp; - gi->hash = 0; - gi->seq = NULL; gi->gl = NULL; - memset(gi->string, 0, sizeof(gi->string)); - - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); - return NULL; - } - - return gi; } -static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos) +static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { - struct glock_iter *gi; + struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - gi = gfs2_glock_iter_init(file->private); - if (!gi) - return NULL; + gi->hash = 0; - while(n--) { + do { if (gfs2_glock_iter_next(gi)) { gfs2_glock_iter_free(gi); return NULL; } - } + } while (n--); - return gi; + return gi->gl; } -static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, +static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { - struct glock_iter *gi = iter_ptr; + struct gfs2_glock_iter *gi = seq->private; (*pos)++; @@ -2182,24 +1868,18 @@ static void 
*gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, return NULL; } - return gi; + return gi->gl; } -static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) +static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { - struct glock_iter *gi = iter_ptr; - if (gi) - gfs2_glock_iter_free(gi); + struct gfs2_glock_iter *gi = seq->private; + gfs2_glock_iter_free(gi); } -static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) +static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - struct glock_iter *gi = iter_ptr; - - gi->seq = file; - dump_glock(gi, gi->gl); - - return 0; + return dump_glock(seq, iter_ptr); } static const struct seq_operations gfs2_glock_seq_ops = { @@ -2211,17 +1891,14 @@ static const struct seq_operations gfs2_glock_seq_ops = { static int gfs2_debugfs_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int ret; - - ret = seq_open(file, &gfs2_glock_seq_ops); - if (ret) - return ret; - - seq = file->private_data; - seq->private = inode->i_private; - - return 0; + int ret = seq_open_private(file, &gfs2_glock_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; } static const struct file_operations gfs2_debug_fops = { @@ -2229,7 +1906,7 @@ static const struct file_operations gfs2_debug_fops = { .open = gfs2_debugfs_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release + .release = seq_release_private, }; int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index cdad3e6f815..7389f8ef0a3 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -26,11 +26,8 @@ #define GL_SKIP 0x00000100 #define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 -#define GL_FLOCK 0x00000800 -#define GL_NOCANCEL 0x00001000 #define GLR_TRYFAILED 13 -#define GLR_CANCELED 14 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { @@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * spin_lock(&gl->gl_spin); pid = task_pid(current); list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + break; if (gh->gh_owner_pid == pid) goto out; } @@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) { int ret; spin_lock(&gl->gl_spin); - ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3); + ret = test_bit(GLF_DEMOTE, &gl->gl_flags); spin_unlock(&gl->gl_spin); return ret; } @@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock @@ -130,7 +130,6 @@ int gfs2_lvb_hold(struct gfs2_glock *gl); void gfs2_lvb_unhold(struct gfs2_glock *gl); void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); - void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 07d84d16cda..c6c318c2a0f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -13,6 +13,7 @@ #include 
#include #include +#include #include "gfs2.h" #include "incore.h" @@ -171,26 +172,6 @@ static void inode_go_sync(struct gfs2_glock *gl) } } -/** - * inode_go_xmote_bh - After promoting/demoting a glock - * @gl: the glock - * - */ - -static void inode_go_xmote_bh(struct gfs2_glock *gl) -{ - struct gfs2_holder *gh = gl->gl_req_gh; - struct buffer_head *bh; - int error; - - if (gl->gl_state != LM_ST_UNLOCKED && - (!gh || !(gh->gh_flags & GL_SKIP))) { - error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh); - if (!error) - brelse(bh); - } -} - /** * inode_go_inval - prepare a inode glock to be released * @gl: the glock @@ -266,6 +247,26 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } +/** + * inode_go_dump - print information about an inode + * @seq: The iterator + * @ip: the inode + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_inode *ip = gl->gl_object; + if (ip == NULL) + return 0; + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + IF2DT(ip->i_inode.i_mode), ip->i_flags); + return 0; +} + /** * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock * @gl: the glock @@ -305,6 +306,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) gfs2_rgrp_bh_put(gh->gh_gl->gl_object); } +/** + * rgrp_go_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question + * + */ + +static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_rgrpd *rgd = gl->gl_object; + if (rgd == NULL) + return 0; + gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); + return 0; +} + /** * trans_go_sync - promote/demote the transaction glock * @gl: the glock @@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl) * */ -static void trans_go_xmote_bh(struct gfs2_glock *gl) +static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) { struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); @@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) struct gfs2_log_header_host head; int error; - if (gl->gl_state != LM_ST_UNLOCKED && - test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl) gfs2_log_pointers_init(sdp, head.lh_blkno); } } + return 0; } /** @@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = { const struct gfs2_glock_operations gfs2_inode_glops = { .go_xmote_th = inode_go_sync, - .go_xmote_bh = inode_go_xmote_bh, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, .go_lock = inode_go_lock, + .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_min_hold_time = HZ / 10, + .go_min_hold_time = HZ / 5, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { @@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, + .go_dump = rgrp_go_dump, .go_type = LM_TYPE_RGRP, - .go_min_hold_time = HZ / 10, + .go_min_hold_time = HZ / 5, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index eabe5eac41d..4b734c6e34f 100644 --- 
a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -128,20 +128,20 @@ struct gfs2_bufdata { struct gfs2_glock_operations { void (*go_xmote_th) (struct gfs2_glock *gl); - void (*go_xmote_bh) (struct gfs2_glock *gl); + int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); + int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); const int go_type; const unsigned long go_min_hold_time; }; enum { /* States */ - HIF_HOLDER = 6, + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_FIRST = 7, - HIF_ABORTED = 9, HIF_WAIT = 10, }; @@ -154,20 +154,20 @@ struct gfs2_holder { unsigned gh_flags; int gh_error; - unsigned long gh_iflags; + unsigned long gh_iflags; /* HIF_... */ unsigned long gh_ip; }; enum { - GLF_LOCK = 1, - GLF_STICKY = 2, - GLF_DEMOTE = 3, - GLF_PENDING_DEMOTE = 4, - GLF_DIRTY = 5, - GLF_DEMOTE_IN_PROGRESS = 6, - GLF_LFLUSH = 7, - GLF_WAITERS2 = 8, - GLF_CONV_DEADLK = 9, + GLF_LOCK = 1, + GLF_STICKY = 2, + GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, + GLF_DEMOTE_IN_PROGRESS = 5, + GLF_DIRTY = 6, + GLF_LFLUSH = 7, + GLF_INVALIDATE_IN_PROGRESS = 8, + GLF_REPLY_PENDING = 9, }; struct gfs2_glock { @@ -179,19 +179,14 @@ struct gfs2_glock { spinlock_t gl_spin; unsigned int gl_state; + unsigned int gl_target; + unsigned int gl_reply; unsigned int gl_hash; unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ - struct pid *gl_owner_pid; - unsigned long gl_ip; struct list_head gl_holders; - struct list_head gl_waiters1; /* HIF_MUTEX */ - struct list_head gl_waiters3; /* HIF_PROMOTE */ const struct gfs2_glock_operations *gl_ops; - - struct gfs2_holder *gl_req_gh; - void *gl_lock; char *gl_lvb; atomic_t gl_lvb_count; diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index cf7ea8abec8..fed9a67be0f 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -308,6 +308,9 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state, { struct gdlm_lock *lp = lock; + if (req_state == LM_ST_UNLOCKED) + return gdlm_unlock(lock, cur_state); + clear_bit(LFL_DLM_CANCEL, &lp->flags); if (flags & LM_FLAG_NOEXP) set_bit(LFL_NOBLOCK, &lp->flags); diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c index 284a5ece8d9..627bfb79bc8 100644 --- a/fs/gfs2/locking/nolock/main.c +++ b/fs/gfs2/locking/nolock/main.c @@ -107,6 +107,8 @@ static void nolock_put_lock(void *lock) static unsigned int nolock_lock(void *lock, unsigned int cur_state, unsigned int req_state, unsigned int flags) { + if (req_state == LM_ST_UNLOCKED) + return 0; return req_state | LM_OUT_CACHEABLE; } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 053e2ebbbd5..bcc668d0fad 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) INIT_HLIST_NODE(&gl->gl_list); spin_lock_init(&gl->gl_spin); INIT_LIST_HEAD(&gl->gl_holders); - INIT_LIST_HEAD(&gl->gl_waiters1); - INIT_LIST_HEAD(&gl->gl_waiters3); gl->gl_lvb = NULL; atomic_set(&gl->gl_lvb_count, 0); INIT_LIST_HEAD(&gl->gl_reclaim); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 78d75f892f8..09853620c95 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl) } /** - * getbuf - Get a buffer with a given address space 
+ * gfs2_getbuf - Get a buffer with a given address space * @gl: the glock * @blkno: the block number (filesystem scope) * @create: 1 if the buffer should be created @@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl) * Returns: the buffer */ -static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { struct address_space *mapping = gl->gl_aspace->i_mapping; struct gfs2_sbd *sdp = gl->gl_sbd; @@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh) struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) { struct buffer_head *bh; - bh = getbuf(gl, blkno, CREATE); + bh = gfs2_getbuf(gl, blkno, CREATE); meta_prep_new(bh); return bh; } @@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { - *bhp = getbuf(gl, blkno, CREATE); + *bhp = gfs2_getbuf(gl, blkno, CREATE); if (!buffer_uptodate(*bhp)) { ll_rw_block(READ_META, 1, bhp); if (flags & DIO_WAIT) { @@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) struct buffer_head *bh; while (blen) { - bh = getbuf(ip->i_gl, bstart, NO_CREATE); + bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); @@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (extlen > max_ra) extlen = max_ra; - first_bh = getbuf(gl, dblock, CREATE); + first_bh = gfs2_getbuf(gl, dblock, CREATE); if (buffer_uptodate(first_bh)) goto out; @@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) extlen--; while (extlen) { - bh = getbuf(gl, dblock, CREATE); + bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) ll_rw_block(READA, 1, &bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 73e3b1c76fe..b1a5f3674d4 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp); int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, int meta); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index f55394e57cb..2b556dd034b 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -507,26 +507,23 @@ static int __gfs2_readpage(void *file, struct page *page) static int gfs2_readpage(struct file *file, struct page *page) { struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_holder *gh; + struct gfs2_holder gh; int error; - gh = gfs2_glock_is_locked_by_me(ip->i_gl); - if (!gh) { - gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS); - if (!gh) - return -ENOBUFS; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); + error = gfs2_glock_nq_atime(&gh); + if (unlikely(error)) { unlock_page(page); - error = gfs2_glock_nq_atime(gh); - if (likely(error != 0)) - goto out; - return AOP_TRUNCATED_PAGE; + goto out; } error = __gfs2_readpage(file, page); - gfs2_glock_dq(gh); + gfs2_glock_dq(&gh); out: - gfs2_holder_uninit(gh); - kfree(gh); + gfs2_holder_uninit(&gh); + if (error == GLR_TRYFAILED) { + yield(); + 
return AOP_TRUNCATED_PAGE; + } return error; } diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e1b7d525a06..0ff512a1192 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -669,8 +669,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE - | GL_FLOCK; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; mutex_lock(&fp->f_fl_mutex); @@ -683,9 +682,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) gfs2_glock_dq_wait(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { - error = gfs2_glock_get(GFS2_SB(&ip->i_inode), - ip->i_no_addr, &gfs2_flock_glops, - CREATE, &gl); + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); if (error) goto out; gfs2_holder_init(gl, state, flags, fl_gh); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 2888e4b4b1c..fdd3f0f16d0 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -505,7 +505,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, LM_FLAG_NOEXP | LM_FLAG_PRIORITY | - GL_NOCANCEL | GL_NOCACHE, &t_gh); + GL_NOCACHE, &t_gh); if (error) goto fail_gunlock_ji; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7aeacbc65f3..12fe38fe498 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -941,8 +941,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, - LM_FLAG_PRIORITY | GL_NOCACHE, - t_gh); + GL_NOCACHE, t_gh); list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); -- cgit v1.2.3-70-g09d2 From f3c9d38a26be32abf9b8897e9e0afc7166c712dd Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 21 May 2008 17:21:42 +0100 Subject: [GFS2] Fix ordering bug in lock_dlm This looks like a lot of change, but in fact its not. Mostly its things moving from one file to another. The change is just that instead of queuing lock completions and callbacks from the DLM we now pass them directly to GFS2. This gives us a net loss of two list heads per glock (a fair saving in memory) plus a reduction in the latency of delivering the messages to GFS2, plus we now have one thread fewer as well. There was a bug where callbacks and completions could be delivered in the wrong order due to this unnecessary queuing which is fixed by this patch. 
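The ordering problem this patch describes can be sketched outside the kernel. The short C program below is illustrative only and is not GFS2 or lock_dlm code; the event/deliver_* names are invented for this example. It shows how sorting notifications onto per-type queues (one list for completions, one for blocking callbacks, as the old lock_dlm threads did) can hand a later blocking callback to the filesystem before an earlier completion, while dispatching each event directly from the callback that produced it preserves arrival order.

/* Illustrative sketch only -- not lock_dlm code. */
#include <stdio.h>

enum ev_type { EV_COMPLETE, EV_BLOCKING };

struct event {
	enum ev_type type;
	int seq;
};

/* Old model: events are split onto per-type lists and a worker drains
 * one list before the other, so the original interleaving is lost. */
static void deliver_queued(const struct event *ev, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (ev[i].type == EV_BLOCKING)
			printf("queued: blocking callback, seq %d\n", ev[i].seq);
	for (i = 0; i < n; i++)
		if (ev[i].type == EV_COMPLETE)
			printf("queued: completion,        seq %d\n", ev[i].seq);
}

/* New model: each event is handed straight to the filesystem from the
 * callback that produced it, so arrival order is preserved. */
static void deliver_direct(const struct event *ev, int n)
{
	int i;

	for (i = 0; i < n; i++)
		printf("direct: %s, seq %d\n",
		       ev[i].type == EV_COMPLETE ? "completion       "
						 : "blocking callback",
		       ev[i].seq);
}

int main(void)
{
	/* A completion arrives just before a blocking callback. */
	const struct event ev[] = {
		{ EV_COMPLETE, 1 },
		{ EV_BLOCKING, 2 },
	};

	deliver_queued(ev, 2);	/* prints seq 2 before seq 1 */
	deliver_direct(ev, 2);	/* prints seq 1 before seq 2 */
	return 0;
}

In the kernel the reordering came from two threads racing over separate lists rather than from a fixed drain order, but the visible effect is the same: a blocking callback could overtake the completion it logically followed, which is what calling process_complete()/process_blocking() directly from the ast/bast (as in the diff below) avoids.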
Signed-off-by: Steven Whitehouse Cc: Bob Peterson --- fs/gfs2/locking/dlm/lock.c | 353 +++++++++++++++++++++++++++++++++-------- fs/gfs2/locking/dlm/lock_dlm.h | 12 +- fs/gfs2/locking/dlm/mount.c | 4 - fs/gfs2/locking/dlm/thread.c | 324 +++---------------------------------- 4 files changed, 306 insertions(+), 387 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index fed9a67be0f..871ffc9578f 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -11,46 +11,63 @@ static char junk_lvb[GDLM_LVB_SIZE]; -static void queue_complete(struct gdlm_lock *lp) + +/* convert dlm lock-mode to gfs lock-state */ + +static s16 gdlm_make_lmstate(s16 dlmmode) { - struct gdlm_ls *ls = lp->ls; + switch (dlmmode) { + case DLM_LOCK_IV: + case DLM_LOCK_NL: + return LM_ST_UNLOCKED; + case DLM_LOCK_EX: + return LM_ST_EXCLUSIVE; + case DLM_LOCK_CW: + return LM_ST_DEFERRED; + case DLM_LOCK_PR: + return LM_ST_SHARED; + } + gdlm_assert(0, "unknown DLM mode %d", dlmmode); + return -1; +} - clear_bit(LFL_ACTIVE, &lp->flags); +/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm + thread gets to it. */ + +static void queue_submit(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; spin_lock(&ls->async_lock); - list_add_tail(&lp->clist, &ls->complete); + list_add_tail(&lp->delay_list, &ls->submit); spin_unlock(&ls->async_lock); wake_up(&ls->thread_wait); } -static inline void gdlm_ast(void *astarg) +static void wake_up_ast(struct gdlm_lock *lp) { - queue_complete(astarg); + clear_bit(LFL_AST_WAIT, &lp->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&lp->flags, LFL_AST_WAIT); } -static inline void gdlm_bast(void *astarg, int mode) +static void gdlm_delete_lp(struct gdlm_lock *lp) { - struct gdlm_lock *lp = astarg; struct gdlm_ls *ls = lp->ls; - if (!mode) { - printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - return; - } - spin_lock(&ls->async_lock); - if (!lp->bast_mode) { - list_add_tail(&lp->blist, &ls->blocking); - lp->bast_mode = mode; - } else if (lp->bast_mode < mode) - lp->bast_mode = mode; + if (!list_empty(&lp->delay_list)) + list_del_init(&lp->delay_list); + gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + list_del_init(&lp->all_list); + ls->all_locks_count--; spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); + + kfree(lp); } -void gdlm_queue_delayed(struct gdlm_lock *lp) +static void gdlm_queue_delayed(struct gdlm_lock *lp) { struct gdlm_ls *ls = lp->ls; @@ -59,6 +76,249 @@ void gdlm_queue_delayed(struct gdlm_lock *lp) spin_unlock(&ls->async_lock); } +static void process_complete(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + struct lm_async_cb acb; + s16 prev_mode = lp->cur; + + memset(&acb, 0, sizeof(acb)); + + if (lp->lksb.sb_status == -DLM_ECANCEL) { + log_info("complete dlm cancel %x,%llx flags %lx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { + if (lp->lksb.sb_status != -DLM_EUNLOCK) { + log_info("unlock sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + return; + } + + lp->cur = DLM_LOCK_IV; + lp->req = DLM_LOCK_IV; + 
lp->lksb.sb_lkid = 0; + + if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { + gdlm_delete_lp(lp); + return; + } + goto out; + } + + if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); + + if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { + if (lp->req == DLM_LOCK_PR) + lp->req = DLM_LOCK_CW; + else if (lp->req == DLM_LOCK_CW) + lp->req = DLM_LOCK_PR; + } + + /* + * A canceled lock request. The lock was just taken off the delayed + * list and was never even submitted to dlm. + */ + + if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { + log_info("complete internal cancel %x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + goto out; + } + + /* + * An error occured. + */ + + if (lp->lksb.sb_status) { + /* a "normal" error */ + if ((lp->lksb.sb_status == -EAGAIN) && + (lp->lkf & DLM_LKF_NOQUEUE)) { + lp->req = lp->cur; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + /* this could only happen with cancels I think */ + log_info("ast sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + if (lp->lksb.sb_status == -EDEADLOCK && + lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CONV_DEADLK; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } else + return; + } + + /* + * This is an AST for an EX->EX conversion for sync_lvb from GFS. + */ + + if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { + wake_up_ast(lp); + return; + } + + /* + * A lock has been demoted to NL because it initially completed during + * BLOCK_LOCKS. Now it must be requested in the originally requested + * mode. + */ + + if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { + gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + + lp->cur = DLM_LOCK_NL; + lp->req = lp->prev_req; + lp->prev_req = DLM_LOCK_IV; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + set_bit(LFL_NOCACHE, &lp->flags); + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags)) + gdlm_queue_delayed(lp); + else + queue_submit(lp); + return; + } + + /* + * A request is granted during dlm recovery. It may be granted + * because the locks of a failed node were cleared. In that case, + * there may be inconsistent data beneath this lock and we must wait + * for recovery to complete to use it. When gfs recovery is done this + * granted lock will be converted to NL and then reacquired in this + * granted state. + */ + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags) && + lp->req != DLM_LOCK_NL) { + + lp->cur = lp->req; + lp->prev_req = lp->req; + lp->req = DLM_LOCK_NL; + lp->lkf |= DLM_LKF_CONVERT; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + log_debug("rereq %x,%llx id %x %d,%d", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->lksb.sb_lkid, lp->cur, lp->req); + + set_bit(LFL_REREQUEST, &lp->flags); + queue_submit(lp); + return; + } + + /* + * DLM demoted the lock to NL before it was granted so GFS must be + * told it cannot cache data for this lock. 
+ */ + + if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) + set_bit(LFL_NOCACHE, &lp->flags); + +out: + /* + * This is an internal lock_dlm lock + */ + + if (test_bit(LFL_INLOCK, &lp->flags)) { + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + wake_up_ast(lp); + return; + } + + /* + * Normal completion of a lock request. Tell GFS it now has the lock. + */ + + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + + acb.lc_name = lp->lockname; + acb.lc_ret |= gdlm_make_lmstate(lp->cur); + + if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && + (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) + acb.lc_ret |= LM_OUT_CACHEABLE; + + ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); +} + +static void gdlm_ast(void *astarg) +{ + struct gdlm_lock *lp = astarg; + clear_bit(LFL_ACTIVE, &lp->flags); + process_complete(lp); +} + +static void process_blocking(struct gdlm_lock *lp, int bast_mode) +{ + struct gdlm_ls *ls = lp->ls; + unsigned int cb = 0; + + switch (gdlm_make_lmstate(bast_mode)) { + case LM_ST_EXCLUSIVE: + cb = LM_CB_NEED_E; + break; + case LM_ST_DEFERRED: + cb = LM_CB_NEED_D; + break; + case LM_ST_SHARED: + cb = LM_CB_NEED_S; + break; + default: + gdlm_assert(0, "unknown bast mode %u", bast_mode); + } + + ls->fscb(ls->sdp, cb, &lp->lockname); +} + + +static void gdlm_bast(void *astarg, int mode) +{ + struct gdlm_lock *lp = astarg; + + if (!mode) { + printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + return; + } + + process_blocking(lp, mode); +} + /* convert gfs lock-state to dlm lock-mode */ static s16 make_mode(s16 lmstate) @@ -77,24 +337,6 @@ static s16 make_mode(s16 lmstate) return -1; } -/* convert dlm lock-mode to gfs lock-state */ - -s16 gdlm_make_lmstate(s16 dlmmode) -{ - switch (dlmmode) { - case DLM_LOCK_IV: - case DLM_LOCK_NL: - return LM_ST_UNLOCKED; - case DLM_LOCK_EX: - return LM_ST_EXCLUSIVE; - case DLM_LOCK_CW: - return LM_ST_DEFERRED; - case DLM_LOCK_PR: - return LM_ST_SHARED; - } - gdlm_assert(0, "unknown DLM mode %d", dlmmode); - return -1; -} /* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. 
*/ @@ -173,10 +415,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, make_strname(name, &lp->strname); lp->ls = ls; lp->cur = DLM_LOCK_IV; - lp->lvb = NULL; - lp->hold_null = NULL; - INIT_LIST_HEAD(&lp->clist); - INIT_LIST_HEAD(&lp->blist); INIT_LIST_HEAD(&lp->delay_list); spin_lock(&ls->async_lock); @@ -188,26 +426,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, return 0; } -void gdlm_delete_lp(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - if (!list_empty(&lp->clist)) - list_del_init(&lp->clist); - if (!list_empty(&lp->blist)) - list_del_init(&lp->blist); - if (!list_empty(&lp->delay_list)) - list_del_init(&lp->delay_list); - gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - list_del_init(&lp->all_list); - ls->all_locks_count--; - spin_unlock(&ls->async_lock); - - kfree(lp); -} - int gdlm_get_lock(void *lockspace, struct lm_lockname *name, void **lockp) { @@ -261,7 +479,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp) if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { lp->lksb.sb_status = -EAGAIN; - queue_complete(lp); + gdlm_ast(lp); error = 0; } @@ -308,6 +526,9 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state, { struct gdlm_lock *lp = lock; + if (req_state == LM_ST_UNLOCKED) + return gdlm_unlock(lock, cur_state); + if (req_state == LM_ST_UNLOCKED) return gdlm_unlock(lock, cur_state); @@ -354,7 +575,7 @@ void gdlm_cancel(void *lock) if (delay_list) { set_bit(LFL_CANCEL, &lp->flags); set_bit(LFL_ACTIVE, &lp->flags); - queue_complete(lp); + gdlm_ast(lp); return; } diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index a243cf69c54..ad944c64eab 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -72,15 +72,12 @@ struct gdlm_ls { int recover_jid_done; int recover_jid_status; spinlock_t async_lock; - struct list_head complete; - struct list_head blocking; struct list_head delayed; struct list_head submit; struct list_head all_locks; u32 all_locks_count; wait_queue_head_t wait_control; - struct task_struct *thread1; - struct task_struct *thread2; + struct task_struct *thread; wait_queue_head_t thread_wait; unsigned long drop_time; int drop_locks_count; @@ -117,10 +114,6 @@ struct gdlm_lock { u32 lkf; /* dlm flags DLM_LKF_ */ unsigned long flags; /* lock_dlm flags LFL_ */ - int bast_mode; /* protected by async_lock */ - - struct list_head clist; /* complete */ - struct list_head blist; /* blocking */ struct list_head delay_list; /* delayed */ struct list_head all_list; /* all locks for the fs */ struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ @@ -159,11 +152,8 @@ void gdlm_release_threads(struct gdlm_ls *); /* lock.c */ -s16 gdlm_make_lmstate(s16); -void gdlm_queue_delayed(struct gdlm_lock *); void gdlm_submit_delayed(struct gdlm_ls *); int gdlm_release_all_locks(struct gdlm_ls *); -void gdlm_delete_lp(struct gdlm_lock *); unsigned int gdlm_do_lock(struct gdlm_lock *); int gdlm_get_lock(void *, struct lm_lockname *, void **); diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 470bdf650b5..0628520a445 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -28,15 +28,11 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, ls->sdp = sdp; ls->fsflags = flags; spin_lock_init(&ls->async_lock); - INIT_LIST_HEAD(&ls->complete); - INIT_LIST_HEAD(&ls->blocking); 
INIT_LIST_HEAD(&ls->delayed); INIT_LIST_HEAD(&ls->submit); INIT_LIST_HEAD(&ls->all_locks); init_waitqueue_head(&ls->thread_wait); init_waitqueue_head(&ls->wait_control); - ls->thread1 = NULL; - ls->thread2 = NULL; ls->drop_time = jiffies; ls->jid = -1; diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index e53db6fd28a..f30350abd62 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -9,255 +9,12 @@ #include "lock_dlm.h" -/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm - thread gets to it. */ - -static void queue_submit(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - - spin_lock(&ls->async_lock); - list_add_tail(&lp->delay_list, &ls->submit); - spin_unlock(&ls->async_lock); - wake_up(&ls->thread_wait); -} - -static void process_blocking(struct gdlm_lock *lp, int bast_mode) -{ - struct gdlm_ls *ls = lp->ls; - unsigned int cb = 0; - - switch (gdlm_make_lmstate(bast_mode)) { - case LM_ST_EXCLUSIVE: - cb = LM_CB_NEED_E; - break; - case LM_ST_DEFERRED: - cb = LM_CB_NEED_D; - break; - case LM_ST_SHARED: - cb = LM_CB_NEED_S; - break; - default: - gdlm_assert(0, "unknown bast mode %u", lp->bast_mode); - } - - ls->fscb(ls->sdp, cb, &lp->lockname); -} - -static void wake_up_ast(struct gdlm_lock *lp) -{ - clear_bit(LFL_AST_WAIT, &lp->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&lp->flags, LFL_AST_WAIT); -} - -static void process_complete(struct gdlm_lock *lp) -{ - struct gdlm_ls *ls = lp->ls; - struct lm_async_cb acb; - s16 prev_mode = lp->cur; - - memset(&acb, 0, sizeof(acb)); - - if (lp->lksb.sb_status == -DLM_ECANCEL) { - log_info("complete dlm cancel %x,%llx flags %lx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { - if (lp->lksb.sb_status != -DLM_EUNLOCK) { - log_info("unlock sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - return; - } - - lp->cur = DLM_LOCK_IV; - lp->req = DLM_LOCK_IV; - lp->lksb.sb_lkid = 0; - - if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { - gdlm_delete_lp(lp); - return; - } - goto out; - } - - if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) - memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); - - if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { - if (lp->req == DLM_LOCK_PR) - lp->req = DLM_LOCK_CW; - else if (lp->req == DLM_LOCK_CW) - lp->req = DLM_LOCK_PR; - } - - /* - * A canceled lock request. The lock was just taken off the delayed - * list and was never even submitted to dlm. - */ - - if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { - log_info("complete internal cancel %x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CANCELED; - goto out; - } - - /* - * An error occured. 
- */ - - if (lp->lksb.sb_status) { - /* a "normal" error */ - if ((lp->lksb.sb_status == -EAGAIN) && - (lp->lkf & DLM_LKF_NOQUEUE)) { - lp->req = lp->cur; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } - - /* this could only happen with cancels I think */ - log_info("ast sb_status %d %x,%llx flags %lx", - lp->lksb.sb_status, lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->flags); - if (lp->lksb.sb_status == -EDEADLOCK && - lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CONV_DEADLK; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } else - return; - } - - /* - * This is an AST for an EX->EX conversion for sync_lvb from GFS. - */ - - if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { - wake_up_ast(lp); - return; - } - - /* - * A lock has been demoted to NL because it initially completed during - * BLOCK_LOCKS. Now it must be requested in the originally requested - * mode. - */ - - if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { - gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - - lp->cur = DLM_LOCK_NL; - lp->req = lp->prev_req; - lp->prev_req = DLM_LOCK_IV; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - set_bit(LFL_NOCACHE, &lp->flags); - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags)) - gdlm_queue_delayed(lp); - else - queue_submit(lp); - return; - } - - /* - * A request is granted during dlm recovery. It may be granted - * because the locks of a failed node were cleared. In that case, - * there may be inconsistent data beneath this lock and we must wait - * for recovery to complete to use it. When gfs recovery is done this - * granted lock will be converted to NL and then reacquired in this - * granted state. - */ - - if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && - !test_bit(LFL_NOBLOCK, &lp->flags) && - lp->req != DLM_LOCK_NL) { - - lp->cur = lp->req; - lp->prev_req = lp->req; - lp->req = DLM_LOCK_NL; - lp->lkf |= DLM_LKF_CONVERT; - lp->lkf &= ~DLM_LKF_CONVDEADLK; - - log_debug("rereq %x,%llx id %x %d,%d", - lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number, - lp->lksb.sb_lkid, lp->cur, lp->req); - - set_bit(LFL_REREQUEST, &lp->flags); - queue_submit(lp); - return; - } - - /* - * DLM demoted the lock to NL before it was granted so GFS must be - * told it cannot cache data for this lock. - */ - - if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) - set_bit(LFL_NOCACHE, &lp->flags); - -out: - /* - * This is an internal lock_dlm lock - */ - - if (test_bit(LFL_INLOCK, &lp->flags)) { - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - wake_up_ast(lp); - return; - } - - /* - * Normal completion of a lock request. Tell GFS it now has the lock. 
- */ - - clear_bit(LFL_NOBLOCK, &lp->flags); - lp->cur = lp->req; - - acb.lc_name = lp->lockname; - acb.lc_ret |= gdlm_make_lmstate(lp->cur); - - if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && - (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) - acb.lc_ret |= LM_OUT_CACHEABLE; - - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); -} - -static inline int no_work(struct gdlm_ls *ls, int blocking) +static inline int no_work(struct gdlm_ls *ls) { int ret; spin_lock(&ls->async_lock); - ret = list_empty(&ls->complete) && list_empty(&ls->submit); - if (ret && blocking) - ret = list_empty(&ls->blocking); + ret = list_empty(&ls->submit); spin_unlock(&ls->async_lock); return ret; @@ -276,100 +33,55 @@ static inline int check_drop(struct gdlm_ls *ls) return 0; } -static int gdlm_thread(void *data, int blist) +static int gdlm_thread(void *data) { struct gdlm_ls *ls = (struct gdlm_ls *) data; struct gdlm_lock *lp = NULL; - uint8_t complete, blocking, submit, drop; - - /* Only thread1 is allowed to do blocking callbacks since gfs - may wait for a completion callback within a blocking cb. */ while (!kthread_should_stop()) { wait_event_interruptible(ls->thread_wait, - !no_work(ls, blist) || kthread_should_stop()); - - complete = blocking = submit = drop = 0; + !no_work(ls) || kthread_should_stop()); spin_lock(&ls->async_lock); - if (blist && !list_empty(&ls->blocking)) { - lp = list_entry(ls->blocking.next, struct gdlm_lock, - blist); - list_del_init(&lp->blist); - blocking = lp->bast_mode; - lp->bast_mode = 0; - } else if (!list_empty(&ls->complete)) { - lp = list_entry(ls->complete.next, struct gdlm_lock, - clist); - list_del_init(&lp->clist); - complete = 1; - } else if (!list_empty(&ls->submit)) { + if (!list_empty(&ls->submit)) { lp = list_entry(ls->submit.next, struct gdlm_lock, delay_list); list_del_init(&lp->delay_list); - submit = 1; - } - - drop = check_drop(ls); - spin_unlock(&ls->async_lock); - - if (complete) - process_complete(lp); - - else if (blocking) - process_blocking(lp, blocking); - - else if (submit) + spin_unlock(&ls->async_lock); gdlm_do_lock(lp); - - if (drop) + spin_lock(&ls->async_lock); + } + /* Does this ever happen these days? I hope not anyway */ + if (check_drop(ls)) { + spin_unlock(&ls->async_lock); ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); - - schedule(); + spin_lock(&ls->async_lock); + } + spin_unlock(&ls->async_lock); } return 0; } -static int gdlm_thread1(void *data) -{ - return gdlm_thread(data, 1); -} - -static int gdlm_thread2(void *data) -{ - return gdlm_thread(data, 0); -} - int gdlm_init_threads(struct gdlm_ls *ls) { struct task_struct *p; int error; - p = kthread_run(gdlm_thread1, ls, "lock_dlm1"); - error = IS_ERR(p); - if (error) { - log_error("can't start lock_dlm1 thread %d", error); - return error; - } - ls->thread1 = p; - - p = kthread_run(gdlm_thread2, ls, "lock_dlm2"); + p = kthread_run(gdlm_thread, ls, "lock_dlm"); error = IS_ERR(p); if (error) { - log_error("can't start lock_dlm2 thread %d", error); - kthread_stop(ls->thread1); + log_error("can't start lock_dlm thread %d", error); return error; } - ls->thread2 = p; + ls->thread = p; return 0; } void gdlm_release_threads(struct gdlm_ls *ls) { - kthread_stop(ls->thread1); - kthread_stop(ls->thread2); + kthread_stop(ls->thread); } -- cgit v1.2.3-70-g09d2 From 048bca223739368aa5b9ce7cfb1d576c32d66cc7 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 23 May 2008 14:46:04 +0100 Subject: [GFS2] No lock_nolock This patch merges the lock_nolock module into GFS2 itself. 
As well as removing some of the overhead of the module, it also means that its now impossible to build GFS2 without a lock module (which would be a pointless thing to do anyway). We also plan to merge lock_dlm into GFS2 in the future, but that is a more tricky task, and will therefore be a separate patch. Signed-off-by: Steven Whitehouse Cc: David Teigland --- fs/gfs2/Kconfig | 18 +-- fs/gfs2/Makefile | 1 - fs/gfs2/glock.c | 15 ++- fs/gfs2/locking.c | 52 ++++++++- fs/gfs2/locking/nolock/Makefile | 3 - fs/gfs2/locking/nolock/main.c | 240 ---------------------------------------- fs/gfs2/ops_fstype.c | 5 +- fs/gfs2/recovery.c | 3 + 8 files changed, 71 insertions(+), 266 deletions(-) delete mode 100644 fs/gfs2/locking/nolock/Makefile delete mode 100644 fs/gfs2/locking/nolock/main.c (limited to 'fs') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 7f7947e3dfb..ab2f57e3fb8 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -14,23 +14,11 @@ config GFS2_FS GFS is perfect consistency -- changes made to the filesystem on one machine show up immediately on all other machines in the cluster. - To use the GFS2 filesystem, you will need to enable one or more of - the below locking modules. Documentation and utilities for GFS2 can + To use the GFS2 filesystem in a cluster, you will need to enable + the locking module below. Documentation and utilities for GFS2 can be found here: http://sources.redhat.com/cluster -config GFS2_FS_LOCKING_NOLOCK - tristate "GFS2 \"nolock\" locking module" - depends on GFS2_FS - help - Single node locking module for GFS2. - - Use this module if you want to use GFS2 on a single node without - its clustering features. You can still take advantage of the - large file support, and upgrade to running a full cluster later on - if required. - - If you will only be using GFS2 in cluster mode, you do not need this - module. + The "nolock" lock module is now built in to GFS2 by default. config GFS2_FS_LOCKING_DLM tristate "GFS2 DLM locking module" diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index e2350df02a0..ec65851ec80 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o -obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 519a54cc0b7..be7ed503f01 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -153,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct inode *aspace = gl->gl_aspace; - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (sdp->sd_lockstruct.ls_ops->lm_put_lock) sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); if (aspace) @@ -488,6 +488,10 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, unsigned int flags) { int ret = LM_OUT_ERROR; + + if (!sdp->sd_lockstruct.ls_ops->lm_lock) + return req_state == LM_ST_UNLOCKED ? 
0 : req_state; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, req_state, flags); @@ -631,6 +635,8 @@ static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, void **lockp) { int error = -EIO; + if (!sdp->sd_lockstruct.ls_ops->lm_get_lock) + return 0; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) error = sdp->sd_lockstruct.ls_ops->lm_get_lock( sdp->sd_lockstruct.ls_lockspace, name, lockp); @@ -910,7 +916,8 @@ do_cancel: gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { spin_unlock(&gl->gl_spin); - sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock); spin_lock(&gl->gl_spin); } return; @@ -1187,6 +1194,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) { int error = -EIO; + if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb) + return 0; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); return error; @@ -1226,7 +1235,7 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl) gfs2_glock_hold(gl); gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); if (atomic_dec_and_test(&gl->gl_lvb_count)) { - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb) sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); gl->gl_lvb = NULL; gfs2_glock_put(gl); diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c index 663fee72878..a4a367aa5cc 100644 --- a/fs/gfs2/locking.c +++ b/fs/gfs2/locking.c @@ -23,12 +23,54 @@ struct lmh_wrapper { const struct lm_lockops *lw_ops; }; +static int nolock_mount(char *table_name, char *host_data, + lm_callback_t cb, void *cb_data, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj); + /* List of registered low-level locking protocols. A file system selects one of them by name at mount time, e.g. lock_nolock, lock_dlm. 
*/ +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_mount = nolock_mount, +}; + +static struct lmh_wrapper nolock_proto = { + .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list), + .lw_ops = &nolock_ops, +}; + static LIST_HEAD(lmh_list); static DEFINE_MUTEX(lmh_lock); +static int nolock_mount(char *table_name, char *host_data, + lm_callback_t cb, void *cb_data, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + char *c; + unsigned int jid; + + c = strstr(host_data, "jid="); + if (!c) + jid = 0; + else { + c += 4; + sscanf(c, "%u", &jid); + } + + lockstruct->ls_jid = jid; + lockstruct->ls_first = 1; + lockstruct->ls_lvb_size = min_lvb_size; + lockstruct->ls_ops = &nolock_ops; + lockstruct->ls_flags = LM_LSFLAG_LOCAL; + + return 0; +} + /** * gfs2_register_lockproto - Register a low-level locking protocol * @proto: the protocol definition @@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, int try = 0; int error, found; + retry: mutex_lock(&lmh_lock); + if (list_empty(&nolock_proto.lw_list)) + list_add(&lmh_list, &nolock_proto.lw_list); + found = 0; list_for_each_entry(lw, &lmh_list, lw_list) { if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { @@ -139,7 +185,8 @@ retry: goto out; } - if (!try_module_get(lw->lw_ops->lm_owner)) { + if (lw->lw_ops->lm_owner && + !try_module_get(lw->lw_ops->lm_owner)) { try = 0; mutex_unlock(&lmh_lock); msleep(1000); @@ -158,7 +205,8 @@ out: void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) { mutex_lock(&lmh_lock); - lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); + if (lockstruct->ls_ops->lm_unmount) + lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); if (lockstruct->ls_ops->lm_owner) module_put(lockstruct->ls_ops->lm_owner); mutex_unlock(&lmh_lock); diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile deleted file mode 100644 index 35e9730bc3a..00000000000 --- a/fs/gfs2/locking/nolock/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o -lock_nolock-y := main.o - diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c deleted file mode 100644 index 627bfb79bc8..00000000000 --- a/fs/gfs2/locking/nolock/main.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#include -#include -#include -#include -#include -#include - -struct nolock_lockspace { - unsigned int nl_lvb_size; -}; - -static const struct lm_lockops nolock_ops; - -static int nolock_mount(char *table_name, char *host_data, - lm_callback_t cb, void *cb_data, - unsigned int min_lvb_size, int flags, - struct lm_lockstruct *lockstruct, - struct kobject *fskobj) -{ - char *c; - unsigned int jid; - struct nolock_lockspace *nl; - - c = strstr(host_data, "jid="); - if (!c) - jid = 0; - else { - c += 4; - sscanf(c, "%u", &jid); - } - - nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); - if (!nl) - return -ENOMEM; - - nl->nl_lvb_size = min_lvb_size; - - lockstruct->ls_jid = jid; - lockstruct->ls_first = 1; - lockstruct->ls_lvb_size = min_lvb_size; - lockstruct->ls_lockspace = nl; - lockstruct->ls_ops = &nolock_ops; - lockstruct->ls_flags = LM_LSFLAG_LOCAL; - - return 0; -} - -static void nolock_others_may_mount(void *lockspace) -{ -} - -static void nolock_unmount(void *lockspace) -{ - struct nolock_lockspace *nl = lockspace; - kfree(nl); -} - -static void nolock_withdraw(void *lockspace) -{ -} - -/** - * nolock_get_lock - get a lm_lock_t given a descripton of the lock - * @lockspace: the lockspace the lock lives in - * @name: the name of the lock - * @lockp: return the lm_lock_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_get_lock(void *lockspace, struct lm_lockname *name, - void **lockp) -{ - *lockp = lockspace; - return 0; -} - -/** - * nolock_put_lock - get rid of a lock structure - * @lock: the lock to throw away - * - */ - -static void nolock_put_lock(void *lock) -{ -} - -/** - * nolock_lock - acquire a lock - * @lock: the lock to manipulate - * @cur_state: the current state - * @req_state: the requested state - * @flags: modifier flags - * - * Returns: A bitmap of LM_OUT_* - */ - -static unsigned int nolock_lock(void *lock, unsigned int cur_state, - unsigned int req_state, unsigned int flags) -{ - if (req_state == LM_ST_UNLOCKED) - return 0; - return req_state | LM_OUT_CACHEABLE; -} - -/** - * nolock_unlock - unlock a lock - * @lock: the lock to manipulate - * @cur_state: the current state - * - * Returns: 0 - */ - -static unsigned int nolock_unlock(void *lock, unsigned int cur_state) -{ - return 0; -} - -static void nolock_cancel(void *lock) -{ -} - -/** - * nolock_hold_lvb - hold on to a lock value block - * @lock: the lock the LVB is associated with - * @lvbp: return the lm_lvb_t here - * - * Returns: 0 on success, -EXXX on failure - */ - -static int nolock_hold_lvb(void *lock, char **lvbp) -{ - struct nolock_lockspace *nl = lock; - int error = 0; - - *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS); - if (!*lvbp) - error = -ENOMEM; - - return error; -} - -/** - * nolock_unhold_lvb - release a LVB - * @lock: the lock the LVB is associated with - * @lvb: the lock value block - * - */ - -static void nolock_unhold_lvb(void *lock, char *lvb) -{ - kfree(lvb); -} - -static int nolock_plock_get(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - posix_test_lock(file, fl); - - return 0; -} - -static int nolock_plock(void *lockspace, struct lm_lockname *name, - struct file *file, int cmd, struct file_lock *fl) -{ - int error; - error = posix_lock_file_wait(file, fl); - return error; -} - -static int nolock_punlock(void *lockspace, struct lm_lockname *name, - struct file *file, struct file_lock *fl) -{ - int error; - error = posix_lock_file_wait(file, fl); - return error; -} - -static void 
nolock_recovery_done(void *lockspace, unsigned int jid, - unsigned int message) -{ -} - -static const struct lm_lockops nolock_ops = { - .lm_proto_name = "lock_nolock", - .lm_mount = nolock_mount, - .lm_others_may_mount = nolock_others_may_mount, - .lm_unmount = nolock_unmount, - .lm_withdraw = nolock_withdraw, - .lm_get_lock = nolock_get_lock, - .lm_put_lock = nolock_put_lock, - .lm_lock = nolock_lock, - .lm_unlock = nolock_unlock, - .lm_cancel = nolock_cancel, - .lm_hold_lvb = nolock_hold_lvb, - .lm_unhold_lvb = nolock_unhold_lvb, - .lm_plock_get = nolock_plock_get, - .lm_plock = nolock_plock, - .lm_punlock = nolock_punlock, - .lm_recovery_done = nolock_recovery_done, - .lm_owner = THIS_MODULE, -}; - -static int __init init_nolock(void) -{ - int error; - - error = gfs2_register_lockproto(&nolock_ops); - if (error) { - printk(KERN_WARNING - "lock_nolock: can't register protocol: %d\n", error); - return error; - } - - printk(KERN_INFO - "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__); - return 0; -} - -static void __exit exit_nolock(void) -{ - gfs2_unregister_lockproto(&nolock_ops); -} - -module_init(init_nolock); -module_exit(exit_nolock); - -MODULE_DESCRIPTION("GFS Nolock Locking Module"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b2028c82e8d..9bd97c5543b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -364,6 +364,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp) static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) { + if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount) + return; if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) sdp->sd_lockstruct.ls_ops->lm_others_may_mount( sdp->sd_lockstruct.ls_lockspace); @@ -741,8 +743,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) goto out; } - if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || - gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || + if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= GFS2_MIN_LVB_SIZE)) { gfs2_unmount_lockproto(&sdp->sd_lockstruct); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index fdd3f0f16d0..d5e91f4f6a0 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, unsigned int message) { + if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done) + return; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) sdp->sd_lockstruct.ls_ops->lm_recovery_done( sdp->sd_lockstruct.ls_lockspace, jid, message); -- cgit v1.2.3-70-g09d2 From 2d81afb87972013b43b055b4711dc75fdeeb9ba8 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 29 May 2008 18:27:51 -0700 Subject: [GFS2] trivial sparse lock annotations Annotate the &sdp->sd_log_lock. 
Signed-off-by: Harvey Harrison Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 2 ++ fs/gfs2/log.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 548264b1836..6c6af9f5e3a 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) */ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +__releases(&sdp->sd_log_lock) +__acquires(&sdp->sd_log_lock) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 77115281650..7c64510ccfd 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -21,6 +21,7 @@ */ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +__acquires(&sdp->sd_log_lock) { spin_lock(&sdp->sd_log_lock); } @@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) */ static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +__releases(&sdp->sd_log_lock) { spin_unlock(&sdp->sd_log_lock); } -- cgit v1.2.3-70-g09d2 From 80274737220f8c5ea75696dde4c5c7feba39456f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 2 Jun 2008 09:08:47 +0100 Subject: [GFS2] Fix ordering of args for list_add The patch to remove lock_nolock managed to get the arguments of this list_add backwards. This fixes it. Signed-off-by: Steven Whitehouse --- fs/gfs2/locking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c index a4a367aa5cc..523243a13a2 100644 --- a/fs/gfs2/locking.c +++ b/fs/gfs2/locking.c @@ -163,7 +163,7 @@ retry: mutex_lock(&lmh_lock); if (list_empty(&nolock_proto.lw_list)) - list_add(&lmh_list, &nolock_proto.lw_list); + list_add(&nolock_proto.lw_list, &lmh_list); found = 0; list_for_each_entry(lw, &lmh_list, lw_list) { -- cgit v1.2.3-70-g09d2 From 01b7c7ae88a6376c508b35a22bb61e04cb1b37f0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 2 Jun 2008 09:14:54 +0100 Subject: [GFS2] Revise readpage locking The previous attempt to fix the locking in readpage failed due to the use of a "try lock" which resulted in occasional high cpu usage during testing (due to repeated tries) and also it did not resolve all the ordering problems wrt the transaction lock (although it did solve all the inode lock ordering problems). This patch avoids the problem by unlocking the page and getting the locks in the correct order. This means that we have to retest the page to ensure that it hasn't changed when we relock the page. This now passes the tests which were previously failing. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_address.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 2b556dd034b..e64a1b04117 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -499,31 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page) * @file: The file to read * @page: The page of the file * - * This deals with the locking required. We use a trylock in order to - * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE - * in the event that we are unable to get the lock. + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. 
*/ static int gfs2_readpage(struct file *file, struct page *page) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_holder gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); error = gfs2_glock_nq_atime(&gh); - if (unlikely(error)) { - unlock_page(page); + if (unlikely(error)) goto out; - } - error = __gfs2_readpage(file, page); + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else + unlock_page(page); gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (error == GLR_TRYFAILED) { - yield(); - return AOP_TRUNCATED_PAGE; - } + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); return error; } -- cgit v1.2.3-70-g09d2 From 9171f5a991e7613cbee816874ad8c9515dcab50f Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 9 Jun 2008 12:08:23 -0500 Subject: [GFS2] kernel panic mounting volume This patch fixes Red Hat bugzilla bug 450156. This started with a not-too-improbable mount failure because the locking protocol was never set back to its proper "lock_dlm" after the system was rebooted in the middle of a gfs2_fsck. That left a (purposely) invalid locking protocol in the superblock, which caused an error when the file system was mounted the next time. When there's an error mounting, vfs calls DQUOT_OFF, which calls vfs_quota_off which calls gfs2_sync_fs. Next, gfs2_sync_fs calls gfs2_log_flush passing s_fs_info. But due to the error, s_fs_info had been previously set to NULL, and so we have the kernel oops. My solution in this patch is to test for the NULL value before passing it. I tested this patch and it fixes the problem. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 0b7cc920eb8..66907922109 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb) static int gfs2_sync_fs(struct super_block *sb, int wait) { sb->s_dirt = 0; - if (wait) + if (wait && sb->s_fs_info) gfs2_log_flush(sb->s_fs_info, NULL); return 0; } -- cgit v1.2.3-70-g09d2 From 1bdad606338debc6384b2844f1b53cc436b3ac90 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 3 Jun 2008 14:09:53 +0100 Subject: [GFS2] Remove remote lock dropping code There are several reasons why this is undesirable: 1. It never happens during normal operation anyway 2. If it does happen it causes performance to be very, very poor 3. It isn't likely to solve the original problem (memory shortage on remote DLM node) it was supposed to solve 4. It uses a bunch of arbitrary constants which are unlikely to be correct for any particular situation and for which the tuning seems to be a black art. 5. In an N node cluster, only 1/N of the dropped locked will actually contribute to solving the problem on average. So all in all we are better off without it. This also makes merging the lock_dlm module into GFS2 a bit easier. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/gfs2.h | 5 ----- fs/gfs2/glock.c | 12 +++--------- fs/gfs2/glock.h | 2 +- fs/gfs2/locking/dlm/lock_dlm.h | 3 --- fs/gfs2/locking/dlm/mount.c | 3 --- fs/gfs2/locking/dlm/sysfs.c | 13 ------------- fs/gfs2/locking/dlm/thread.c | 19 ------------------- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/ops_super.c | 2 +- fs/gfs2/sys.c | 14 -------------- include/linux/lm_interface.h | 4 ---- 11 files changed, 6 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h index 3bb11c0f8b5..ef606e3a5cf 100644 --- a/fs/gfs2/gfs2.h +++ b/fs/gfs2/gfs2.h @@ -15,11 +15,6 @@ enum { CREATE = 1, }; -enum { - NO_WAIT = 0, - WAIT = 1, -}; - enum { NO_FORCE = 0, FORCE = 1, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index be7ed503f01..8d5450f3c3e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1316,11 +1316,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) wake_up_process(sdp->sd_recoverd_process); return; - case LM_CB_DROPLOCKS: - gfs2_gl_hash_clear(sdp, NO_WAIT); - gfs2_quota_scan(sdp); - return; - default: gfs2_assert_warn(sdp, 0); return; @@ -1508,11 +1503,10 @@ static void clear_glock(struct gfs2_glock *gl) * @sdp: the filesystem * @wait: wait until it's all gone * - * Called when unmounting the filesystem, or when inter-node lock manager - * requests DROPLOCKS because it is running out of capacity. + * Called when unmounting the filesystem. */ -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { unsigned long t; unsigned int x; @@ -1527,7 +1521,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) cont = 1; } - if (!wait || !cont) + if (!cont) break; if (time_after_eq(jiffies, diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 7389f8ef0a3..971d92af70f 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -132,7 +132,7 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl); void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index ad944c64eab..845a27fd303 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -79,9 +79,6 @@ struct gdlm_ls { wait_queue_head_t wait_control; struct task_struct *thread; wait_queue_head_t thread_wait; - unsigned long drop_time; - int drop_locks_count; - int drop_locks_period; }; enum { diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 0628520a445..fa31c54c2e6 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -22,8 +22,6 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, if (!ls) return NULL; - ls->drop_locks_count = GDLM_DROP_COUNT; - ls->drop_locks_period = GDLM_DROP_PERIOD; ls->fscb = cb; ls->sdp = sdp; ls->fsflags = flags; @@ -33,7 +31,6 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, INIT_LIST_HEAD(&ls->all_locks); init_waitqueue_head(&ls->thread_wait); init_waitqueue_head(&ls->wait_control); - ls->drop_time = jiffies; ls->jid = -1; strncpy(buf, table_name, 256); diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index a4ff271df9e..4ec571c3d8a 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ 
b/fs/gfs2/locking/dlm/sysfs.c @@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) return sprintf(buf, "%d\n", ls->recover_jid_status); } -static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->drop_locks_count); -} - -static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->drop_locks_count = simple_strtol(buf, NULL, 0); - return len; -} - struct gdlm_attr { struct attribute attr; ssize_t (*show)(struct gdlm_ls *, char *); @@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0644, recover_show, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); -GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); static struct attribute *gdlm_attrs[] = { &gdlm_attr_proto_name.attr, @@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = { &gdlm_attr_recover.attr, &gdlm_attr_recover_done.attr, &gdlm_attr_recover_status.attr, - &gdlm_attr_drop_count.attr, NULL, }; diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index f30350abd62..38823efd698 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -20,19 +20,6 @@ static inline int no_work(struct gdlm_ls *ls) return ret; } -static inline int check_drop(struct gdlm_ls *ls) -{ - if (!ls->drop_locks_count) - return 0; - - if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { - ls->drop_time = jiffies; - if (ls->all_locks_count >= ls->drop_locks_count) - return 1; - } - return 0; -} - static int gdlm_thread(void *data) { struct gdlm_ls *ls = (struct gdlm_ls *) data; @@ -52,12 +39,6 @@ static int gdlm_thread(void *data) gdlm_do_lock(lp); spin_lock(&ls->async_lock); } - /* Does this ever happen these days? 
I hope not anyway */ - if (check_drop(ls)) { - spin_unlock(&ls->async_lock); - ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); - spin_lock(&ls->async_lock); - } spin_unlock(&ls->async_lock); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9bd97c5543b..6ba69dd1a72 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -874,7 +874,7 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); while (invalidate_inodes(sb)) yield(); diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 66907922109..f66ea0f7a35 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb) gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9ab9fc85ecd..6f7e2e5858e 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, return len; } -static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (simple_strtol(buf, NULL, 0) != 1) - return -EINVAL; - - gfs2_gl_hash_clear(sdp, NO_WAIT); - return len; -} - static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { @@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) GFS2_ATTR(id, 0444, id_show, NULL); GFS2_ATTR(fsname, 0444, fsname_show, NULL); GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); -GFS2_ATTR(shrink, 0200, NULL, shrink_store); GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); @@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, &gfs2_attr_fsname.attr, &gfs2_attr_freeze.attr, - &gfs2_attr_shrink.attr, &gfs2_attr_withdraw.attr, &gfs2_attr_statfs_sync.attr, &gfs2_attr_quota_sync.attr, diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h index f274997bc28..d0a7112b971 100644 --- a/include/linux/lm_interface.h +++ b/include/linux/lm_interface.h @@ -138,9 +138,6 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); * LM_CB_NEED_RECOVERY * The given journal needs to be recovered. * - * LM_CB_DROPLOCKS - * Reduce the number of cached locks. - * * LM_CB_ASYNC * The given lock has been granted. */ @@ -149,7 +146,6 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); #define LM_CB_NEED_D 258 #define LM_CB_NEED_S 259 #define LM_CB_NEED_RECOVERY 260 -#define LM_CB_DROPLOCKS 261 #define LM_CB_ASYNC 262 /* -- cgit v1.2.3-70-g09d2 From b2cad26cfc2091050574a460b304ed103a35dbda Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 3 Jun 2008 14:34:14 +0100 Subject: [GFS2] Remove obsolete conversion deadlock avoidance code This is only used by GFS1 so can be removed. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/locking/dlm/lock.c | 23 +---------------------- include/linux/lm_interface.h | 2 -- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index 871ffc9578f..894df4567a0 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -80,7 +80,6 @@ static void process_complete(struct gdlm_lock *lp) { struct gdlm_ls *ls = lp->ls; struct lm_async_cb acb; - s16 prev_mode = lp->cur; memset(&acb, 0, sizeof(acb)); @@ -160,15 +159,7 @@ static void process_complete(struct gdlm_lock *lp) lp->lksb.sb_status, lp->lockname.ln_type, (unsigned long long)lp->lockname.ln_number, lp->flags); - if (lp->lksb.sb_status == -EDEADLOCK && - lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CONV_DEADLK; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } else - return; + return; } /* @@ -268,10 +259,6 @@ out: acb.lc_name = lp->lockname; acb.lc_ret |= gdlm_make_lmstate(lp->cur); - if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && - (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) - acb.lc_ret |= LM_OUT_CACHEABLE; - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); } @@ -376,14 +363,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp, if (lp->lksb.sb_lkid != 0) { lkf |= DLM_LKF_CONVERT; - - /* Conversion deadlock avoidance by DLM */ - - if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) && - !test_bit(LFL_FORCE_PROMOTE, &lp->flags) && - !(lkf & DLM_LKF_NOQUEUE) && - cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) - lkf |= DLM_LKF_CONVDEADLK; } if (lp->lvb) diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h index d0a7112b971..2ed8fa1b762 100644 --- a/include/linux/lm_interface.h +++ b/include/linux/lm_interface.h @@ -122,11 +122,9 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); */ #define LM_OUT_ST_MASK 0x00000003 -#define LM_OUT_CACHEABLE 0x00000004 #define LM_OUT_CANCELED 0x00000008 #define LM_OUT_ASYNC 0x00000080 #define LM_OUT_ERROR 0x00000100 -#define LM_OUT_CONV_DEADLK 0x00000200 /* * lm_callback_t types -- cgit v1.2.3-70-g09d2 From 31fcba00fe7145527b159f8893ec6c9cc61309fd Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 4 Jun 2008 15:06:21 +0100 Subject: [GFS2] Remove all_list from lock_dlm I discovered that we had a list onto which every lock_dlm lock was being put. Its only function was to discover whether we'd got any locks left after umount. Since there was already a counter for that purpose as well, I removed the list. The saving is sizeof(struct list_head) per glock - well worth having. 
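(For reference, the per-lock bookkeeping that survives this change is just a counter under ls->async_lock. The two helpers below are hypothetical, written only to condense the gdlm_create_lp()/gdlm_delete_lp() hunks that follow; in the patch itself the counter updates stay inline.)

/* Illustrative sketch only -- not part of the patch. */
static void gdlm_lock_added(struct gdlm_ls *ls)
{
	spin_lock(&ls->async_lock);
	/* list_add(&lp->all_list, &ls->all_locks) is gone; only the count stays */
	ls->all_locks_count++;
	spin_unlock(&ls->async_lock);
}

static void gdlm_lock_removed(struct gdlm_ls *ls)
{
	spin_lock(&ls->async_lock);
	/* list_del_init(&lp->all_list) is gone as well */
	ls->all_locks_count--;
	spin_unlock(&ls->async_lock);
}

/* At unmount a leaked lock now shows up as a non-zero count,
 * BUG_ON(ls->all_locks_count), rather than as entries left on a list. */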
Signed-off-by: Steven Whitehouse --- fs/gfs2/locking/dlm/lock.c | 23 ----------------------- fs/gfs2/locking/dlm/lock_dlm.h | 2 -- fs/gfs2/locking/dlm/mount.c | 6 +----- 3 files changed, 1 insertion(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index 894df4567a0..2482c904750 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -58,9 +58,6 @@ static void gdlm_delete_lp(struct gdlm_lock *lp) spin_lock(&ls->async_lock); if (!list_empty(&lp->delay_list)) list_del_init(&lp->delay_list); - gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, - (unsigned long long)lp->lockname.ln_number); - list_del_init(&lp->all_list); ls->all_locks_count--; spin_unlock(&ls->async_lock); @@ -397,7 +394,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, INIT_LIST_HEAD(&lp->delay_list); spin_lock(&ls->async_lock); - list_add(&lp->all_list, &ls->all_locks); ls->all_locks_count++; spin_unlock(&ls->async_lock); @@ -710,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls) wake_up(&ls->thread_wait); } -int gdlm_release_all_locks(struct gdlm_ls *ls) -{ - struct gdlm_lock *lp, *safe; - int count = 0; - - spin_lock(&ls->async_lock); - list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) { - list_del_init(&lp->all_list); - - if (lp->lvb && lp->lvb != junk_lvb) - kfree(lp->lvb); - kfree(lp); - count++; - } - spin_unlock(&ls->async_lock); - - return count; -} - diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index 845a27fd303..21cf46617d9 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -74,7 +74,6 @@ struct gdlm_ls { spinlock_t async_lock; struct list_head delayed; struct list_head submit; - struct list_head all_locks; u32 all_locks_count; wait_queue_head_t wait_control; struct task_struct *thread; @@ -112,7 +111,6 @@ struct gdlm_lock { unsigned long flags; /* lock_dlm flags LFL_ */ struct list_head delay_list; /* delayed */ - struct list_head all_list; /* all locks for the fs */ struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ }; diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index fa31c54c2e6..947e70673ee 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -28,7 +28,6 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, spin_lock_init(&ls->async_lock); INIT_LIST_HEAD(&ls->delayed); INIT_LIST_HEAD(&ls->submit); - INIT_LIST_HEAD(&ls->all_locks); init_waitqueue_head(&ls->thread_wait); init_waitqueue_head(&ls->wait_control); ls->jid = -1; @@ -173,7 +172,6 @@ out: static void gdlm_unmount(void *lockspace) { struct gdlm_ls *ls = lockspace; - int rv; log_debug("unmount flags %lx", ls->flags); @@ -187,9 +185,7 @@ static void gdlm_unmount(void *lockspace) gdlm_kobject_release(ls); dlm_release_lockspace(ls->dlm_lockspace, 2); gdlm_release_threads(ls); - rv = gdlm_release_all_locks(ls); - if (rv) - log_info("gdlm_unmount: %d stray locks freed", rv); + BUG_ON(ls->all_locks_count); out: kfree(ls); } -- cgit v1.2.3-70-g09d2 From f17172e00167238cc5e4f61ac4e78c68e5c558ec Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 27 Jun 2008 09:40:57 +0100 Subject: [GFS2] Fix module building Two lines missed from the previous patch. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/locking/dlm/lock_dlm.h | 1 - fs/gfs2/locking/dlm/mount.c | 1 - 2 files changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index 21cf46617d9..3c98e7c6f93 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -148,7 +148,6 @@ void gdlm_release_threads(struct gdlm_ls *); /* lock.c */ void gdlm_submit_delayed(struct gdlm_ls *); -int gdlm_release_all_locks(struct gdlm_ls *); unsigned int gdlm_do_lock(struct gdlm_lock *); int gdlm_get_lock(void *, struct lm_lockname *, void **); diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 947e70673ee..09d78c216f4 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -221,7 +221,6 @@ static void gdlm_withdraw(void *lockspace) dlm_release_lockspace(ls->dlm_lockspace, 2); gdlm_release_threads(ls); - gdlm_release_all_locks(ls); gdlm_kobject_release(ls); } -- cgit v1.2.3-70-g09d2 From 496d6c32d4d057cb44272d9bd587ff97d023ee92 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Thu, 8 May 2008 13:03:09 +1000 Subject: nfsd: fix spurious EACCESS in reconnect_path() Thanks to Frank Van Maarseveen for the original problem report: "A privileged process on an NFS client which drops privileges after using them to change the current working directory, will experience incorrect EACCES after an NFS server reboot. This problem can also occur after memory pressure on the server, particularly when the client side is quiet for some time." This occurs because the filehandle points to a directory whose parents are no longer in the dentry cache, and we're attempting to reconnect the directory to its parents without adequate permissions to perform lookups in the parent directories. We can therefore fix the problem by acquiring the necessary capabilities before attempting the reconnection. We do this only in the no_subtree_check case, since the documented behavior of the subtree_check export option requires the server to check that the user has lookup permissions on all parents. The subtree_check case still has a problem, since reconnect_path() unnecessarily requires both read and lookup permissions on all parent directories. However, a fix in that case would be more delicate, and use of subtree_check is already discouraged for other reasons. Signed-off-by: Neil Brown Cc: Frank van Maarseveen Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsfh.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c7b0fdaeac9..f45451eb1e3 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -176,9 +176,24 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); - error = nfsd_setuser_and_check_port(rqstp, exp); - if (error) - goto out; + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + /* Elevate privileges so that the lack of 'r' or 'x' + * permission on some parent directory will + * not stop exportfs_decode_fh from being able + * to reconnect a directory into the dentry cache. + * The same problem can affect "SUBTREECHECK" exports, + * but as nfsd_acceptable depends on correct + * access control settings being in effect, we cannot + * fix that case easily. 
+ */ + current->cap_effective = + cap_raise_nfsd_set(current->cap_effective, + current->cap_permitted); + } else { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + } /* * Look up the dentry using the NFS file handle. @@ -215,6 +230,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) goto out; } + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) { + dput(dentry); + goto out; + } + } + if (S_ISDIR(dentry->d_inode->i_mode) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", -- cgit v1.2.3-70-g09d2 From 100766f8347c1aeb5a548c5c7aa9012f4a3276f1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Jun 2008 14:09:46 -0400 Subject: nfsd: treat all shutdown signals as equivalent knfsd currently uses 2 signal masks when processing requests. A "loose" mask (SHUTDOWN_SIGS) that it uses when receiving network requests, and then a more "strict" mask (ALLOWED_SIGS, which is just SIGKILL) that it allows when doing the actual operation on the local storage. This is apparently unnecessarily complicated. The underlying filesystem should be able to sanely handle a signal in the middle of an operation. This patch removes the signal mask handling from knfsd altogether. When knfsd is started as a kthread, all signals are ignored. It then allows all of the signals in SHUTDOWN_SIGS. There's no need to set the mask as well. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 96fdbcab8d9..80292ff5e92 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -37,15 +37,6 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC -/* these signals will be delivered to an nfsd thread - * when handling a request - */ -#define ALLOWED_SIGS (sigmask(SIGKILL)) -/* these signals will be delivered to an nfsd thread - * when not handling a request. i.e. when waiting - */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT)) - extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); struct timeval nfssvc_boot; @@ -414,9 +405,7 @@ nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct fs_struct *fsp; - sigset_t shutdown_mask, allowed_mask; int err, preverr = 0; - unsigned int signo; /* Lock module and set up kernel thread */ mutex_lock(&nfsd_mutex); @@ -433,17 +422,14 @@ nfsd(void *vrqstp) current->fs = fsp; current->fs->umask = 0; - siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS); - siginitsetinv(&allowed_mask, ALLOWED_SIGS); - /* * thread is spawned with all signals set to SIG_IGN, re-enable - * the ones that matter + * the ones that will bring down the thread */ - for (signo = 1; signo <= _NSIG; signo++) { - if (!sigismember(&shutdown_mask, signo)) - allow_signal(signo); - } + allow_signal(SIGKILL); + allow_signal(SIGHUP); + allow_signal(SIGINT); + allow_signal(SIGQUIT); nfsdstats.th_cnt++; mutex_unlock(&nfsd_mutex); @@ -460,9 +446,6 @@ nfsd(void *vrqstp) * The main request loop */ for (;;) { - /* Block all but the shutdown signals */ - sigprocmask(SIG_SETMASK, &shutdown_mask, NULL); - /* * Find a socket with data available and call its * recvfrom routine. @@ -487,9 +470,6 @@ nfsd(void *vrqstp) /* Lock the export hash tables for reading. 
*/ exp_readlock(); - /* Process request with signals blocked. */ - sigprocmask(SIG_SETMASK, &allowed_mask, NULL); - svc_process(rqstp); /* Unlock export hash tables */ -- cgit v1.2.3-70-g09d2 From 18ce3751ccd488c78d3827e9f6bf54e6322676fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Jul 2008 09:07:34 +0200 Subject: Properly notify block layer of sync writes fsync_buffers_list() and sync_dirty_buffer() both issue async writes and then immediately wait on them. Conceptually, that makes them sync writes and we should treat them as such so that the IO schedulers can handle them appropriately. This patch fixes a write starvation issue that Lin Ming reported, where xx is stuck for more than 2 minutes because of a large number of synchronous IO in the system: INFO: task kjournald:20558 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kjournald D ffff810010820978 6712 20558 2 ffff81022ddb1d10 0000000000000046 ffff81022e7baa10 ffffffff803ba6f2 ffff81022ecd0000 ffff8101e6dc9160 ffff81022ecd0348 000000008048b6cb 0000000000000086 ffff81022c4e8d30 0000000000000000 ffffffff80247537 Call Trace: [] kobject_get+0x12/0x17 [] getnstimeofday+0x2f/0x83 [] sync_buffer+0x0/0x3f [] io_schedule+0x5d/0x9f [] sync_buffer+0x3b/0x3f [] __wait_on_bit+0x40/0x6f [] sync_buffer+0x0/0x3f [] out_of_line_wait_on_bit+0x6c/0x78 [] wake_bit_function+0x0/0x23 [] sync_dirty_buffer+0x98/0xcb [] journal_commit_transaction+0x97d/0xcb6 [] lock_timer_base+0x26/0x4b [] kjournald+0xc1/0x1fb [] autoremove_wake_function+0x0/0x2e [] kjournald+0x0/0x1fb [] kthread+0x47/0x74 [] schedule_tail+0x28/0x5d [] child_rip+0xa/0x12 [] kthread+0x0/0x74 [] child_rip+0x0/0x12 Lin Ming confirms that this patch fixes the issue. I've run tests with it for the past week and no ill effects have been observed, so I'm proposing it for inclusion into 2.6.26. Signed-off-by: Jens Axboe --- fs/buffer.c | 13 ++++++++----- include/linux/fs.h | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f01..0f51c0f7c26 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. 
*/ - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block(SWRITE_SYNC, 1, &bh); brelse(bh); spin_lock(lock); } @@ -2940,16 +2940,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE) + if (rw == SWRITE || rw == SWRITE_SYNC) lock_buffer(bh); else if (test_set_buffer_locked(bh)) continue; - if (rw == WRITE || rw == SWRITE) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE, bh); + if (rw == SWRITE_SYNC) + submit_bh(WRITE_SYNC, bh); + else + submit_bh(WRITE, bh); continue; } } else { @@ -2978,7 +2981,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7c108082683..d8e2762ed14 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -83,6 +83,7 @@ extern int dir_notify_enable; #define READ_SYNC (READ | (1 << BIO_RW_SYNC)) #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) +#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC)) #define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) #define SEL_IN 1 -- cgit v1.2.3-70-g09d2 From 07cad1d2a4b0112acd41381d5bc6be82fd71ebac Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Jul 2008 15:38:35 +0200 Subject: nfsd: clean up mnt_want_write calls Multiple mnt_want_write() calls in the switch statement looks really ugly. Signed-off-by: Miklos Szeredi Acked-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5e05ddda456..0f4481e0502 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1250,36 +1250,34 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + err = nfserr_inval; + if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + goto out; + } + + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + /* * Get the dir op function pointer. 
*/ err = 0; switch (type) { case S_IFREG: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); break; case S_IFDIR: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); - if (host_err) - goto out_nfserr; host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; - default: - printk("nfsd: bad file type %o in nfsd_create\n", type); - host_err = -EINVAL; - goto out_nfserr; } if (host_err < 0) { mnt_drop_write(fhp->fh_export->ex_path.mnt); @@ -1291,7 +1289,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, write_inode_now(dchild->d_inode, 1); } - err2 = nfsd_create_setattr(rqstp, resfhp, iap); if (err2) err = err2; -- cgit v1.2.3-70-g09d2 From 30cff1ffff3981c8d96dc33870b652e70190ba37 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 2 Jul 2008 11:13:18 +0300 Subject: nfsd: return nfserr_minor_vers_mismatch when compound minorversion != 0 Check minorversion once before decoding any operation and reject with nfserr_minor_vers_mismatch if != 0 (this still happens in nfsd4_proc_compound). In this case return a zero length resultdata array as required by RFC3530. minorversion 1 processing will have its own vector of decoders. Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9547ab63627..413a1533217 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1019,6 +1019,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } } + if (argp->minorversion != 0) + argp->opcnt = 0; + for (i = 0; i < argp->opcnt; i++) { op = &argp->ops[i]; op->replay = NULL; @@ -1057,13 +1060,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op->opnum = ntohl(*argp->p++); switch (op->opnum) { - case 2: /* Reserved operation */ - op->opnum = OP_ILLEGAL; - if (argp->minorversion == 0) - op->status = nfserr_op_illegal; - else - op->status = nfserr_minor_vers_mismatch; - break; case OP_ACCESS: op->status = nfsd4_decode_access(argp, &op->u.access); break; -- cgit v1.2.3-70-g09d2 From 347e0ad9c91b5bd7506d61f236048cc72b7fc151 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 2 Jul 2008 11:13:41 +0300 Subject: nfsd: tabulate nfs4 xdr decoding functions In preparation for minorversion 1 Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 153 +++++++++++++++++------------------------------------- 1 file changed, 48 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 413a1533217..de1fd9db26f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -985,6 +985,51 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel DECODE_TAIL; } +static __be32 +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +{ + return nfs_ok; +} + +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); + +static nfsd4_dec nfsd4_dec_ops[] = { + [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, + [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_release_lockowner, +}; + static __be32 nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { @@ -1059,113 +1104,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } op->opnum = ntohl(*argp->p++); - switch (op->opnum) { - case OP_ACCESS: - op->status = nfsd4_decode_access(argp, &op->u.access); - break; - case OP_CLOSE: - op->status = nfsd4_decode_close(argp, &op->u.close); - break; - case OP_COMMIT: - op->status = nfsd4_decode_commit(argp, &op->u.commit); - break; - case OP_CREATE: - op->status = nfsd4_decode_create(argp, &op->u.create); - break; - case OP_DELEGRETURN: - op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn); - break; - case OP_GETATTR: - op->status = nfsd4_decode_getattr(argp, &op->u.getattr); - break; - case OP_GETFH: - op->status = nfs_ok; - break; - case OP_LINK: - op->status = nfsd4_decode_link(argp, &op->u.link); - break; - case OP_LOCK: - op->status = nfsd4_decode_lock(argp, &op->u.lock); - break; - case OP_LOCKT: - op->status = nfsd4_decode_lockt(argp, &op->u.lockt); - break; - case OP_LOCKU: - op->status = nfsd4_decode_locku(argp, &op->u.locku); - break; - case OP_LOOKUP: - op->status = nfsd4_decode_lookup(argp, &op->u.lookup); - break; - case OP_LOOKUPP: - op->status = nfs_ok; - break; - case OP_NVERIFY: - op->status = 
nfsd4_decode_verify(argp, &op->u.nverify); - break; - case OP_OPEN: - op->status = nfsd4_decode_open(argp, &op->u.open); - break; - case OP_OPEN_CONFIRM: - op->status = nfsd4_decode_open_confirm(argp, &op->u.open_confirm); - break; - case OP_OPEN_DOWNGRADE: - op->status = nfsd4_decode_open_downgrade(argp, &op->u.open_downgrade); - break; - case OP_PUTFH: - op->status = nfsd4_decode_putfh(argp, &op->u.putfh); - break; - case OP_PUTROOTFH: - op->status = nfs_ok; - break; - case OP_READ: - op->status = nfsd4_decode_read(argp, &op->u.read); - break; - case OP_READDIR: - op->status = nfsd4_decode_readdir(argp, &op->u.readdir); - break; - case OP_READLINK: - op->status = nfs_ok; - break; - case OP_REMOVE: - op->status = nfsd4_decode_remove(argp, &op->u.remove); - break; - case OP_RENAME: - op->status = nfsd4_decode_rename(argp, &op->u.rename); - break; - case OP_RESTOREFH: - op->status = nfs_ok; - break; - case OP_RENEW: - op->status = nfsd4_decode_renew(argp, &op->u.renew); - break; - case OP_SAVEFH: - op->status = nfs_ok; - break; - case OP_SECINFO: - op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo); - break; - case OP_SETATTR: - op->status = nfsd4_decode_setattr(argp, &op->u.setattr); - break; - case OP_SETCLIENTID: - op->status = nfsd4_decode_setclientid(argp, &op->u.setclientid); - break; - case OP_SETCLIENTID_CONFIRM: - op->status = nfsd4_decode_setclientid_confirm(argp, &op->u.setclientid_confirm); - break; - case OP_VERIFY: - op->status = nfsd4_decode_verify(argp, &op->u.verify); - break; - case OP_WRITE: - op->status = nfsd4_decode_write(argp, &op->u.write); - break; - case OP_RELEASE_LOCKOWNER: - op->status = nfsd4_decode_release_lockowner(argp, &op->u.release_lockowner); - break; - default: + if (op->opnum >= OP_ACCESS && op->opnum < ARRAY_SIZE(nfsd4_dec_ops)) + op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); + else { op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; - break; } if (op->status) { -- cgit v1.2.3-70-g09d2 From 3c375c6f3a809d0d999d6dc933634f0b97ed7ae9 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 2 Jul 2008 11:14:01 +0300 Subject: nfsd: unsupported nfs4 ops should fail with nfserr_opnotsupp nfserr_opnotsupp should be returned for unsupported nfs4 ops rather than nfserr_op_illegal. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index de1fd9db26f..653951c73e3 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -991,6 +991,12 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) return nfs_ok; } +static __be32 +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +{ + return nfserr_opnotsupp; +} + typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); static nfsd4_dec nfsd4_dec_ops[] = { @@ -998,6 +1004,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, @@ -1009,9 +1016,11 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_open_confirm, [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, [OP_READ] (nfsd4_dec)nfsd4_decode_read, [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, -- cgit v1.2.3-70-g09d2 From f2feb96bc3d18e50cab7de9eab142f99d91aa5f6 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 2 Jul 2008 11:14:22 +0300 Subject: nfsd: nfs4 minorversion decoder vectors Have separate vectors of operation decoders for each minorversion. Obsolete ops in newer minorversions have default implementation returning nfserr_opnotsupp. Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 653951c73e3..a40bec53fa6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1039,11 +1039,21 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_release_lockowner, }; +struct nfsd4_minorversion_ops { + nfsd4_dec *decoders; + int nops; +}; + +static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { + [0] { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, +}; + static __be32 nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { DECODE_HEAD; struct nfsd4_op *op; + struct nfsd4_minorversion_ops *ops; int i; /* @@ -1073,9 +1083,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } } - if (argp->minorversion != 0) + if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion)) argp->opcnt = 0; + ops = &nfsd4_minorversion[argp->minorversion]; for (i = 0; i < argp->opcnt; i++) { op = &argp->ops[i]; op->replay = NULL; @@ -1113,8 +1124,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } op->opnum = ntohl(*argp->p++); - if (op->opnum >= OP_ACCESS && op->opnum < ARRAY_SIZE(nfsd4_dec_ops)) - op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); + if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) + op->status = ops->decoders[op->opnum](argp, &op->u); else { op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; -- cgit v1.2.3-70-g09d2 From b7fdf9fdd6457c9ed02099fe82bab92b0b3e291b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 May 2008 19:16:28 +0200 Subject: ocfs2-stack_user: BKL pushdown Signed-off-by: Arnd Bergmann --- fs/ocfs2/stack_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index b503772cd0e..cd120011104 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file) return -ENOMEM; p->op_this_node = -1; + lock_kernel(); mutex_lock(&ocfs2_control_lock); file->private_data = p; list_add(&p->op_list, &ocfs2_control_private_list); mutex_unlock(&ocfs2_control_lock); + unlock_kernel(); return 0; } -- cgit v1.2.3-70-g09d2 From 9c20616c385ebeaa30257ef5d35e8f346db4ee32 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 29 May 2008 17:14:05 -0600 Subject: Make FAT users happier by not deadlocking The FAT BKL removal patch can cause deadlocks. It turns out that the new lock_super() calls are unneeded, remove them (as directed by Linus). 
Reported-by: "Tony Luck" Signed-off-by: Jonathan Corbet --- fs/fat/file.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index 7059928fb35..bdf91e97397 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -229,8 +228,7 @@ static int fat_free(struct inode *inode, int skip) void fat_truncate(struct inode *inode) { - struct super_block *sb = inode->i_sb; - struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); const unsigned int cluster_size = sbi->cluster_size; int nr_clusters; @@ -243,9 +241,7 @@ void fat_truncate(struct inode *inode) nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; - lock_super(sb); fat_free(inode, nr_clusters); - unlock_super(sb); fat_flush_inodes(inode->i_sb, inode, NULL); } @@ -298,14 +294,11 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) int fat_setattr(struct dentry *dentry, struct iattr *attr) { - struct super_block *sb = dentry->d_sb; struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); struct inode *inode = dentry->d_inode; int mask, error = 0; unsigned int ia_valid; - lock_super(sb); - /* * Expand the file. Since inode_setattr() updates ->i_size * before calling the ->truncate(), but FAT needs to fill the @@ -358,7 +351,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) mask = sbi->options.fs_fmask; inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask); out: - unlock_super(sb); return error; } EXPORT_SYMBOL_GPL(fat_setattr); -- cgit v1.2.3-70-g09d2 From 9465efc9e96135a2cec8154c0c766fa59984a298 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Jun 2008 11:05:24 +0200 Subject: Remove BKL from remote_llseek v2 - Replace remote_llseek with generic_file_llseek_unlocked (to force compilation failures in all users) - Change all users to either use generic_file_llseek_unlocked directly or take the BKL around. I changed the file systems who don't use the BKL for anything (CIFS, GFS) to call it directly. NCPFS and SMBFS and NFS take the BKL, but explicitely in their own source now. I moved them all over in a single patch to avoid unbisectable sections. Open problem: 32bit kernels can corrupt fpos because its modification is not atomic, but they can do that anyways because there's other paths who modify it without BKL. Do we need a special lock for the pos/f_version = 0 checks? Trond says the NFS BKL is likely not needed, but keep it for now until his full audit. 
v2: Use generic_file_llseek_unlocked instead of remote_llseek_unlocked and factor duplicated code (suggested by hch) Cc: Trond.Myklebust@netapp.com Cc: swhiteho@redhat.com Cc: sfrench@samba.org Cc: vandrove@vc.cvut.cz Signed-off-by: Andi Kleen Signed-off-by: Andi Kleen Signed-off-by: Jonathan Corbet --- fs/cifs/cifsfs.c | 2 +- fs/gfs2/ops_file.c | 4 ++-- fs/ncpfs/file.c | 12 +++++++++++- fs/nfs/file.c | 6 +++++- fs/read_write.c | 38 +++++++++++--------------------------- fs/smbfs/file.c | 11 ++++++++++- include/linux/fs.h | 3 ++- 7 files changed, 42 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 427a7c69589..aeff0fe5b6b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -581,7 +581,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(file, offset, origin); + return generic_file_llseek_unlocked(file, offset, origin); } struct file_system_type cifs_fs_type = { diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e1b7d525a06..24dd5945008 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -62,11 +62,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } } else - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); return error; } diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 2b145de45b3..6a7d901f193 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "ncplib_kernel.h" @@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) { return 0; } +static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations ncp_file_operations = { - .llseek = remote_llseek, + .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, .ioctl = ncp_ioctl, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3536b01164f..a34eb78989f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,6 +170,7 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { + loff_t loff; /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; @@ -177,7 +178,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(filp, offset, origin); + lock_kernel(); /* BKL needed? 
*/ + loff = generic_file_llseek_unlocked(filp, offset, origin); + unlock_kernel(); + return loff; } /* diff --git a/fs/read_write.c b/fs/read_write.c index f0d1240a5c6..9ba495d5a29 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t +generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) { loff_t retval; struct inode *inode = file->f_mapping->host; - mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: offset += inode->i_size; @@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } retval = -EINVAL; if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { + /* Special lock needed here? */ if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } - mutex_unlock(&inode->i_mutex); return retval; } +EXPORT_SYMBOL(generic_file_llseek_unlocked); -EXPORT_SYMBOL(generic_file_llseek); - -loff_t remote_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t retval; - - lock_kernel(); - switch (origin) { - case SEEK_END: - offset += i_size_read(file->f_path.dentry->d_inode); - break; - case SEEK_CUR: - offset += file->f_pos; - } - retval = -EINVAL; - if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } - unlock_kernel(); - return retval; + loff_t n; + mutex_lock(&file->f_dentry->d_inode->i_mutex); + n = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return n; } -EXPORT_SYMBOL(remote_llseek); +EXPORT_SYMBOL(generic_file_llseek); loff_t no_llseek(struct file *file, loff_t offset, int origin) { diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index efbe29af3d7..2294783320c 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) return error; } +static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations smb_file_operations = { - .llseek = remote_llseek, + .llseek = smb_remote_llseek, .read = do_sync_read, .aio_read = smb_file_aio_read, .write = do_sync_write, diff --git a/include/linux/fs.h b/include/linux/fs.h index f413085f748..b158e5161bc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1871,7 +1871,8 @@ extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); -extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, + int origin); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -- cgit v1.2.3-70-g09d2 From dda6445e219b5d1fd67e9426ce8d23f8fbfaaf66 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 19 Jun 2008 16:18:25 -0600 Subject: ecryptfs: fasync BKL pushdown Signed-off-by: Jonathan Corbet --- fs/ecryptfs/file.c | 
3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2258b8f654a..24749bf0668 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" /** @@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) int rc = 0; struct file *lower_file = NULL; + lock_kernel(); lower_file = ecryptfs_file_to_lower(file); if (lower_file->f_op && lower_file->f_op->fasync) rc = lower_file->f_op->fasync(fd, lower_file, flag); + unlock_kernel(); return rc; } -- cgit v1.2.3-70-g09d2 From a238b790d5f99c7832f9b73ac8847025815b85f7 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 20 Jun 2008 09:12:01 -0600 Subject: Call fasync() functions without the BKL lock_kernel() calls have been pushed down into code which needs it, so there is no need to take the BKL at this level anymore. This work inspired and aided by Andi Kleen's unlocked_fasync() patches. Acked-by: Andi Kleen Signed-off-by: Jonathan Corbet --- fs/fcntl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index bfd776509a7..330a7d78259 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) if (error) return error; - lock_kernel(); if ((arg ^ filp->f_flags) & FASYNC) { if (filp->f_op && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); @@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); out: - unlock_kernel(); return error; } -- cgit v1.2.3-70-g09d2 From b001a1b6aa960949a24c2cdc28257dfcc9428d74 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 2 Jul 2008 11:15:03 +0300 Subject: nfsd: dprint operation names Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5c3683cfd59..eef1629806f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -846,10 +846,13 @@ struct nfsd4_operation { #define ALLOWED_WITHOUT_FH 1 /* GETATTR and ops not listed as returning NFS4ERR_MOVED: */ #define ALLOWED_ON_ABSENT_FS 2 + char *op_name; }; static struct nfsd4_operation nfsd4_ops[]; +static inline char *nfsd4_op_name(unsigned opnum); + /* * COMPOUND call. 
*/ @@ -891,7 +894,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; - dprintk("nfsv4 compound op #%d: %d\n", resp->opcnt, op->opnum); + dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", + resp->opcnt, args->opcnt, op->opnum, + nfsd4_op_name(op->opnum)); /* * The XDR decode routines may have pre-set op->status; @@ -962,120 +967,163 @@ out: static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, + .op_name = "OP_ACCESS", }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, + .op_name = "OP_CLOSE", }, [OP_COMMIT] = { .op_func = (nfsd4op_func)nfsd4_commit, + .op_name = "OP_COMMIT", }, [OP_CREATE] = { .op_func = (nfsd4op_func)nfsd4_create, + .op_name = "OP_CREATE", }, [OP_DELEGRETURN] = { .op_func = (nfsd4op_func)nfsd4_delegreturn, + .op_name = "OP_DELEGRETURN", }, [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, .op_flags = ALLOWED_ON_ABSENT_FS, + .op_name = "OP_GETATTR", }, [OP_GETFH] = { .op_func = (nfsd4op_func)nfsd4_getfh, + .op_name = "OP_GETFH", }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, + .op_name = "OP_LINK", }, [OP_LOCK] = { .op_func = (nfsd4op_func)nfsd4_lock, + .op_name = "OP_LOCK", }, [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, + .op_name = "OP_LOCKT", }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, + .op_name = "OP_LOCKU", }, [OP_LOOKUP] = { .op_func = (nfsd4op_func)nfsd4_lookup, + .op_name = "OP_LOOKUP", }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, + .op_name = "OP_LOOKUPP", }, [OP_NVERIFY] = { .op_func = (nfsd4op_func)nfsd4_nverify, + .op_name = "OP_NVERIFY", }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, + .op_name = "OP_OPEN", }, [OP_OPEN_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_open_confirm, + .op_name = "OP_OPEN_CONFIRM", }, [OP_OPEN_DOWNGRADE] = { .op_func = (nfsd4op_func)nfsd4_open_downgrade, + .op_name = "OP_OPEN_DOWNGRADE", }, [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { - /* unsupported; just for future reference: */ + /* unsupported, just for future reference: */ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTPUBFH", }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_PUTROOTFH", }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, + .op_name = "OP_READ", }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, + .op_name = "OP_READDIR", }, [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, + .op_name = "OP_READLINK", }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, + .op_name = "OP_REMOVE", }, [OP_RENAME] = { + .op_name = "OP_RENAME", .op_func = (nfsd4op_func)nfsd4_rename, }, [OP_RENEW] = { .op_func = (nfsd4op_func)nfsd4_renew, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RENEW", }, [OP_RESTOREFH] = { .op_func = (nfsd4op_func)nfsd4_restorefh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RESTOREFH", }, [OP_SAVEFH] = { .op_func = (nfsd4op_func)nfsd4_savefh, + .op_name = "OP_SAVEFH", }, [OP_SECINFO] = { .op_func = (nfsd4op_func)nfsd4_secinfo, + .op_name = "OP_SECINFO", }, [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, + .op_name = "OP_SETATTR", }, [OP_SETCLIENTID] = { .op_func = (nfsd4op_func)nfsd4_setclientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + 
.op_name = "OP_SETCLIENTID", }, [OP_SETCLIENTID_CONFIRM] = { .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_SETCLIENTID_CONFIRM", }, [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, + .op_name = "OP_VERIFY", }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, + .op_name = "OP_WRITE", }, [OP_RELEASE_LOCKOWNER] = { .op_func = (nfsd4op_func)nfsd4_release_lockowner, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, + .op_name = "OP_RELEASE_LOCKOWNER", }, }; +static inline char * +nfsd4_op_name(unsigned opnum) +{ + if (opnum < ARRAY_SIZE(nfsd4_ops)) + return nfsd4_ops[opnum].op_name; + return "unknown_operation"; +} + #define nfs4svc_decode_voidargs NULL #define nfs4svc_release_void NULL #define nfsd4_voidres nfsd4_voidargs -- cgit v1.2.3-70-g09d2 From f58ba889106af60f52af792efbe1973e458a2138 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 2 Jul 2008 21:12:01 +0200 Subject: [GFS2] don't call permission() GFS2 calls permission() to verify permissions after locks on the files have been taken. For this it's sufficient to call gfs2_permission() instead. This results in the following changes: - IS_RDONLY() check is not performed - IS_IMMUTABLE() check is not performed - devcgroup_inode_permission() is not called - security_inode_permission() is not called IS_RDONLY() should be unnecessary anyway, as the per-mount read-only flag should provide protection against read-only remounts during operations. do_gfs2_set_flags() has been fixed to perform mnt_want_write()/mnt_drop_write() to protect against remounting read-only. IS_IMMUTABLE has been added to gfs2_permission() Repeating the security checks seems to be pointless, as they don't normally change, and if they do, it's independent of the filesystem state. 
Signed-off-by: Miklos Szeredi Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 6 +++--- fs/gfs2/inode.h | 1 + fs/gfs2/ops_file.c | 11 +++++++++-- fs/gfs2/ops_inode.c | 25 +++++++++++++++++-------- 4 files changed, 30 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 09453d057e4..caf40908335 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = permission(dir, MAY_EXEC, NULL); + error = gfs2_permission(dir, MAY_EXEC); if (error) goto out; } @@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1134,7 +1134,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 580da454b38..04e9fef3f99 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -91,6 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip); int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); +int gfs2_permission(struct inode *inode, int mask); int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); int gfs2_glock_nq_atime(struct gfs2_holder *gh); diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 0ff512a1192..1737af98a42 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -220,10 +221,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) int error; u32 new_flags, flags; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = mnt_want_write(filp->f_path.mnt); if (error) return error; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + flags = ip->i_di.di_flags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) @@ -242,7 +247,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = permission(inode, MAY_WRITE, NULL); + error = gfs2_permission(inode, MAY_WRITE); if (error) goto out; } @@ -272,6 +277,8 @@ out_trans_end: gfs2_trans_end(sdp); out: gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write(filp->f_path.mnt); return error; } diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 2686ad4c002..1e252dfc529 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out; - error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -704,7 +704,7 @@ static int gfs2_rename(struct inode 
*odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = permission(odentry->d_inode, MAY_WRITE, NULL); + error = gfs2_permission(odentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; } @@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) * Returns: errno */ -static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int gfs2_permission(struct inode *inode, int mask) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; @@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) unlock = 1; } - error = generic_permission(inode, mask, gfs2_check_acl); + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EACCES; + else + error = generic_permission(inode, mask, gfs2_check_acl); if (unlock) gfs2_glock_dq_uninit(&i_gh); return error; } +static int gfs2_iop_permission(struct inode *inode, int mask, + struct nameidata *nd) +{ + return gfs2_permission(inode, mask); +} + static int setattr_size(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) } const struct inode_operations gfs2_file_iops = { - .permission = gfs2_permission, + .permission = gfs2_iop_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, .setxattr = gfs2_setxattr, @@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = { .rmdir = gfs2_rmdir, .mknod = gfs2_mknod, .rename = gfs2_rename, - .permission = gfs2_permission, + .permission = gfs2_iop_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, .setxattr = gfs2_setxattr, @@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = { const struct inode_operations gfs2_symlink_iops = { .readlink = gfs2_readlink, .follow_link = gfs2_follow_link, - .permission = gfs2_permission, + .permission = gfs2_iop_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, .setxattr = gfs2_setxattr, -- cgit v1.2.3-70-g09d2 From 51d654e1d885607a6edd02b337105fa5c28b6d33 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 17 Jun 2008 18:59:56 +0200 Subject: block: Globalize bio_set and bio_vec_slab Move struct bio_set and biovec_slab definitions to bio.h so they can be used outside of bio.c. Signed-off-by: Martin K. Petersen Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/bio.c | 30 ++---------------------------- include/linux/bio.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 78562574cb5..7a6598abc96 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -28,25 +28,10 @@ #include #include /* for struct sg_iovec */ -#define BIO_POOL_SIZE 2 - static struct kmem_cache *bio_slab __read_mostly; -#define BIOVEC_NR_POOLS 6 - -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 mempool_t *bio_split_pool __read_mostly; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -59,24 +44,13 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { }; #undef BV -/* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. 
- * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -struct bio_set { - mempool_t *bio_pool; - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; -}; - /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ -static struct bio_set *fs_bio_set; +struct bio_set *fs_bio_set; -static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; diff --git a/include/linux/bio.h b/include/linux/bio.h index 61c15eaf3fb..49dfb3cb746 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,6 +333,35 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, int, int); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); +extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); + +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. + */ +#define BIO_POOL_SIZE 2 +#define BIOVEC_NR_POOLS 6 + +struct bio_set { + mempool_t *bio_pool; + mempool_t *bvec_pools[BIOVEC_NR_POOLS]; +}; + +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + +extern struct bio_set *fs_bio_set; + +/* + * a small number of entries is fine, not going to be performance critical. + * basically we just need to survive + */ +#define BIO_SPLIT_ENTRIES 2 #ifdef CONFIG_HIGHMEM /* -- cgit v1.2.3-70-g09d2 From 7ba1ba12eeef0aa7113beb16410ef8b7c748e18b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 30 Jun 2008 20:04:41 +0200 Subject: block: Block layer data integrity support Some block devices support verifying the integrity of requests by way of checksums or other protection information that is submitted along with the I/O. This patch implements support for generating and verifying integrity metadata, as well as correctly merging, splitting and cloning bios and requests that have this extra information attached. See Documentation/block/data-integrity.txt for more information. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-core.c | 7 + block/blk-integrity.c | 382 ++++++++++++++++++++++++++ block/blk-merge.c | 3 + block/blk.h | 8 + block/elevator.c | 6 + fs/Makefile | 1 + fs/bio-integrity.c | 708 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/bio.c | 32 ++- include/linux/bio.h | 94 ++++++- include/linux/blkdev.h | 105 ++++++++ include/linux/genhd.h | 3 + 13 files changed, 1355 insertions(+), 7 deletions(-) create mode 100644 block/blk-integrity.c create mode 100644 fs/bio-integrity.c (limited to 'fs') diff --git a/block/Kconfig b/block/Kconfig index 3e97f2bc446..1ab7c15c8d7 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -81,6 +81,18 @@ config BLK_DEV_BSG If unsure, say N. +config BLK_DEV_INTEGRITY + bool "Block layer data integrity support" + ---help--- + Some storage devices allow extra information to be + stored/retrieved to help protect the data. The block layer + data integrity option provides hooks which can be used by + filesystems to ensure better data integrity. 
+ + Say yes here if you have a storage device that provides the + T10/SCSI Data Integrity Field or the T13/ATA External Path + Protection. If in doubt, say N. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 5a43c7d7959..045f7b62e4b 100644 --- a/block/Makefile +++ b/block/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blk-core.c b/block/blk-core.c index 1905aaba49f..e0fb0bcc0c1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -143,6 +143,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio->bi_size -= nbytes; bio->bi_sector += (nbytes >> 9); + + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); + if (bio->bi_size == 0) bio_endio(bio, error); } else { @@ -1381,6 +1385,9 @@ end_io: */ blk_partition_remap(bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; + if (old_sector != -1) blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, old_sector); diff --git a/block/blk-integrity.c b/block/blk-integrity.c new file mode 100644 index 00000000000..65f23ef38bb --- /dev/null +++ b/block/blk-integrity.c @@ -0,0 +1,382 @@ +/* + * blk-integrity.c - Block layer data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +#include "blk.h" + +static struct kmem_cache *integrity_cachep; + +/** + * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements + * @rq: request with integrity metadata attached + * + * Description: Returns the number of elements required in a + * scatterlist corresponding to the integrity metadata in a request. + */ +int blk_rq_count_integrity_sg(struct request *rq) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + unsigned int segments; + + ivprv = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + segments++; + + ivprv = iv; + } + + return segments; +} +EXPORT_SYMBOL(blk_rq_count_integrity_sg); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request with integrity metadata attached + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg(). 
+ */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + struct scatterlist *sg; + unsigned int segments; + + ivprv = NULL; + sg = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + sg->length += iv->bv_len; + } else { +new_segment: + if (!sg) + sg = sglist; + else { + sg->page_link &= ~0x02; + sg = sg_next(sg); + } + + sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); + segments++; + } + + ivprv = iv; + } + + if (sg) + sg_mark_end(sg); + + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); + +/** + * blk_integrity_compare - Compare integrity profile of two block devices + * @b1: Device to compare + * @b2: Device to compare + * + * Description: Meta-devices like DM and MD need to verify that all + * sub-devices use the same integrity format before advertising to + * upper layers that they can send/receive integrity metadata. This + * function can be used to check whether two block devices have + * compatible integrity formats. + */ +int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2) +{ + struct blk_integrity *b1 = bd1->bd_disk->integrity; + struct blk_integrity *b2 = bd2->bd_disk->integrity; + + BUG_ON(bd1->bd_disk == NULL); + BUG_ON(bd2->bd_disk == NULL); + + if (!b1 || !b2) + return 0; + + if (b1->sector_size != b2->sector_size) { + printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->sector_size, b2->sector_size); + return -1; + } + + if (b1->tuple_size != b2->tuple_size) { + printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tuple_size, b2->tuple_size); + return -1; + } + + if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { + printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tag_size, b2->tag_size); + return -1; + } + + if (strcmp(b1->name, b2->name)) { + printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->name, b2->name); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(blk_integrity_compare); + +struct integrity_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_integrity *, char *); + ssize_t (*store)(struct blk_integrity *, const char *, size_t); +}; + +static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + + return entry->show(bi, page); +} + +static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t count) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + ssize_t ret = 0; + + if (entry->store) + ret = entry->store(bi, page, count); + + return ret; +} + +static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL && bi->name != NULL) + return sprintf(page, "%s\n", bi->name); + else + return sprintf(page, "none\n"); +} + +static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL) + return 
sprintf(page, "%u\n", bi->tag_size); + else + return sprintf(page, "0\n"); +} + +static ssize_t integrity_read_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + + return count; +} + +static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); +} + +static ssize_t integrity_write_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + + return count; +} + +static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0); +} + +static struct integrity_sysfs_entry integrity_format_entry = { + .attr = { .name = "format", .mode = S_IRUGO }, + .show = integrity_format_show, +}; + +static struct integrity_sysfs_entry integrity_tag_size_entry = { + .attr = { .name = "tag_size", .mode = S_IRUGO }, + .show = integrity_tag_size_show, +}; + +static struct integrity_sysfs_entry integrity_read_entry = { + .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_read_show, + .store = integrity_read_store, +}; + +static struct integrity_sysfs_entry integrity_write_entry = { + .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_write_show, + .store = integrity_write_store, +}; + +static struct attribute *integrity_attrs[] = { + &integrity_format_entry.attr, + &integrity_tag_size_entry.attr, + &integrity_read_entry.attr, + &integrity_write_entry.attr, + NULL, +}; + +static struct sysfs_ops integrity_ops = { + .show = &integrity_attr_show, + .store = &integrity_attr_store, +}; + +static int __init blk_dev_integrity_init(void) +{ + integrity_cachep = kmem_cache_create("blkdev_integrity", + sizeof(struct blk_integrity), + 0, SLAB_PANIC, NULL); + return 0; +} +subsys_initcall(blk_dev_integrity_init); + +static void blk_integrity_release(struct kobject *kobj) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + + kmem_cache_free(integrity_cachep, bi); +} + +static struct kobj_type integrity_ktype = { + .default_attrs = integrity_attrs, + .sysfs_ops = &integrity_ops, + .release = blk_integrity_release, +}; + +/** + * blk_integrity_register - Register a gendisk as being integrity-capable + * @disk: struct gendisk pointer to make integrity-aware + * @template: integrity profile + * + * Description: When a device needs to advertise itself as being able + * to send/receive integrity metadata it must use this function to + * register the capability with the block layer. The template is a + * blk_integrity struct with values appropriate for the underlying + * hardware. See Documentation/block/data-integrity.txt. 
+ */ +int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +{ + struct blk_integrity *bi; + + BUG_ON(disk == NULL); + BUG_ON(template == NULL); + + if (disk->integrity == NULL) { + bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + if (!bi) + return -1; + + if (kobject_init_and_add(&bi->kobj, &integrity_ktype, + &disk->dev.kobj, "%s", "integrity")) { + kmem_cache_free(integrity_cachep, bi); + return -1; + } + + kobject_uevent(&bi->kobj, KOBJ_ADD); + + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->sector_size = disk->queue->hardsect_size; + disk->integrity = bi; + } else + bi = disk->integrity; + + /* Use the provided profile as template */ + bi->name = template->name; + bi->generate_fn = template->generate_fn; + bi->verify_fn = template->verify_fn; + bi->tuple_size = template->tuple_size; + bi->set_tag_fn = template->set_tag_fn; + bi->get_tag_fn = template->get_tag_fn; + bi->tag_size = template->tag_size; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_register); + +/** + * blk_integrity_unregister - Remove block integrity profile + * @disk: disk whose integrity profile to deallocate + * + * Description: This function frees all memory used by the block + * integrity profile. To be called at device teardown. + */ +void blk_integrity_unregister(struct gendisk *disk) +{ + struct blk_integrity *bi; + + if (!disk || !disk->integrity) + return; + + bi = disk->integrity; + + kobject_uevent(&bi->kobj, KOBJ_REMOVE); + kobject_del(&bi->kobj); + kobject_put(&disk->dev.kobj); + kmem_cache_free(integrity_cachep, bi); +} +EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-merge.c b/block/blk-merge.c index 651136aae76..5efc9e7a68b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -441,6 +441,9 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return 0; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. 
merge_requests_fn diff --git a/block/blk.h b/block/blk.h index 59776ab4742..c79f30e1df5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,4 +51,12 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) + +#endif /* BLK_DEV_INTEGRITY */ + #endif diff --git a/block/elevator.c b/block/elevator.c index 902dd1344d5..1f5bfe69602 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -86,6 +86,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) return 0; + /* + * only merge integrity protected bio into ditto rq + */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return 0; + if (!elv_iosched_allow_merge(rq, bio)) return 0; diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da..277b079dec9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,7 @@ else obj-y += no-block.o endif +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 00000000000..31b08878913 --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,708 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +static struct kmem_cache *bio_integrity_slab __read_mostly; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. 
+ */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + struct bio_vec *iv; + unsigned long idx; + + BUG_ON(bio == NULL); + + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + + memset(bip, 0, sizeof(*bip)); + + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + if (unlikely(iv == NULL)) { + printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); + mempool_free(bip, bs->bio_integrity_pool); + return NULL; + } + + bip->bip_pool = idx; + bip->bip_vec = iv; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + mempool_free(bip, bs->bio_integrity_pool); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + BUG_ON(iv->bv_page != NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? 
*/ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. 
The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -EIO; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This 
function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + break; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error = bip->bip_error; + + if (bio_integrity_verify(bio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + error = -EIO; + } + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. 
+ */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + bip->bip_error = error; + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len) */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. 
+ */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, + bio_integrity_slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init_slab(void) +{ + bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, + SLAB_HWCACHE_ALIGN|SLAB_PANIC); +} +EXPORT_SYMBOL(bio_integrity_init_slab); + +static int __init integrity_init(void) +{ + kintegrityd_wq = create_workqueue("kintegrityd"); + + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + return 0; +} +subsys_initcall(integrity_init); diff --git a/fs/bio.c b/fs/bio.c index 7a6598abc96..7761c84c703 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -50,6 +50,11 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { */ struct bio_set *fs_bio_set; +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; @@ -91,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); } + if (bio_integrity(bio)) + bio_integrity_free(bio, bio_set); + mempool_free(bio, bio_set->bio_pool); } @@ -249,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); - if (b) { - b->bi_destructor = bio_fs_destructor; - __bio_clone(b, bio); + if (!b) + return 
NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, fs_bio_set); + + if (ret < 0) + return NULL; } return b; @@ -1223,6 +1241,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_private = bi; bp->bio2.bi_private = pool; + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + return bp; } @@ -1264,6 +1285,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); kfree(bs); @@ -1280,6 +1302,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, bio_pool_size)) + goto bad; + if (!biovec_create_pools(bs, bvec_pool_size)) return bs; @@ -1306,6 +1331,7 @@ static int __init init_bio(void) { bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); diff --git a/include/linux/bio.h b/include/linux/bio.h index 49dfb3cb746..6bfc3e8d9d8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -64,6 +64,7 @@ struct bio_vec { struct bio_set; struct bio; +struct bio_integrity_payload; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -112,6 +113,9 @@ struct bio { atomic_t bi_cnt; /* pin count */ void *bi_private; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; /* data integrity */ +#endif bio_destructor_t *bi_destructor; /* destructor */ }; @@ -271,6 +275,29 @@ static inline void *bio_data(struct bio *bio) */ #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +#if defined(CONFIG_BLK_DEV_INTEGRITY) +/* + * bio integrity payload + */ +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + struct bio_vec *bip_vec; /* integrity data vector */ + + sector_t bip_sector; /* virtual start sector */ + + void *bip_buf; /* generated integrity data */ + bio_end_io_t *bip_end_io; /* saved I/O completion fn */ + + int bip_error; /* saved I/O error */ + unsigned int bip_size; + + unsigned short bip_pool; /* pool the ivec came from */ + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_idx; /* current bip_vec index */ + + struct work_struct bip_work; /* I/O completion */ +}; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * A bio_pair is used when we need to split a bio. 
@@ -283,10 +310,14 @@ static inline void *bio_data(struct bio *bio) * in bio2.bi_private */ struct bio_pair { - struct bio bio1, bio2; - struct bio_vec bv1, bv2; - atomic_t cnt; - int error; + struct bio bio1, bio2; + struct bio_vec bv1, bv2; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload bip1, bip2; + struct bio_vec iv1, iv2; +#endif + atomic_t cnt; + int error; }; extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors); @@ -334,6 +365,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern unsigned int bvec_nr_vecs(unsigned short idx); /* * bio_set is used to allow other portions of the IO system to @@ -346,6 +378,9 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set struct bio_set { mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif mempool_t *bvec_pools[BIOVEC_NR_POOLS]; }; @@ -410,5 +445,56 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) +#define bip_vec(bip) bip_vec_idx(bip, 0) + +#define __bip_for_each_vec(bvl, bip, i, start_idx) \ + for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ + i < (bip)->bip_vcnt; \ + bvl++, i++) + +#define bip_for_each_vec(bvl, bip, i) \ + __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) + +#define bio_integrity(bio) ((bio)->bi_integrity ? 1 : 0) + +extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); +extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); +extern void bio_integrity_free(struct bio *, struct bio_set *); +extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); +extern int bio_integrity_enabled(struct bio *bio); +extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_prep(struct bio *); +extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_advance(struct bio *, unsigned int); +extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); +extern void bio_integrity_split(struct bio *, struct bio_pair *, int); +extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); +extern int bioset_integrity_create(struct bio_set *, int); +extern void bioset_integrity_free(struct bio_set *); +extern void bio_integrity_init_slab(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define bio_integrity(a) (0) +#define bioset_integrity_create(a, b) (0) +#define bio_integrity_prep(a) (0) +#define bio_integrity_enabled(a) (0) +#define bio_integrity_clone(a, b, c) (0) +#define bioset_integrity_free(a) do { } while (0) +#define bio_integrity_free(a, b) do { } while (0) +#define bio_integrity_endio(a, b) do { } while (0) +#define bio_integrity_advance(a, b) do { } while (0) +#define bio_integrity_trim(a, b, c) do { } while (0) +#define bio_integrity_split(a, b, c) do { } while (0) +#define bio_integrity_set_tag(a, b, c) do { } while (0) +#define bio_integrity_get_tag(a, b, c) do { } while (0) 
+#define bio_integrity_init_slab(a) do { } while (0) + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* CONFIG_BLOCK */ #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a3da671713..4a9ed45270f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -112,6 +112,7 @@ enum rq_flag_bits { __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ + __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NR_BITS, /* stops here */ }; @@ -134,6 +135,7 @@ enum rq_flag_bits { #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) +#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define BLK_MAX_CDB 16 @@ -865,6 +867,109 @@ void kblockd_flush_work(struct work_struct *work); MODULE_ALIAS("block-major-" __stringify(major) "-*") +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ + +struct blk_integrity_exchg { + void *prot_buf; + void *data_buf; + sector_t sector; + unsigned int data_size; + unsigned short sector_size; + const char *disk_name; +}; + +typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); +typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); +typedef void (integrity_set_tag_fn) (void *, void *, unsigned int); +typedef void (integrity_get_tag_fn) (void *, void *, unsigned int); + +struct blk_integrity { + integrity_gen_fn *generate_fn; + integrity_vrfy_fn *verify_fn; + integrity_set_tag_fn *set_tag_fn; + integrity_get_tag_fn *get_tag_fn; + + unsigned short flags; + unsigned short tuple_size; + unsigned short sector_size; + unsigned short tag_size; + + const char *name; + + struct kobject kobj; +}; + +extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); +extern void blk_integrity_unregister(struct gendisk *); +extern int blk_integrity_compare(struct block_device *, struct block_device *); +extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); +extern int blk_rq_count_integrity_sg(struct request *); + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +{ + return bdev->bd_disk->integrity; +} + +static inline unsigned int bdev_get_tag_size(struct block_device *bdev) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi) + return bi->tag_size; + + return 0; +} + +static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + return 1; + + return 0; +} + +static inline int blk_integrity_rq(struct request *rq) +{ + BUG_ON(rq->bio == NULL); + + return bio_integrity(rq->bio); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define blk_integrity_rq(rq) (0) +#define blk_rq_count_integrity_sg(a) (0) +#define blk_rq_map_integrity_sg(a, b) (0) +#define bdev_get_integrity(a) (0) +#define bdev_get_tag_size(a) (0) +#define blk_integrity_compare(a, b) (0) +#define blk_integrity_register(a, b) (0) +#define 
blk_integrity_unregister(a) do { } while (0); + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + + #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ae7aec3cabe..524ec96f5a2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -141,6 +141,9 @@ struct gendisk { struct disk_stats dkstats; #endif struct work_struct async_notify; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *integrity; +#endif }; /* -- cgit v1.2.3-70-g09d2 From b984679efe1a616ec4ac919dba08286d71593900 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jun 2008 19:05:48 +0200 Subject: block: integrity checkpatch cleanups > 80 char lines and that sort of thing. Signed-off-by: Jens Axboe --- block/blk-integrity.c | 8 +++++--- fs/bio-integrity.c | 29 ++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 65f23ef38bb..4ffa3814f6a 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -178,8 +178,9 @@ static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, return entry->show(bi, page); } -static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t count) +static ssize_t integrity_attr_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t count) { struct blk_integrity *bi = container_of(kobj, struct blk_integrity, kobj); @@ -326,7 +327,8 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) BUG_ON(template == NULL); if (disk->integrity == NULL) { - bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + bi = kmem_cache_alloc(integrity_cachep, + GFP_KERNEL | __GFP_ZERO); if (!bi) return -1; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 31b08878913..63e2ee63058 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -39,7 +39,10 @@ static struct workqueue_struct *kintegrityd_wq; * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) { struct bio_integrity_payload *bip; struct bio_vec *iv; @@ -81,7 +84,9 @@ EXPORT_SYMBOL(bio_integrity_alloc_bioset); * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) { return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); } @@ -174,7 +179,8 @@ EXPORT_SYMBOL(bio_integrity_enabled); * sector size of the storage device. Convert the block layer sectors * to physical sectors. 
*/ -static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) { /* At this point there are only 512b or 4096b DIF/EPP devices */ if (bi->sector_size == 4096) @@ -212,7 +218,8 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) if (bi->tag_size == 0) return -1; - nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); if (nr_sectors * bi->tuple_size > bip->bip_size) { printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", @@ -456,7 +463,7 @@ static int bio_integrity_verify(struct bio *bio) */ static void bio_integrity_verify_fn(struct work_struct *work) { - struct bio_integrity_payload *bip = + struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; int error = bip->bip_error; @@ -502,7 +509,8 @@ EXPORT_SYMBOL(bio_integrity_endio); * @bip: Integrity vector to advance * @skip: Number of bytes to advance it */ -void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +void bio_integrity_mark_head(struct bio_integrity_payload *bip, + unsigned int skip) { struct bio_vec *iv; unsigned int i; @@ -527,7 +535,8 @@ void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int ski * @bip: Integrity vector to truncate * @len: New length of integrity vector */ -void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, + unsigned int len) { struct bio_vec *iv; unsigned int i; @@ -579,7 +588,8 @@ EXPORT_SYMBOL(bio_integrity_advance); * and the length will be truncated corresponding to 'len' data * sectors. */ -void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) { struct bio_integrity_payload *bip = bio->bi_integrity; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -648,7 +658,8 @@ EXPORT_SYMBOL(bio_integrity_split); * * Description: Called to allocate a bip when cloning a bio */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; -- cgit v1.2.3-70-g09d2 From cc371e66e340f35eed8dc4651c7c18e754c7fb26 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 3 Jul 2008 09:53:43 +0200 Subject: Add bvec_merge_data to handle stacked devices and ->merge_bvec() When devices are stacked, one device's merge_bvec_fn may need to perform the mapping and then call one or more functions for its underlying devices. The following bio fields are used: bio->bi_sector bio->bi_bdev bio->bi_size bio->bi_rw using bio_data_dir() This patch creates a new struct bvec_merge_data holding a copy of those fields to avoid having to change them directly in the struct bio when going down the stack only to have to change them back again on the way back up. (And then when the bio gets mapped for real, the whole exercise gets repeated, but that's a problem for another day...) 
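As a rough illustration of how a stacked driver might use the new hook (a sketch only, not part of this series: the stacked_dev structure and stacked_map_sector() helper are invented for the example), the lower device's merge_bvec_fn can be consulted with a remapped copy of the bvec_merge_data instead of a temporarily modified bio:

#include <linux/blkdev.h>

/* Hypothetical stacked-driver state; purely for illustration. */
struct stacked_dev {
	struct block_device *lower_bdev;
	sector_t offset;
};

/* Map an upper-device sector onto the lower device (illustrative). */
static sector_t stacked_map_sector(struct stacked_dev *sd, sector_t sector)
{
	return sector + sd->offset;
}

static int stacked_mergeable_bvec(struct request_queue *q,
				  struct bvec_merge_data *bvm,
				  struct bio_vec *biovec)
{
	struct stacked_dev *sd = q->queuedata;
	struct request_queue *lq = bdev_get_queue(sd->lower_bdev);
	struct bvec_merge_data lower_bvm = *bvm;

	/* Remap the copy; the caller's bio is never touched. */
	lower_bvm.bi_bdev = sd->lower_bdev;
	lower_bvm.bi_sector = stacked_map_sector(sd, bvm->bi_sector);

	if (!lq->merge_bvec_fn)
		return biovec->bv_len;

	return lq->merge_bvec_fn(lq, &lower_bvm, biovec);
}

Because only the copy is modified, the walk back up the stack needs no undo step, which is the point of the change.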
Signed-off-by: Alasdair G Kergon Cc: Neil Brown Cc: Milan Broz Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 9 +++++---- drivers/md/linear.c | 10 ++++++---- drivers/md/raid0.c | 10 ++++++---- drivers/md/raid10.c | 15 ++++++++------- drivers/md/raid5.c | 10 ++++++---- fs/bio.c | 26 +++++++++++++++++++++----- include/linux/blkdev.h | 9 ++++++++- 7 files changed, 60 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3ba1df93e9e..589850cff35 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2633,11 +2633,12 @@ end_io: -static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *bvec) +static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) { struct pktcdvd_device *pd = q->queuedata; - sector_t zone = ZONE(bio->bi_sector, pd); - int used = ((bio->bi_sector - zone) << 9) + bio->bi_size; + sector_t zone = ZONE(bmd->bi_sector, pd); + int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; int remaining = (pd->settings.size << 9) - used; int remaining2; @@ -2645,7 +2646,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_v * A bio <= PAGE_SIZE must be allowed. If it crosses a packet * boundary, pkt_make_request() will split the bio. */ - remaining2 = PAGE_SIZE - bio->bi_size; + remaining2 = PAGE_SIZE - bmd->bi_size; remaining = max(remaining, remaining2); BUG_ON(remaining < 0); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 10748240cb2..6a866d7c8ae 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -50,17 +50,19 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /** * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; dev_info_t *dev0; - unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 914c04ddec7..bcbb82594a1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -241,18 +241,20 @@ static int create_strip_zones (mddev_t *mddev) /** * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. 
* * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a71277b640a..22bb2b1b886 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -439,26 +439,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. */ -static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54c8ee28fcc..9b00675dc64 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3319,15 +3319,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. 
*/ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; diff --git a/fs/bio.c b/fs/bio.c index 7761c84c703..88322b066ac 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -325,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { prev->bv_len += len; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, bio, prev) < len) { - prev->bv_len -= len; - return 0; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < len) { + prev->bv_len -= len; + return 0; + } } goto done; @@ -369,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * queue to get further control */ if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + /* * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, bio, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ab8acad5b6..ff9d0bdf2a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -254,7 +254,14 @@ typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); struct bio_vec; -typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); +struct bvec_merge_data { + struct block_device *bi_bdev; + sector_t bi_sector; + unsigned bi_size; + unsigned long bi_rw; +}; +typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, + struct bio_vec *); typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); -- cgit v1.2.3-70-g09d2 From 2e4bef41a0f7df31be140ef354b9c12f2299016a Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Tue, 24 Jun 2008 17:39:39 -0500 Subject: 9p: fix O_APPEND in legacy mode The legacy protocol's open operation doesn't handle an append operation (it is expected that the client take care of it). We were incorrectly passing the extended protocol's flag through even in legacy mode. This was reported in bugzilla report #10689. This patch fixes the problem by disallowing extended protocol open modes from being passed in legacy mode and implemented append functionality on the client side by adding a seek after the open. 
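Condensed from the diff below, the relevant part of the open path now reads roughly as follows (error handling, the fid-clone failure paths and the O_TRUNC case are omitted; this is a sketch, not the full function):

	omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
	fid = v9fs_fid_clone(file->f_path.dentry);
	err = p9_client_open(fid, omode);	/* legacy servers never see P9_OEXCL/P9_OAPPEND */

	/* the legacy protocol cannot append server-side, so position the file here */
	if ((file->f_flags & O_APPEND) && !v9fs_extended(v9ses))
		generic_file_llseek(file, 0, SEEK_END);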
Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs_vfs.h | 2 +- fs/9p/vfs_file.c | 4 +++- fs/9p/vfs_inode.c | 18 ++++++++++-------- 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index fd01d90cada..57997fa14e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); void v9fs_dentry_release(struct dentry *); -int v9fs_uflags2omode(int uflags); +int v9fs_uflags2omode(int uflags, int extended); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 0d55affe37d..52944d2249a 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags); + omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) inode->i_size = 0; inode->i_blocks = 0; } + if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) + generic_file_llseek(file, 0, SEEK_END); } file->private_data = fid; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 40fa807bd92..c95295c6504 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) /** * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits * @uflags: flags to convert - * + * @extended: if .u extensions are active */ -int v9fs_uflags2omode(int uflags) +int v9fs_uflags2omode(int uflags, int extended) { int ret; @@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags) break; } - if (uflags & O_EXCL) - ret |= P9_OEXCL; - if (uflags & O_TRUNC) ret |= P9_OTRUNC; - if (uflags & O_APPEND) - ret |= P9_OAPPEND; + if (extended) { + if (uflags & O_EXCL) + ret |= P9_OEXCL; + + if (uflags & O_APPEND) + ret |= P9_OAPPEND; + } return ret; } @@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, flags = O_RDWR; fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags)); + v9fs_uflags2omode(flags, v9fs_extended(v9ses))); if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; -- cgit v1.2.3-70-g09d2 From 8b3d3567f72aa61d5d6f4ce89d289b154e1ea866 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Fri, 4 Jul 2008 09:33:33 +0200 Subject: ramfs: enable splice write Signed-off-by: Octavian Purdila Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/ramfs/file-mmu.c | 1 + fs/ramfs/file-nommu.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 9590b902430..78f613cb9c7 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = { .mmap = generic_file_mmap, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0989bc2c2f6..52312ec93ff 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = { .aio_write = generic_file_aio_write, .fsync = 
simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; -- cgit v1.2.3-70-g09d2 From 32502b8413a77b54b9e19809404109590c32dfb7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 4 Jul 2008 09:35:17 +0200 Subject: splice: fix generic_file_splice_read() race with page invalidation If a page was invalidated during splicing from file to a pipe, then generic_file_splice_read() could return a short or zero count. This manifested itself in rare I/O errors seen on nfs exported fuse filesystems. This is because nfsd uses splice_direct_to_actor() to read files, and fuse uses invalidate_inode_pages2() to invalidate stale data on open. Fix by redoing the page find/create if it was found to be truncated (invalidated). Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index aa5f6f60b30..399442179d8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, lock_page(page); /* - * page was truncated, stop here. if this isn't the - * first page, we'll just complete what we already - * added + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. */ if (!page->mapping) { unlock_page(page); - break; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(pages[page_nr]); + pages[page_nr] = page; } /* * page was already under io and is now done, great -- cgit v1.2.3-70-g09d2 From f5c8f7dae75e1e6bb3200fc61302e4d5e2df3dc2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Jul 2008 09:59:33 -0700 Subject: ext3: add missing unlock to error path in ext3_quota_write() When write in ext3_quota_write() fails, we have to properly release i_mutex. One error path has been missing the unlock... Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index fe3119a71ad..2845425077e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2875,8 +2875,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT3_I(inode)->i_disksize = inode->i_size; -- cgit v1.2.3-70-g09d2 From 4d04e4fbf8fc9f5136a64d45e2c20de095c08efb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Jul 2008 09:59:34 -0700 Subject: ext4: add missing unlock to an error path in ext4_quota_write() When write in ext4_quota_write() fails, we have to properly release i_mutex. One error path has been missing the unlock... 
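The shape of the bug is the same in each of these quota_write() implementations: the function takes i_mutex up front, and the "nothing was written" early return forgot to drop it. Schematically (write_quota_block() is a made-up stand-in for the bread/journal-access/copy sequence, not a real helper):

	mutex_lock(&inode->i_mutex);
	while (towrite > 0) {
		err = write_quota_block(sb, type, data, &towrite, &blk);
		if (err)
			goto out;		/* early exit, i_mutex still held */
	}
out:
	if (len == towrite) {
		mutex_unlock(&inode->i_mutex);	/* the unlock this patch adds */
		return err;
	}
	if (inode->i_size < off + len - towrite)
		i_size_write(inode, off + len - towrite);
	mark_inode_dirty(inode);
	mutex_unlock(&inode->i_mutex);
	return len - towrite;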
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cb96f127c36..02bf2434397 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3337,8 +3337,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT4_I(inode)->i_disksize = inode->i_size; -- cgit v1.2.3-70-g09d2 From 10dd08dc04c881dcc9f7f19e2a3ad8e0778e4db5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Jul 2008 09:59:34 -0700 Subject: reiserfs: add missing unlock to an error path in reiserfs_quota_write() When write in reiserfs_quota_write() fails, we have to properly release i_mutex. One error path has been missing the unlock... Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ed424d708e6..1d40f2bd197 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2165,8 +2165,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); inode->i_version++; -- cgit v1.2.3-70-g09d2 From c4a2d7fbec3029c8891a3ad5fceec2992096a3b7 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Fri, 4 Jul 2008 09:59:35 -0700 Subject: ecryptfs: remove unnecessary mux from ecryptfs_init_ecryptfs_miscdev() The misc_mtx should provide all the protection required to keep the daemon hash table sane during miscdev registration. Since this mutex is causing gratuitous lockdep warnings, this patch removes it. Signed-off-by: Michael Halcrow Reported-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/miscdev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 50c994a249a..09a4522f65e 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -575,13 +575,11 @@ int ecryptfs_init_ecryptfs_miscdev(void) int rc; atomic_set(&ecryptfs_num_miscdev_opens, 0); - mutex_lock(&ecryptfs_daemon_hash_mux); rc = misc_register(&ecryptfs_miscdev); if (rc) printk(KERN_ERR "%s: Failed to register miscellaneous device " "for communications with userspace daemons; rc = [%d]\n", __func__, rc); - mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } -- cgit v1.2.3-70-g09d2 From 337e2ab5d1efca56c6fdd57bffaea7e7899e7283 Mon Sep 17 00:00:00 2001 From: Jess Guerrero Date: Fri, 4 Jul 2008 09:59:50 -0700 Subject: ntfs: update help text The url in the help text for ntfs should be updated. Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index cf12c403b8c..2694648cbd1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -830,7 +830,7 @@ config NTFS_FS from the project web site. For more information see - and . + and . To compile this file system support as a module, choose M here: the module will be called ntfs. 
-- cgit v1.2.3-70-g09d2 From 6d1029b56329b1cc9b7233e5333c1a48ddbbfad8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 4 Jul 2008 09:59:51 -0700 Subject: add kernel-doc for simple_read_from_buffer and memory_read_from_buffer Add kernel-doc comments describing simple_read_from_buffer and memory_read_from_buffer. Signed-off-by: Akinobu Mita Cc: Christoph Hellwig Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 892d41cb338..baeb71ee1cd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count) mntput(mnt); } +/** + * simple_read_from_buffer - copy data from the buffer to user space + * @to: the user space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The simple_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the user space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { @@ -528,6 +542,20 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, return count; } +/** + * memory_read_from_buffer - copy data from the buffer + * @to: the kernel space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The memory_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the kernel space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { -- cgit v1.2.3-70-g09d2 From 086f7316f0d400806d76323beefae996bb3849b1 Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Fri, 4 Jul 2008 09:59:58 -0700 Subject: security: filesystem capabilities: fix fragile setuid fixup code This commit includes a bugfix for the fragile setuid fixup code in the case that filesystem capabilities are supported (in access()). The effect of this fix is gated on filesystem capability support because changing securebits is only supported when filesystem capabilities support is configured.) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew G. 
Morgan Acked-by: Serge Hallyn Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 37 ++++++++++++++++++++++--------------- include/linux/capability.h | 2 ++ include/linux/securebits.h | 15 ++++++++------- kernel/capability.c | 21 +++++++++++++++++++++ 4 files changed, 53 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index a1450086e92..a99ad09c319 100644 --- a/fs/open.c +++ b/fs/open.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { struct nameidata nd; int old_fsuid, old_fsgid; - kernel_cap_t old_cap; + kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ int res; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ @@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) old_fsuid = current->fsuid; old_fsgid = current->fsgid; - old_cap = current->cap_effective; current->fsuid = current->uid; current->fsgid = current->gid; - /* - * Clear the capabilities if we switch to a non-root user - * - * FIXME: There is a race here against sys_capset. The - * capabilities can change yet we will restore the old - * value below. We should hold task_capabilities_lock, - * but we cannot because user_path_walk can sleep. - */ - if (current->uid) - cap_clear(current->cap_effective); - else - current->cap_effective = current->cap_permitted; + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + /* + * Clear the capabilities if we switch to a non-root user + */ +#ifndef CONFIG_SECURITY_FILE_CAPABILITIES + /* + * FIXME: There is a race here against sys_capset. The + * capabilities can change yet we will restore the old + * value below. We should hold task_capabilities_lock, + * but we cannot because user_path_walk can sleep. + */ +#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ + if (current->uid) + old_cap = cap_set_effective(__cap_empty_set); + else + old_cap = cap_set_effective(current->cap_permitted); + } res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); if (res) @@ -478,7 +483,9 @@ out_path_release: out: current->fsuid = old_fsuid; current->fsgid = old_fsgid; - current->cap_effective = old_cap; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) + cap_set_effective(old_cap); return res; } diff --git a/include/linux/capability.h b/include/linux/capability.h index fa830f8de03..02673846d20 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -501,6 +501,8 @@ extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_full_set; extern const kernel_cap_t __cap_init_eff_set; +kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); + int capable(int cap); int __capable(struct task_struct *t, int cap); diff --git a/include/linux/securebits.h b/include/linux/securebits.h index c1f19dbceb0..92f09bdf117 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -7,14 +7,15 @@ inheritance of root-permissions and suid-root executable under compatibility mode. We raise the effective and inheritable bitmasks *of the executable file* if the effective uid of the new process is - 0. If the real uid is 0, we raise the inheritable bitmask of the + 0. If the real uid is 0, we raise the effective (legacy) bit of the executable file. 
*/ #define SECURE_NOROOT 0 #define SECURE_NOROOT_LOCKED 1 /* make bit-0 immutable */ -/* When set, setuid to/from uid 0 does not trigger capability-"fixes" - to be compatible with old programs relying on set*uid to loose - privileges. When unset, setuid doesn't change privileges. */ +/* When set, setuid to/from uid 0 does not trigger capability-"fixup". + When unset, to provide compatiblility with old programs relying on + set*uid to gain/lose privilege, transitions to/from uid 0 cause + capabilities to be gained/lost. */ #define SECURE_NO_SETUID_FIXUP 2 #define SECURE_NO_SETUID_FIXUP_LOCKED 3 /* make bit-2 immutable */ @@ -26,10 +27,10 @@ #define SECURE_KEEP_CAPS 4 #define SECURE_KEEP_CAPS_LOCKED 5 /* make bit-4 immutable */ -/* Each securesetting is implemented using two bits. One bit specify +/* Each securesetting is implemented using two bits. One bit specifies whether the setting is on or off. The other bit specify whether the - setting is fixed or not. A setting which is fixed cannot be changed - from user-level. */ + setting is locked or not. A setting which is locked cannot be + changed from user-level. */ #define issecure_mask(X) (1 << (X)) #define issecure(X) (issecure_mask(X) & current->securebits) diff --git a/kernel/capability.c b/kernel/capability.c index cfbe4429948..901e0fdc3ff 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -121,6 +121,27 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) * uninteresting and/or not to be changed. */ +/* + * Atomically modify the effective capabilities returning the original + * value. No permission check is performed here - it is assumed that the + * caller is permitted to set the desired effective capabilities. + */ +kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) +{ + kernel_cap_t pE_old; + + spin_lock(&task_capability_lock); + + pE_old = current->cap_effective; + current->cap_effective = pE_new; + + spin_unlock(&task_capability_lock); + + return pE_old; +} + +EXPORT_SYMBOL(cap_set_effective); + /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and -- cgit v1.2.3-70-g09d2 From 695e12f8d2917378d3b93059209e17415de96204 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 4 Jul 2008 14:38:33 +0300 Subject: nfsd: tabulate nfs4 xdr encoding functions In preparation for minorversion 1 All encoders now return an nfserr status (typically their nfserr argument). Unsupported ops go through nfsd4_encode_operation too, so use nfsd4_encode_noop to encode nothing for their reply body. Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4xdr.c | 205 ++++++++++++++++++++++++------------------------------ 1 file changed, 91 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a40bec53fa6..9b6a9bafc6b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1950,7 +1950,7 @@ fail: return -EINVAL; } -static void +static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { ENCODE_HEAD; @@ -1961,9 +1961,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITE32(access->ac_resp_access); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { ENCODE_SEQID_OP_HEAD; @@ -1975,10 +1976,11 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c ADJUST_ARGS(); } ENCODE_SEQID_OP_TAIL(close->cl_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { ENCODE_HEAD; @@ -1988,9 +1990,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITEMEM(commit->co_verf.data, 8); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { ENCODE_HEAD; @@ -2003,6 +2006,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITE32(create->cr_bmval[1]); ADJUST_ARGS(); } + return nfserr; } static __be32 @@ -2023,9 +2027,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 return nfserr; } -static void -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp) +static __be32 +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { + struct svc_fh *fhp = *fhpp; unsigned int len; ENCODE_HEAD; @@ -2036,6 +2041,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh WRITEMEM(&fhp->fh_handle.fh_base, len); ADJUST_ARGS(); } + return nfserr; } /* @@ -2063,7 +2069,7 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie ADJUST_ARGS(); } -static void +static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { ENCODE_SEQID_OP_HEAD; @@ -2077,16 +2083,18 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo nfsd4_encode_lock_denied(resp, &lock->lk_denied); ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); + return nfserr; } -static void +static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + return nfserr; } -static void +static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { ENCODE_SEQID_OP_HEAD; @@ -2099,10 +2107,11 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l } ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { ENCODE_HEAD; @@ -2112,10 +2121,11 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li WRITECINFO(link->li_cinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { ENCODE_SEQID_OP_HEAD; @@ -2178,9 +2188,10 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op /* XXX save filehandle here */ out: ENCODE_SEQID_OP_TAIL(open->op_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { ENCODE_SEQID_OP_HEAD; @@ -2193,9 +2204,10 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct } ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); + return nfserr; } -static void +static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { ENCODE_SEQID_OP_HEAD; @@ -2208,6 +2220,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc } ENCODE_SEQID_OP_TAIL(od->od_stateowner); + return nfserr; } static __be32 @@ -2402,7 +2415,7 @@ err_no_verf: return nfserr; } -static void +static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { ENCODE_HEAD; @@ -2412,9 +2425,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITECINFO(remove->rm_cinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { ENCODE_HEAD; @@ -2425,9 +2439,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ WRITECINFO(rename->rn_tinfo); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { @@ -2491,13 +2506,14 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, out: if (exp) exp_put(exp); + return nfserr; } /* * The SETATTR encode routine is special -- it always encodes a bitmap, * regardless of the error status. 
*/ -static void +static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { ENCODE_HEAD; @@ -2514,9 +2530,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 WRITE32(setattr->sa_bmval[1]); } ADJUST_ARGS(); + return nfserr; } -static void +static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { ENCODE_HEAD; @@ -2533,9 +2550,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n WRITE32(0); ADJUST_ARGS(); } + return nfserr; } -static void +static __be32 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { ENCODE_HEAD; @@ -2547,8 +2565,56 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w WRITEMEM(write->wr_verifier.data, 8); ADJUST_ARGS(); } + return nfserr; } +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +{ + return nfserr; +} + +typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); + +static nfsd4_enc nfsd4_enc_ops[] = { + [OP_ACCESS] (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] (nfsd4_enc)nfsd4_encode_open, + [OP_OPEN_CONFIRM] (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM](nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] (nfsd4_enc)nfsd4_encode_noop, +}; + void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { @@ -2560,101 +2626,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) statp = p++; /* to be backfilled at the end */ ADJUST_ARGS(); - switch (op->opnum) { - case OP_ACCESS: - nfsd4_encode_access(resp, op->status, &op->u.access); - break; - case OP_CLOSE: - nfsd4_encode_close(resp, op->status, &op->u.close); - break; - case OP_COMMIT: - nfsd4_encode_commit(resp, op->status, &op->u.commit); - break; - case OP_CREATE: - nfsd4_encode_create(resp, op->status, &op->u.create); - break; - case OP_DELEGRETURN: - break; - case OP_GETATTR: - op->status = nfsd4_encode_getattr(resp, 
op->status, &op->u.getattr); - break; - case OP_GETFH: - nfsd4_encode_getfh(resp, op->status, op->u.getfh); - break; - case OP_LINK: - nfsd4_encode_link(resp, op->status, &op->u.link); - break; - case OP_LOCK: - nfsd4_encode_lock(resp, op->status, &op->u.lock); - break; - case OP_LOCKT: - nfsd4_encode_lockt(resp, op->status, &op->u.lockt); - break; - case OP_LOCKU: - nfsd4_encode_locku(resp, op->status, &op->u.locku); - break; - case OP_LOOKUP: - break; - case OP_LOOKUPP: - break; - case OP_NVERIFY: - break; - case OP_OPEN: - nfsd4_encode_open(resp, op->status, &op->u.open); - break; - case OP_OPEN_CONFIRM: - nfsd4_encode_open_confirm(resp, op->status, &op->u.open_confirm); - break; - case OP_OPEN_DOWNGRADE: - nfsd4_encode_open_downgrade(resp, op->status, &op->u.open_downgrade); - break; - case OP_PUTFH: - break; - case OP_PUTROOTFH: - break; - case OP_READ: - op->status = nfsd4_encode_read(resp, op->status, &op->u.read); - break; - case OP_READDIR: - op->status = nfsd4_encode_readdir(resp, op->status, &op->u.readdir); - break; - case OP_READLINK: - op->status = nfsd4_encode_readlink(resp, op->status, &op->u.readlink); - break; - case OP_REMOVE: - nfsd4_encode_remove(resp, op->status, &op->u.remove); - break; - case OP_RENAME: - nfsd4_encode_rename(resp, op->status, &op->u.rename); - break; - case OP_RENEW: - break; - case OP_RESTOREFH: - break; - case OP_SAVEFH: - break; - case OP_SECINFO: - nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo); - break; - case OP_SETATTR: - nfsd4_encode_setattr(resp, op->status, &op->u.setattr); - break; - case OP_SETCLIENTID: - nfsd4_encode_setclientid(resp, op->status, &op->u.setclientid); - break; - case OP_SETCLIENTID_CONFIRM: - break; - case OP_VERIFY: - break; - case OP_WRITE: - nfsd4_encode_write(resp, op->status, &op->u.write); - break; - case OP_RELEASE_LOCKOWNER: - break; - default: - break; - } - + if (op->opnum == OP_ILLEGAL) + goto status; + BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + !nfsd4_enc_ops[op->opnum]); + op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); +status: /* * Note: We write the status directly, instead of using WRITE32(), * since it is already in network byte order. -- cgit v1.2.3-70-g09d2 From 20cbc972617069c1ed434f62151e4de57d26ea46 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 5 Jul 2008 12:29:05 -0700 Subject: Fix clear_refs_write() use of struct mm_walk Don't use a static entry, so as to prevent races during concurrent use of this function. 
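Put differently, per-call walk state belongs on the caller's stack: a static struct mm_walk is shared by every writer of /proc/<pid>/clear_refs, so a concurrent write could clobber its fields mid-walk. A minimal sketch of the fixed pattern (the per-VMA loop is elided):

	/*
	 * Previously this was a function-level "static struct mm_walk",
	 * i.e. one instance shared by all callers.
	 */
	struct mm_walk clear_refs_walk = {
		.pmd_entry = clear_refs_pte_range,
		.mm        = mm,
	};

	clear_refs_walk.private = vma;		/* set per VMA inside the loop */
	walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk);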
Reported-by: Alexey Dobriyan Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ab8ccc9d14f..05053d701ac 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -476,10 +476,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - static struct mm_walk clear_refs_walk; - memset(&clear_refs_walk, 0, sizeof(clear_refs_walk)); - clear_refs_walk.pmd_entry = clear_refs_pte_range; - clear_refs_walk.mm = mm; + struct mm_walk clear_refs_walk = { + .pmd_entry = clear_refs_pte_range, + .mm = mm, + }; down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { clear_refs_walk.private = vma; -- cgit v1.2.3-70-g09d2 From 5d7e0d2bd98ef4f5a16ac9da1987ae655368dd6a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 5 Jul 2008 01:02:01 -0700 Subject: Fix pagemap_read() use of struct mm_walk Fix some issues in pagemap_read noted by Alexey: - initialize pagemap_walk.mm to "mm" , so the code starts working as advertised - initialize ->private to "&pm" so it wouldn't immediately oops in pagemap_pte_hole() - unstatic struct pagemap_walk, so two threads won't fsckup each other (including those started by root, including flipping ->mm when you don't have permissions) - pagemap_read() contains two calls to ptrace_may_attach(), second one looks unneeded. - avoid possible kmalloc(0) and integer wraparound. Cc: Alexey Dobriyan Cc: Matt Mackall Signed-off-by: Andrew Morton [ Personally, I'd just remove the functionality entirely - Linus ] Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 72 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 05053d701ac..c492449f3b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -602,11 +602,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } -static struct mm_walk pagemap_walk = { - .pmd_entry = pagemap_pte_range, - .pte_hole = pagemap_pte_hole -}; - /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -641,6 +636,11 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, struct pagemapread pm; int pagecount; int ret = -ESRCH; + struct mm_walk pagemap_walk; + unsigned long src; + unsigned long svpfn; + unsigned long start_vaddr; + unsigned long end_vaddr; if (!task) goto out; @@ -659,11 +659,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!mm) goto out_task; - ret = -ENOMEM; + uaddr = (unsigned long)buf & PAGE_MASK; uend = (unsigned long)(buf + count); pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; - pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); + ret = 0; + if (pagecount == 0) + goto out_mm; + pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + ret = -ENOMEM; if (!pages) goto out_mm; @@ -684,33 +688,33 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.out = (u64 *)buf; pm.end = (u64 *)(buf + count); - if (!ptrace_may_attach(task)) { - ret = -EIO; - } else { - unsigned long src = *ppos; - unsigned long svpfn = src / PM_ENTRY_BYTES; - unsigned long start_vaddr = svpfn << PAGE_SHIFT; - unsigned long end_vaddr = TASK_SIZE_OF(task); - - /* watch out for wraparound */ - if (svpfn > TASK_SIZE_OF(task) >> 
PAGE_SHIFT) - start_vaddr = end_vaddr; - - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ - ret = walk_page_range(start_vaddr, end_vaddr, - &pagemap_walk); - if (ret == PM_END_OF_BUFFER) - ret = 0; - /* don't need mmap_sem for these, but this looks cleaner */ - *ppos += (char *)pm.out - buf; - if (!ret) - ret = (char *)pm.out - buf; - } + pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pte_hole = pagemap_pte_hole; + pagemap_walk.mm = mm; + pagemap_walk.private = ± + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + start_vaddr = svpfn << PAGE_SHIFT; + end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); + if (ret == PM_END_OF_BUFFER) + ret = 0; + /* don't need mmap_sem for these, but this looks cleaner */ + *ppos += (char *)pm.out - buf; + if (!ret) + ret = (char *)pm.out - buf; out_pages: for (; pagecount; pagecount--) { -- cgit v1.2.3-70-g09d2 From 265d529cef6fd57698d79b3c0edd3a8178059ea6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 7 Jul 2008 10:02:36 +0100 Subject: [GFS2] Fix delayed demote race There is a race in the delayed demote code where it does the wrong thing if a demotion to UN has occurred for other reasons before the delay has expired. This patch adds an assert to catch that condition as well as fixing the root cause by adding an additional check for the UN state. Signed-off-by: Steven Whitehouse Cc: Bob Peterson --- fs/gfs2/glock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8d5450f3c3e..cd0aa213fb2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -587,6 +587,7 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) if (nonblock) goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) @@ -617,7 +618,9 @@ static void glock_work_func(struct work_struct *work) if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) finish_xmote(gl, gl->gl_reply); spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) { + if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state != LM_ST_EXCLUSIVE) { unsigned long holdtime, now = jiffies; holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) -- cgit v1.2.3-70-g09d2 From 209806aba9d540dde3db0a5ce72307f85f33468f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 7 Jul 2008 10:07:28 +0100 Subject: [GFS2] Allow local DF locks when holding a cached EX glock We already allow local SH locks while we hold a cached EX glock, so here we allow DF locks as well. This works only because we rely on the VFS's invalidation for locally cached data, and because if we hold an EX lock, then we know that no other node can be caching data relating to this file. 
It dramatically speeds up initial writes to O_DIRECT files since we fall back to buffered I/O for this and would otherwise bounce between DF and EX modes on each and every write call. The lessons to be learned from that are to ensure that (for the time being anyway) O_DIRECT files are preallocated and that they are written to using reasonably large I/O sizes. Even so this change fixes that corner case nicely Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index cd0aa213fb2..13391e54661 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -267,8 +267,12 @@ static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holde return 1; if (gh->gh_flags & GL_EXACT) return 0; - if (gh->gh_state == LM_ST_SHARED && gl->gl_state == LM_ST_EXCLUSIVE) - return 1; + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) + return 1; + if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) + return 1; + } if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) return 1; return 0; -- cgit v1.2.3-70-g09d2 From 18c6ac383f3e46cfce08d0bf972705852a4e1268 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 7 Jul 2008 10:06:29 -0700 Subject: [PATCH] ocfs2/dlm: Fixes oops in dlm_new_lockres() Patch fixes a race that can result in an oops while adding a lockres to the dlm lockres tracking list. Bug introduced by mainline commit 29576f8bb54045be944ba809d4fca1ad77c94165. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index efc015c6128..44f87caf368 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; + spin_lock(&dlm->spinlock); list_add_tail(&res->tracking, &dlm->tracking_list); + spin_unlock(&dlm->spinlock); memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); -- cgit v1.2.3-70-g09d2 From 4f83aa302f8f8b42397c6d3703d670f0588c03ec Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Jul 2008 15:02:02 -0400 Subject: nfsd: document open share bit tracking It's not immediately obvious from the code why we're doing this. Signed-off-by: J. Bruce Fields Cc: Benny Halevy --- fs/nfsd/nfs4state.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eca8aaa450f..c29b6ed2a0b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1173,6 +1173,24 @@ static inline int deny_valid(u32 x) return x <= NFS4_SHARE_DENY_BOTH; } +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. 
+ * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ static void set_access(unsigned int *access, unsigned long bmap) { int i; -- cgit v1.2.3-70-g09d2 From e518f0560a191269bd345178c899c790eb1ad4c8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 4 Jul 2008 15:38:41 +0300 Subject: nfsd: take file and mnt write in nfs4_upgrade_open testing with newpynfs revealed this warning: Jul 3 07:32:50 buml kernel: writeable file with no mnt_want_write() Jul 3 07:32:50 buml kernel: ------------[ cut here ]------------ Jul 3 07:32:50 buml kernel: WARNING: at /usr0/export/dev/bhalevy/git/linux-pnfs-bh-nfs41/include/linux/fs.h:855 drop_file_write_access+0x6b/0x7e() Jul 3 07:32:50 buml kernel: Modules linked in: nfsd auth_rpcgss exportfs nfs lockd nfs_acl sunrpc Jul 3 07:32:50 buml kernel: Call Trace: Jul 3 07:32:50 buml kernel: 6eaadc88: [<6002f471>] warn_on_slowpath+0x54/0x8e Jul 3 07:32:50 buml kernel: 6eaadcc8: [<601b790d>] printk+0xa0/0x793 Jul 3 07:32:50 buml kernel: 6eaadd38: [<601b6205>] __mutex_lock_slowpath+0x1db/0x1ea Jul 3 07:32:50 buml kernel: 6eaadd68: [<7107d4d5>] nfs4_preprocess_seqid_op+0x2a6/0x31c [nfsd] Jul 3 07:32:50 buml kernel: 6eaadda8: [<60078dc9>] drop_file_write_access+0x6b/0x7e Jul 3 07:32:50 buml kernel: 6eaaddc8: [<710804e4>] nfsd4_open_downgrade+0x114/0x1de [nfsd] Jul 3 07:32:50 buml kernel: 6eaade08: [<71076215>] nfsd4_proc_compound+0x1ba/0x2dc [nfsd] Jul 3 07:32:50 buml kernel: 6eaade48: [<71068221>] nfsd_dispatch+0xe5/0x1c2 [nfsd] Jul 3 07:32:50 buml kernel: 6eaade88: [<71312f81>] svc_process+0x3fd/0x714 [sunrpc] Jul 3 07:32:50 buml kernel: 6eaadea8: [<60039a81>] kernel_sigprocmask+0xf3/0x100 Jul 3 07:32:50 buml kernel: 6eaadee8: [<7106874b>] nfsd+0x182/0x29b [nfsd] Jul 3 07:32:50 buml kernel: 6eaadf48: [<60021cc9>] run_kernel_thread+0x41/0x4a Jul 3 07:32:50 buml kernel: 6eaadf58: [<710685c9>] nfsd+0x0/0x29b [nfsd] Jul 3 07:32:50 buml kernel: 6eaadf98: [<60021cb0>] run_kernel_thread+0x28/0x4a Jul 3 07:32:50 buml kernel: 6eaadfc8: [<60013829>] new_thread_handler+0x72/0x9c Jul 3 07:32:50 buml kernel: Jul 3 07:32:50 buml kernel: ---[ end trace 2426dd7cb2fba3bf ]--- Bruce Fields suggested this (Thanks!): maybe we need to be doing a mnt_want_write on open_upgrade and mnt_put_write on downgrade? This patch adds a call to mnt_want_write and file_take_write (which is doing the actual work). The counter-calls mnt_drop_write a file_release_write are now being properly called by drop_file_write_access in the exact path printed by the warning above. Signed-off-by: Benny Halevy Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c29b6ed2a0b..1578d7a2667 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1588,6 +1588,10 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta int err = get_write_access(inode); if (err) return nfserrno(err); + err = mnt_want_write(cur_fh->fh_export->ex_path.mnt); + if (err) + return nfserrno(err); + file_take_write(filp); } status = nfsd4_truncate(rqstp, cur_fh, open); if (status) { -- cgit v1.2.3-70-g09d2 From ce0c0e50f94e8c55b00a722e8c6e8d6c802be211 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 2 May 2008 11:46:49 +0200 Subject: x86, generic: CPA add statistics about state of direct mapping v4 Add information about the mapping state of the direct mapping to /proc/meminfo. I chose /proc/meminfo because that is where all the other memory statistics are too and it is a generally useful metric even outside debugging situations. A lot of split kernel pages means the kernel will run slower. This way we can see how many large pages are really used for it and how many are split. Useful for general insight into the kernel. v2: Add hotplug locking to 64bit to plug a very obscure theoretical race. 32bit doesn't need it because it doesn't support hotadd for lowmem. Fix some typos v3: Rename dpages_cnt Add CONFIG ifdef for count update as requested by tglx Expand description v4: Fix stupid bugs added in v3 Move update_page_count to pageattr.c Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 5 +++++ arch/x86/mm/init_64.c | 7 +++++++ arch/x86/mm/pageattr.c | 35 +++++++++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 7 +++++++ include/asm-x86/pgtable.h | 3 +++ 5 files changed, 57 insertions(+) (limited to 'fs') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10154b..0269ac230bf 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -162,6 +162,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte; + unsigned pages_2m = 0, pages_4k = 0; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -197,6 +198,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) is_kernel_text(addr2)) prot = PAGE_KERNEL_LARGE_EXEC; + pages_2m++; set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; @@ -213,11 +215,14 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) if (is_kernel_text(addr)) prot = PAGE_KERNEL_EXEC; + pages_4k++; set_pte(pte, pfn_pte(pfn, prot)); } max_pfn_mapped = pfn; } } + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); } static inline int page_kills_ppro(unsigned long pagenr) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad973b1..5e438385905 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -312,6 +312,8 @@ __meminit void early_iounmap(void *addr, unsigned long size) static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { + unsigned long pages = 0; + int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { @@ -328,9 +330,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) if (pmd_val(*pmd)) continue; + pages++; set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } + update_page_count(PG_LEVEL_2M, pages); return 
address; } @@ -350,6 +354,7 @@ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { + unsigned long pages = 0; unsigned long last_map_addr = end; int i = pud_index(addr); @@ -374,6 +379,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) } if (direct_gbpages) { + pages++; set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); last_map_addr = (addr & PUD_MASK) + PUD_SIZE; @@ -390,6 +396,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) unmap_low_page(pmd); } __flush_tlb_all(); + update_page_count(PG_LEVEL_1G, pages); return last_map_addr >> PAGE_SHIFT; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b6a37..668205bca15 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -34,6 +34,19 @@ struct cpa_data { unsigned force_split : 1; }; +static unsigned long direct_pages_count[PG_LEVEL_NUM]; + +void __meminit update_page_count(int level, unsigned long pages) +{ +#ifdef CONFIG_PROC_FS + unsigned long flags; + /* Protect against CPA */ + spin_lock_irqsave(&pgd_lock, flags); + direct_pages_count[level] += pages; + spin_unlock_irqrestore(&pgd_lock, flags); +#endif +} + #ifdef CONFIG_X86_64 static inline unsigned long highmap_start_pfn(void) @@ -500,6 +513,12 @@ static int split_large_page(pte_t *kpte, unsigned long address) for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + if (address >= (unsigned long)__va(0) && + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) { + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; + } + /* * Install the new, split up pagetable. Important details here: * @@ -1029,6 +1048,22 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifdef CONFIG_PROC_FS +int arch_report_meminfo(char *page) +{ + int n; + n = sprintf(page, "DirectMap4k: %8lu\n" + "DirectMap2M: %8lu\n", + direct_pages_count[PG_LEVEL_4K], + direct_pages_count[PG_LEVEL_2M]); +#ifdef CONFIG_X86_64 + n += sprintf(page + n, "DirectMap1G: %8lu\n", + direct_pages_count[PG_LEVEL_1G]); +#endif + return n; +} +#endif + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 7e277f2ad46..15f7bd41029 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +int __attribute__((weak)) arch_report_meminfo(char *page) +{ + return 0; +} + static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, len += hugetlb_report_meminfo(page + len); + len += arch_report_meminfo(page + len); + return proc_calc_metrics(page, start, off, count, eof, len); #undef K } diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 97c271b2910..111564fa5e7 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -369,8 +369,11 @@ enum { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_NUM }; +void update_page_count(int level, unsigned long pages); + /* * Helper function that returns the kernel pagetable entry controlling * the virtual address 'address'. 
NULL means no pagetable entry present. -- cgit v1.2.3-70-g09d2 From 383bc5cecc2ed0b8f44a25488660b03030425ef7 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Thu, 26 Jun 2008 09:23:21 +0200 Subject: x86, crashdump, /proc/vmcore: remove CONFIG_EXPERIMENTAL from kdump I would suggest to remove the "experimental" status from Kdump. Kdump is now in the kernel since a long time and used by Enterprise distributions. I don't think that "experimental" is true any more. Signed-off-by: Bernhard Walle Cc: vgoyal@redhat.com Cc: kexec@lists.infradead.org Cc: Bernhard Walle Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - fs/Kconfig | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 112afd368c7..25ff7548d3d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1269,7 +1269,6 @@ config KEXEC config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" - depends on EXPERIMENTAL depends on X86_64 || (X86_32 && HIGHMEM) help Generate crash dump after being started by kexec. diff --git a/fs/Kconfig b/fs/Kconfig index 2694648cbd1..313b2e06ded 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -930,7 +930,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. -- cgit v1.2.3-70-g09d2 From 2aac05a91971fbd1bf6cbed78b8731eb7454b9b7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Jul 2008 13:26:10 -0400 Subject: NFS: Fix readdir cache invalidation invalidate_inode_pages2_range() takes page offset arguments, not byte ranges. Another thought is that individual pages might perhaps get evicted by VM pressure, in which case we might perhaps want to re-read not only the evicted page, but all subsequent pages too (in case the server returns more/less data per page so that the alignment of the next entry changes). We should therefore remove the condition that we only do this on page->index==0. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 58d43daec08..982a2064fe4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -204,7 +204,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) * Note: assumes we have exclusive access to this mapping either * through inode->i_mutex or some other mechanism. */ - if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) { + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { /* Should never happen */ nfs_zap_mapping(inode, inode->i_mapping); } -- cgit v1.2.3-70-g09d2 From eb35c218d83ec0780d9db869310f2e333f628702 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 8 Jul 2008 14:37:06 -0400 Subject: reiserfs: discard prealloc in reiserfs_delete_inode With the removal of struct file from the xattr code, reiserfs_file_release() isn't used anymore, so the prealloc isn't discarded. This causes hangs later down the line. This patch adds it to reiserfs_delete_inode. In most cases it will be a no-op due to it already having been called, but will avoid hangs with xattrs. 
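The underlying pattern here is that block preallocation is normally torn down from the last-close (->release) path; once a code path stops going through a struct file, nothing ever runs that hook, so the reservation must also be dropped when the inode itself is destroyed. A minimal sketch of that idempotent-discard pattern, using made-up names rather than the real reiserfs helpers:

/* Illustration only: an idempotent prealloc discard callable from both the
 * file-release path and the inode-deletion path. */
struct demo_inode_info {
	unsigned long prealloc_blocks;	/* blocks still reserved for this inode */
};

static void demo_discard_prealloc(struct demo_inode_info *info)
{
	if (info->prealloc_blocks == 0)
		return;				/* already discarded: no-op */
	/* ...return the reserved blocks to the block allocator here... */
	info->prealloc_blocks = 0;
}

static int demo_file_release(struct demo_inode_info *info)
{
	demo_discard_prealloc(info);		/* normal close of an open file */
	return 0;
}

static void demo_delete_inode(struct demo_inode_info *info)
{
	/* Covers callers (such as the xattr code after its struct file
	 * removal) that never open a file, so ->release never runs.
	 * Harmless double call when ->release already did the work. */
	demo_discard_prealloc(info);
}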
Signed-off-by: Jeff Mahoney Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 57917932212..192269698a8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode) goto out; reiserfs_update_inode_transaction(inode); + reiserfs_discard_prealloc(&th, inode); + err = reiserfs_delete_object(&th, inode); /* Do quota update inside a transaction for journaled quotas. We must do that -- cgit v1.2.3-70-g09d2 From 57dc9a5747942f131a1a8b36d661fec6e6a5e9f6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Jun 2008 15:35:32 -0400 Subject: NFS: Reduce the stack usage in NFSv4 create operations Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 204 ++++++++++++++++++++++++++---------------------------- 1 file changed, 97 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1293e0acd82..26fbeb3467c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2079,47 +2079,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n return err; } +struct nfs4_createdata { + struct rpc_message msg; + struct nfs4_create_arg arg; + struct nfs4_create_res res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_fattr; +}; + +static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, + struct qstr *name, struct iattr *sattr, u32 ftype) +{ + struct nfs4_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + struct nfs_server *server = NFS_SERVER(dir); + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->arg.dir_fh = NFS_FH(dir); + data->arg.server = server; + data->arg.name = name; + data->arg.attrs = sattr; + data->arg.ftype = ftype; + data->arg.bitmask = server->attr_bitmask; + data->res.server = server; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_fattr = &data->dir_fattr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_fattr); + } + return data; +} + +static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) +{ + int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + if (status == 0) { + update_changeattr(dir, &data->res.dir_cinfo); + nfs_post_op_update_inode(dir, data->res.dir_fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + } + return status; +} + +static void nfs4_free_createdata(struct nfs4_createdata *data) +{ + kfree(data); +} + static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .ftype = NF4LNK, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENAMETOOLONG; if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; + goto out; - arg.u.symlink.pages = 
&page; - arg.u.symlink.len = len; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + status = -ENOMEM; + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; + data->arg.u.symlink.pages = &page; + data->arg.u.symlink.len = len; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2140,39 +2174,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .ftype = NF4DIR, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENOMEM; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); - - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); + if (data == NULL) + goto out; + + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2242,56 +2254,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fh; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fh, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; - int mode = sattr->ia_mode; - - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + struct nfs4_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; BUG_ON(!(sattr->ia_valid & ATTR_MODE)); BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); + + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); + if (data == NULL) + goto out; + if (S_ISFIFO(mode)) - arg.ftype = NF4FIFO; + data->arg.ftype = NF4FIFO; else if (S_ISBLK(mode)) { - arg.ftype = NF4BLK; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); + data->arg.ftype = NF4BLK; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); } else if (S_ISCHR(mode)) { - arg.ftype = NF4CHR; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); + data->arg.ftype = NF4CHR; + 
data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); } - else - arg.ftype = NF4SOCK; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (status == 0) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fh, &fattr); - } + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } -- cgit v1.2.3-70-g09d2 From 0b4aae7aad162ad175ba8a65708332f888066b26 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Jun 2008 17:00:23 -0400 Subject: NFS: Reduce the stack usage in NFSv3 create operations Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 271 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 142 insertions(+), 129 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c3523ad03ed..cf7d4e5927d 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -248,6 +248,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, return status; } +struct nfs3_createdata { + struct rpc_message msg; + union { + struct nfs3_createargs create; + struct nfs3_mkdirargs mkdir; + struct nfs3_symlinkargs symlink; + struct nfs3_mknodargs mknod; + } arg; + struct nfs3_diropres res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_attr; +}; + +static struct nfs3_createdata *nfs3_alloc_createdata(void) +{ + struct nfs3_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_attr = &data->dir_attr; + nfs_fattr_init(data->res.fattr); + nfs_fattr_init(data->res.dir_attr); + } + return data; +} + +static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +{ + int status; + + status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + nfs_post_op_update_inode(dir, data->res.dir_attr); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + return status; +} + +static void nfs3_free_createdata(struct nfs3_createdata *data) +{ + kfree(data); +} + /* * Create a regular file. * For now, we don't implement O_EXCL. 
@@ -256,70 +303,60 @@ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nameidata *nd) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_fattr dir_attr; - struct nfs3_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; - int status; + int status = -ENOMEM; dprintk("NFS call create %s\n", dentry->d_name.name); - arg.createmode = NFS3_CREATE_UNCHECKED; + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; + data->arg.create.fh = NFS_FH(dir); + data->arg.create.name = dentry->d_name.name; + data->arg.create.len = dentry->d_name.len; + data->arg.create.sattr = sattr; + + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; if (flags & O_EXCL) { - arg.createmode = NFS3_CREATE_EXCLUSIVE; - arg.verifier[0] = jiffies; - arg.verifier[1] = current->pid; + data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; + data->arg.create.verifier[0] = jiffies; + data->arg.create.verifier[1] = current->pid; } sattr->ia_mode &= ~current->fs->umask; -again: - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_refresh_inode(dir, &dir_attr); + for (;;) { + status = nfs3_do_create(dir, dentry, data); - /* If the server doesn't support the exclusive creation semantics, - * try again with simple 'guarded' mode. */ - if (status == -ENOTSUPP) { - switch (arg.createmode) { + if (status != -ENOTSUPP) + break; + /* If the server doesn't support the exclusive creation + * semantics, try again with simple 'guarded' mode. */ + switch (data->arg.create.createmode) { case NFS3_CREATE_EXCLUSIVE: - arg.createmode = NFS3_CREATE_GUARDED; + data->arg.create.createmode = NFS3_CREATE_GUARDED; break; case NFS3_CREATE_GUARDED: - arg.createmode = NFS3_CREATE_UNCHECKED; + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; break; case NFS3_CREATE_UNCHECKED: goto out; } - goto again; + nfs_fattr_init(data->res.dir_attr); + nfs_fattr_init(data->res.fattr); } - if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); if (status != 0) goto out; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. 
*/ - if (arg.createmode == NFS3_CREATE_EXCLUSIVE) { + if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { dprintk("NFS call setattr (post-create)\n"); if (!(sattr->ia_valid & ATTR_ATIME_SET)) @@ -330,14 +367,15 @@ again: /* Note: we could use a guarded setattr here, but I'm * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ - status = nfs3_proc_setattr(dentry, &fattr, sattr); - nfs_post_op_update_inode(dentry->d_inode, &fattr); + status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); + nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); + if (status != 0) + goto out; } - if (status != 0) - goto out; status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); return status; } @@ -452,40 +490,28 @@ static int nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_symlinkargs arg = { - .fromfh = NFS_FH(dir), - .fromname = dentry->d_name.name, - .fromlen = dentry->d_name.len, - .pages = &page, - .pathlen = len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs3_createdata *data; + int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) return -ENAMETOOLONG; dprintk("NFS call symlink %s\n", dentry->d_name.name); - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; + data->arg.symlink.fromfh = NFS_FH(dir); + data->arg.symlink.fromname = dentry->d_name.name; + data->arg.symlink.fromlen = dentry->d_name.len; + data->arg.symlink.pages = &page; + data->arg.symlink.pathlen = len; + data->arg.symlink.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + + nfs3_free_createdata(data); out: dprintk("NFS reply symlink: %d\n", status); return status; @@ -494,42 +520,31 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mkdirargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; int mode = sattr->ia_mode; - int status; + int status = -ENOMEM; dprintk("NFS call mkdir %s\n", dentry->d_name.name); sattr->ia_mode &= ~current->fs->umask; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; + data->arg.mkdir.fh = NFS_FH(dir); + data->arg.mkdir.name = 
dentry->d_name.name; + data->arg.mkdir.len = dentry->d_name.len; + data->arg.mkdir.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); if (status != 0) goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); return status; } @@ -615,52 +630,50 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_fh fh; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mknodargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - .rdev = rdev - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fh, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs3_createdata *data; mode_t mode = sattr->ia_mode; - int status; - - switch (sattr->ia_mode & S_IFMT) { - case S_IFBLK: arg.type = NF3BLK; break; - case S_IFCHR: arg.type = NF3CHR; break; - case S_IFIFO: arg.type = NF3FIFO; break; - case S_IFSOCK: arg.type = NF3SOCK; break; - default: return -EINVAL; - } + int status = -ENOMEM; dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); sattr->ia_mode &= ~current->fs->umask; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fh, &fattr); + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; + data->arg.mknod.fh = NFS_FH(dir); + data->arg.mknod.name = dentry->d_name.name; + data->arg.mknod.len = dentry->d_name.len; + data->arg.mknod.sattr = sattr; + data->arg.mknod.rdev = rdev; + + switch (sattr->ia_mode & S_IFMT) { + case S_IFBLK: + data->arg.mknod.type = NF3BLK; + break; + case S_IFCHR: + data->arg.mknod.type = NF3CHR; + break; + case S_IFIFO: + data->arg.mknod.type = NF3FIFO; + break; + case S_IFSOCK: + data->arg.mknod.type = NF3SOCK; + break; + default: + status = -EINVAL; + goto out; + } + + status = nfs3_do_create(dir, dentry, data); if (status != 0) goto out; status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: + nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); return status; } -- cgit v1.2.3-70-g09d2 From f3d47a3a6a1484a93c8cfe1e8c8d4399c95199c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jun 2008 15:17:39 -0400 Subject: NFS: Fix a preemption count leak in nfs_update_request The commit 2785259631697ebb0749a3782cca206e2e542939 (nfs: use GFP_NOFS preloads for radix-tree insertion) appears to have introduced a bug: We only want to call radix_tree_preload() once after creating a request. Calling it every time we loop after we created the request, will cause preemption count leaks. 
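The leak comes from the calling convention: a successful radix_tree_preload() returns with preemption disabled (so the per-CPU preloaded nodes cannot be stolen), and each success must be balanced by exactly one radix_tree_preload_end(). Preloading again on every pass through the retry loop therefore disables preemption once per iteration while only re-enabling it once. A simplified sketch of the intended once-per-new-request pairing, not the actual nfs_update_request() control flow:

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

/* Simplified pairing: preload once per item, insert under the lock, then
 * balance the preload exactly once regardless of the insert result. */
static int demo_insert(struct radix_tree_root *root, unsigned long index,
		       void *item, spinlock_t *lock)
{
	int error;

	error = radix_tree_preload(GFP_NOFS);	/* preemption disabled on success */
	if (error)
		return error;

	spin_lock(lock);
	error = radix_tree_insert(root, index, item);
	spin_unlock(lock);

	radix_tree_preload_end();		/* re-enables preemption */
	return error;
}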
Signed-off-by: Trond Myklebust Cc: Nick Piggin --- fs/nfs/write.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f333848fd3b..dc62bc50469 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -584,13 +584,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, /* Loop over all inode entries and see if we find * A request for the page we wish to update */ - if (new) { - if (radix_tree_preload(GFP_NOFS)) { - nfs_release_request(new); - return ERR_PTR(-ENOMEM); - } - } - spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); if (req) { @@ -630,6 +623,10 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, new = nfs_create_request(ctx, inode, page, offset, bytes); if (IS_ERR(new)) return new; + if (radix_tree_preload(GFP_NOFS)) { + nfs_release_request(new); + return ERR_PTR(-ENOMEM); + } } /* We have a request for our page. -- cgit v1.2.3-70-g09d2 From 2116271a347d1181b5497602c2bfada1de8fd53b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 May 2008 19:34:39 -0400 Subject: NFS: Add correct bounds checking to NFSv2 locks NFSv2 file locking currently fails the Connectathon tests, because the calls to the VFS locking code do not return an EINVAL error if the struct file_lock overflows the 32-bit boundaries. The problem is due to the fact that we occasionally call helpers from fs/locks.c in order to avoid RPC calls to the server when we know that a local process holds the lock. These helpers are, of course, always 64-bit enabled, so EINVAL is not returned in cases when it would if the call had gone to the NLM code. For consistency, we therefore add support for a bounds-checking helper. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 20 +++++++++++++++----- fs/nfs/proc.c | 24 ++++++++++++++++++++++++ include/linux/nfs_xdr.h | 1 + 3 files changed, 40 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d84a3d8f32a..7c73f06692b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -593,6 +593,7 @@ out: static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode * inode = filp->f_mapping->host; + int ret = -ENOLCK; dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", inode->i_sb->s_id, inode->i_ino, @@ -602,13 +603,22 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) /* No mandatory locks over NFS */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; + goto out_err; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } if (IS_GETLK(cmd)) - return do_getlk(filp, cmd, fl); - if (fl->fl_type == F_UNLCK) - return do_unlk(filp, cmd, fl); - return do_setlk(filp, cmd, fl); + ret = do_getlk(filp, cmd, fl); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl); + else + ret = do_setlk(filp, cmd, fl); +out_err: + return ret; } /* diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 03599bfe81c..5c35b02857f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -598,6 +598,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } +/* Helper functions for NFS lock bounds checking */ +#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) +static int nfs_lock_check_bounds(const struct file_lock *fl) +{ + __s32 start, end; + + start = (__s32)fl->fl_start; + if ((loff_t)start != 
fl->fl_start) + goto out_einval; + + if (fl->fl_end != OFFSET_MAX) { + end = (__s32)fl->fl_end; + if ((loff_t)end != fl->fl_end) + goto out_einval; + } else + end = NFS_LOCK32_OFFSET_MAX; + + if (start < 0 || start > end) + goto out_einval; + return 0; +out_einval: + return -EINVAL; +} const struct nfs_rpc_ops nfs_v2_clientops = { .version = 2, /* protocol version */ @@ -633,4 +656,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .file_open = nfs_open, .file_release = nfs_release, .lock = nfs_proc_lock, + .lock_check_bounds = nfs_lock_check_bounds, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 24263bb8e0b..8d780de371f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -832,6 +832,7 @@ struct nfs_rpc_ops { int (*file_open) (struct inode *, struct file *); int (*file_release) (struct inode *, struct file *); int (*lock)(struct file *, int, struct file_lock *); + int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); }; -- cgit v1.2.3-70-g09d2 From efc91ed0191e3fc62bb1c556ac93fc4e661214d2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 18:31:00 -0400 Subject: NFS: Optimise append writes with holes If a file is being extended, and we're creating a hole, we might as well declare the entire page to be up to date. This patch significantly improves the write performance for sparse files in the case where lseek(SEEK_END) is used to append several non-contiguous writes at intervals of < PAGE_SIZE. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 20 ++++++++++++++++++++ fs/nfs/write.c | 12 +++--------- 2 files changed, 23 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7c73f06692b..7ac89a845a5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -344,6 +344,26 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, unsigned offset = pos & (PAGE_CACHE_SIZE - 1); int status; + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. + */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + lock_kernel(); status = nfs_updatepage(file, page, offset, copied); unlock_kernel(); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index dc62bc50469..eea2d2b5278 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -616,7 +616,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, spin_unlock(&inode->i_lock); radix_tree_preload_end(); req = new; - goto zero_page; + goto out; } spin_unlock(&inode->i_lock); @@ -649,19 +649,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = max(end, rqend) - req->wb_offset; - goto zero_page; + goto out; } if (end > rqend) req->wb_bytes = end - req->wb_offset; - return req; -zero_page: - /* If this page might potentially be marked as up to date, - * then we need to zero any uninitalised data. 
*/ - if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE - && !PageUptodate(req->wb_page)) - zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); +out: return req; } -- cgit v1.2.3-70-g09d2 From 7e5f6146609eb9134fac7d1b6bfee43df1732188 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 25 May 2008 12:59:19 -0400 Subject: NFS: Revert commit 44dd151d Revert commit 44dd151d "NFS: Don't mark a written page as uptodate until it is on disk". While it is true that the write may fail, that is always the case. There is no reason why we should treat data on pages that are not already marked as PG_uptodate as being special. The only thing we gain is a noticeable slowdown when re-reading these pages. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index eea2d2b5278..ee6fcdecb87 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -188,6 +188,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, } /* Update file length */ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); nfs_clear_page_tag_locked(req); return 0; } @@ -743,12 +744,7 @@ int nfs_updatepage(struct file *file, struct page *page, static void nfs_writepage_release(struct nfs_page *req) { - if (PageError(req->wb_page)) { - nfs_end_page_writeback(req->wb_page); - nfs_inode_remove_request(req); - } else if (!nfs_reschedule_unstable_write(req)) { - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes); + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { nfs_end_page_writeback(req->wb_page); nfs_inode_remove_request(req); } else @@ -1069,8 +1065,6 @@ static void nfs_writeback_release_full(void *calldata) dprintk(" marked for commit\n"); goto next; } - /* Set the PG_uptodate flag? */ - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); dprintk(" OK\n"); remove_request: nfs_end_page_writeback(page); @@ -1309,9 +1303,6 @@ static void nfs_commit_release(void *calldata) * returned by the server against all stored verfs. */ if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { /* We have a match */ - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, - req->wb_bytes); nfs_inode_remove_request(req); dprintk(" OK\n"); goto next; -- cgit v1.2.3-70-g09d2 From b5418383ef13f70528281546d02c15edc03d8567 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 18:31:02 -0400 Subject: NFS: do_setlk(): don't flush caches when we have a delegation Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7ac89a845a5..0213c21038f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -602,7 +602,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - nfs_zap_caches(inode); + if (!nfs_have_delegation(inode, FMODE_READ)) + nfs_zap_caches(inode); out: return status; } -- cgit v1.2.3-70-g09d2 From 6fb1bc10303c0d88f635d014324432ab6ee49d1b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 May 2008 17:09:04 -0400 Subject: NFS: Update help text for CONFIG_NFS_FS Clean up: refresh the help text for Kconfig items related to the NFS client. 
Remove obsolete URLs, and make the language consistent among the options. Also move the ROOT_NFS config option next to the options related to the NFS client. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/Kconfig | 115 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 57 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2694648cbd1..1c16de9611e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1544,10 +1544,6 @@ config UFS_FS The recently released UFS2 variant (used in FreeBSD 5.x) is READ-ONLY supported. - If you only intend to mount files from some other Unix over the - network using NFS, you don't need the UFS file system support (but - you need NFS file system support obviously). - Note that this option is generally not needed for floppies, since a good portable way to transport files and directories between unixes (and even other operating systems) is given by the tar program ("man @@ -1587,6 +1583,7 @@ menuconfig NETWORK_FILESYSTEMS Say Y here to get to see options for network filesystems and filesystem-related networking code, such as NFS daemon and RPCSEC security modules. + This option alone does not add any kernel code. If you say N, all options in this submenu will be skipped and @@ -1595,76 +1592,92 @@ menuconfig NETWORK_FILESYSTEMS if NETWORK_FILESYSTEMS config NFS_FS - tristate "NFS file system support" + tristate "NFS client support" depends on INET select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL help - If you are connected to some other (usually local) Unix computer - (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing - on that computer (the NFS server) using the Network File Sharing - protocol, say Y. "Mounting files" means that the client can access - the files with usual UNIX commands as if they were sitting on the - client's hard disk. For this to work, the server must run the - programs nfsd and mountd (but does not need to have NFS file system - support enabled in its kernel). NFS is explained in the Network - Administrator's Guide, available from - , on its man page: "man - nfs", and in the NFS-HOWTO. - - A superior but less widely used alternative to NFS is provided by - the Coda file system; see "Coda file system support" below. + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. - If you say Y here, you should have said Y to TCP/IP networking also. - This option would enlarge your kernel by about 27 KB. + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. - To compile this file system support as a module, choose M here: the - module will be called nfs. + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. - If you are configuring a diskless machine which will mount its root - file system over NFS at boot time, say Y here and to "Kernel - level IP autoconfiguration" above and to "Root file system on NFS" - below. 
You cannot compile this driver as a module in this case. - There are two packages designed for booting diskless machines over - the net: netboot, available from - , and Etherboot, - available from . + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. You cannot compile this file system as a + module in this case. - If you don't know what all this is about, say N. + If unsure, say N. config NFS_V3 - bool "Provide NFSv3 client support" + bool "NFS client support for NFS version 3" depends on NFS_FS help - Say Y here if you want your NFS client to be able to speak version - 3 of the NFS protocol. + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. If unsure, say Y. config NFS_V3_ACL - bool "Provide client support for the NFSv3 ACL protocol extension" + bool "NFS client support for the NFSv3 ACL protocol extension" depends on NFS_V3 help - Implement the NFSv3 ACL protocol extension for manipulating POSIX - Access Control Lists. The server should also be compiled with - the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option. + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. If unsure, say N. config NFS_V4 - bool "Provide NFSv4 client support (EXPERIMENTAL)" + bool "NFS client support for NFS version 4 (EXPERIMENTAL)" depends on NFS_FS && EXPERIMENTAL select RPCSEC_GSS_KRB5 help - Say Y here if you want your NFS client to be able to speak the newer - version 4 of the NFS protocol. + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. - Note: Requires auxiliary userspace daemons which may be found on - http://www.citi.umich.edu/projects/nfsv4/ + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. If unsure, say N. +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + . + + Most people say N here. + config NFSD tristate "NFS server support" depends on INET @@ -1746,20 +1759,6 @@ config NFSD_V4 If unsure, say N. -config ROOT_NFS - bool "Root file system on NFS" - depends on NFS_FS=y && IP_PNP - help - If you want your Linux box to mount its whole root file system (the - one containing the directory /) from some other computer over the - net via NFS (presumably because your box doesn't have a hard disk), - say Y. Read for - details. 
It is likely that in this case, you also want to say Y to - "Kernel level IP autoconfiguration" so that your box can discover - its network address at boot time. - - Most people say N here. - config LOCKD tristate -- cgit v1.2.3-70-g09d2 From 549177863bac22f23663ee9f70c4e3b9fb269f2c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 27 May 2008 16:29:07 -0400 Subject: NFS: Make nfs_fsync methods consistent Clean up: Report the same debugging info, count function calls the same, and use similar function naming in nfs_fsync_dir() and nfs_fsync(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 ++- fs/nfs/file.c | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 982a2064fe4..5d73fbd6707 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -629,10 +629,11 @@ out: */ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) { - dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", + dfprintk(VFS, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); + nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0213c21038f..1789de218cc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); @@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = { .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, - .fsync = nfs_fsync, + .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, .splice_read = nfs_file_splice_read, @@ -181,7 +181,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) } /* - * Helper for nfs_file_flush() and nfs_fsync() + * Helper for nfs_file_flush() and nfs_file_fsync() * * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to * disk, but it retrieves and clears ctx->error after synching, despite @@ -296,12 +296,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * whether any write errors occurred for this process. */ static int -nfs_fsync(struct file *file, struct dentry *dentry, int datasync) +nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dfprintk(VFS, "NFS: fsync file(%s/%s) datasync %d\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); return nfs_do_fsync(ctx, inode); -- cgit v1.2.3-70-g09d2 From b84e06c58fdefdc42931f771dc295e63f4b27365 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:34 -0400 Subject: NFS: Make nfs_llseek methods consistent Clean up: Report the same debugging info in nfs_llseek_dir() and nfs_llseek_file(). 
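In the llseek hunks below, the bare numeric case labels are the conventional whence values (SEEK_SET = 0, SEEK_CUR = 1, SEEK_END = 2). For reference, a generic helper showing that mapping; it is not lifted from the NFS code:

/* Reference sketch: conventional interpretation of the llseek 'origin'. */
static long long demo_resolve_offset(long long offset, long long cur_pos,
				     long long file_size, int origin)
{
	switch (origin) {
	case 0:			/* SEEK_SET: absolute offset */
		return offset;
	case 1:			/* SEEK_CUR: relative to current position */
		return cur_pos + offset;
	case 2:			/* SEEK_END: relative to end of file */
		return file_size + offset;
	default:
		return -1;	/* callers map this to -EINVAL */
	}
}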
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 12 ++++++++++-- fs/nfs/file.c | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5d73fbd6707..24571067bf7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -603,7 +603,15 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + + dfprintk(VFS, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, + offset, origin); + + mutex_lock(&inode->i_mutex); switch (origin) { case 1: offset += filp->f_pos; @@ -619,7 +627,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) nfs_file_open_context(filp)->dir_cookie = 0; } out: - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); return offset; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1789de218cc..cef36362c9e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,6 +170,11 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { + dfprintk(VFS, "NFS: llseek file(%s/%s, %lld, %d)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, + offset, origin); + /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; -- cgit v1.2.3-70-g09d2 From cc0dd2d1052aede2946ad1338a8f6f5d5c604740 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:42 -0400 Subject: NFS: Make nfs_open methods consistent Clean up: Report the same debugging info and count function calls the same for files and directories in nfs_opendir() and nfs_file_open(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 7 +++++-- fs/nfs/file.c | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 24571067bf7..c962233c094 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,8 +133,11 @@ nfs_opendir(struct inode *inode, struct file *filp) { int res; - dfprintk(VFS, "NFS: opendir(%s/%ld)\n", - inode->i_sb->s_id, inode->i_ino); + dfprintk(VFS, "NFS: open dir(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); lock_kernel(); /* Call generic open code in order to cache credentials */ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cef36362c9e..202408d96ee 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -119,6 +119,10 @@ nfs_file_open(struct inode *inode, struct file *filp) { int res; + dfprintk(VFS, "NFS: open file(%s/%s)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); + res = nfs_check_flags(filp->f_flags); if (res) return res; -- cgit v1.2.3-70-g09d2 From b7eaefaa8722fd98e5c2632640d1abd2b0c83e84 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:50 -0400 Subject: NFS: Add debugging facility for NFS aops Recent work in fs/nfs/file.c neglected to add appropriate trace debugging for the NFS client's address space operations. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 202408d96ee..6c447a37ede 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -335,6 +335,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page *page; index = pos >> PAGE_CACHE_SHIFT; + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + page = __grab_cache_page(mapping, index); if (!page) return -ENOMEM; @@ -355,6 +360,11 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, unsigned offset = pos & (PAGE_CACHE_SIZE - 1); int status; + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + /* * Zero any uninitialised parts of the page, and then mark the page * as up to date if it turns out that we're extending the file. @@ -389,6 +399,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, static void nfs_invalidate_page(struct page *page, unsigned long offset) { + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + if (offset != 0) return; /* Cancel any unstarted writes on this page */ @@ -397,13 +409,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) static int nfs_release_page(struct page *page, gfp_t gfp) { + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + /* If PagePrivate() is set, then the page is not freeable */ return 0; } static int nfs_launder_page(struct page *page) { - return nfs_wb_page(page->mapping->host, page); + struct inode *inode = page->mapping->host; + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + return nfs_wb_page(inode, page); } const struct address_space_operations nfs_file_aops = { @@ -423,13 +442,19 @@ const struct address_space_operations nfs_file_aops = { static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) { struct file *filp = vma->vm_file; + struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; int ret = -EINVAL; struct address_space *mapping; + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", + dentry->d_parent->d_name.name, dentry->d_name.name, + filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + lock_page(page); mapping = page->mapping; - if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) + if (mapping != dentry->d_inode->i_mapping) goto out_unlock; ret = 0; -- cgit v1.2.3-70-g09d2 From 6da24bc9cfc645c619992e39aab09747164c9f14 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:55:58 -0400 Subject: NFS: Use NFSDBG_FILE for all fops Clean up: some fops use NFSDBG_FILE, some use NFSDBG_VFS. Let's use NFSDBG_FILE for all fops, and consistently report file names instead of inode numbers. 
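The facility argument to dfprintk() selects a bit in the client's global debug mask, which is what gets toggled from userspace (typically via /proc/sys/sunrpc/nfs_debug or the rpcdebug utility), so standardizing on one facility for all file operations means a single bit enables every one of these traces. A conceptual sketch of that gating; the real dfprintk()/NFSDBG_* definitions live in the NFS and sunrpc debug headers, and the bit values here are invented for illustration:

#include <linux/kernel.h>

static unsigned int demo_nfs_debug;	/* stands in for the global nfs_debug mask */

#define DEMO_NFSDBG_VFS		0x0001	/* illustrative values only */
#define DEMO_NFSDBG_FILE	0x0040

#define demo_dfprintk(facility, fmt, ...)				\
do {									\
	if (demo_nfs_debug & (facility))				\
		printk(KERN_DEBUG fmt, ##__VA_ARGS__);			\
} while (0)

/* With open, llseek, read, write, lock, etc. all using the FILE facility,
 * setting that one bit in the mask turns on all of the file-op traces. */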
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 +++++----- fs/nfs/direct.c | 4 ++-- fs/nfs/file.c | 59 +++++++++++++++++++++++++++++++++++---------------------- 3 files changed, 43 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c962233c094..b1940660502 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,7 +133,7 @@ nfs_opendir(struct inode *inode, struct file *filp) { int res; - dfprintk(VFS, "NFS: open dir(%s/%s)\n", + dfprintk(FILE, "NFS: open dir(%s/%s)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name); @@ -531,7 +531,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct nfs_fattr fattr; long res; - dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", + dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, (long long)filp->f_pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); @@ -598,7 +598,7 @@ out: unlock_kernel(); if (res > 0) res = 0; - dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", + dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", dentry->d_parent->d_name.name, dentry->d_name.name, res); return res; @@ -609,7 +609,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - dfprintk(VFS, "NFS: llseek dir(%s/%s, %lld, %d)\n", + dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, offset, origin); @@ -640,7 +640,7 @@ out: */ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) { - dfprintk(VFS, "NFS: fsync dir(%s/%s) datasync %d\n", + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4757a2b326a..08f6b040d28 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", + dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, (long long) pos); @@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); - dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", + dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, (long long) pos); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6c447a37ede..78b26f875ee 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -119,7 +119,7 @@ nfs_file_open(struct inode *inode, struct file *filp) { int res; - dfprintk(VFS, "NFS: open file(%s/%s)\n", + dprintk("NFS: open file(%s/%s)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name); @@ -137,9 +137,15 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { + struct dentry *dentry = filp->f_path.dentry; + + dprintk("NFS: release(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + /* Ensure that dirty pages are flushed out with the right creds */ if (filp->f_mode 
& FMODE_WRITE) - nfs_wb_all(filp->f_path.dentry->d_inode); + nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return NFS_PROTO(inode)->file_release(inode, filp); } @@ -174,7 +180,7 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { - dfprintk(VFS, "NFS: llseek file(%s/%s, %lld, %d)\n", + dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name, offset, origin); @@ -216,16 +222,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) /* * Flush all dirty pages, and check for write errors. - * */ static int nfs_file_flush(struct file *file, fl_owner_t id) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = file->f_path.dentry->d_inode; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; int status; - dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dprintk("NFS: flush(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); if ((file->f_mode & FMODE_WRITE) == 0) return 0; @@ -250,7 +258,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); - dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", + dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long) pos); @@ -270,7 +278,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, struct inode *inode = dentry->d_inode; ssize_t res; - dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", + dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long long) *ppos); @@ -287,7 +295,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) struct inode *inode = dentry->d_inode; int status; - dfprintk(VFS, "nfs: mmap(%s/%s)\n", + dprintk("NFS: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); status = nfs_revalidate_mapping(inode, file->f_mapping); @@ -310,7 +318,7 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - dfprintk(VFS, "NFS: fsync file(%s/%s) datasync %d\n", + dprintk("NFS: fsync file(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); @@ -502,9 +510,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_write(iocb, iov, nr_segs, pos); - dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", + dprintk("NFS: write(%s/%s, %lu@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - inode->i_ino, (unsigned long) count, (long long) pos); + (unsigned long) count, (long long) pos); result = -EBUSY; if (IS_SWAPFILE(inode)) @@ -649,13 +657,15 @@ out: */ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; + struct inode *inode = filp->f_mapping->host; int ret = -ENOLCK; - dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", - inode->i_sb->s_id, inode->i_ino, + dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags, (long long)fl->fl_start, (long long)fl->fl_end); + nfs_inc_stats(inode, NFSIOS_VFSLOCK); 
/* No mandatory locks over NFS */ @@ -683,9 +693,9 @@ out_err: */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - filp->f_path.dentry->d_inode->i_sb->s_id, - filp->f_path.dentry->d_inode->i_ino, + dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); /* @@ -708,12 +718,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) return do_setlk(filp, cmd, fl); } +/* + * There is no protocol support for leases, so we have no way to implement + * them correctly in the face of opens by other clients. + */ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) { - /* - * There is no protocol support for leases, so we have no way - * to implement them correctly in the face of opens by other - * clients. - */ + dprintk("NFS: setlease(%s/%s, arg=%ld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, arg); + return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 48186c7d5734a6b137f9186b37f6dc98097d0429 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Jun 2008 17:56:05 -0400 Subject: NFS: Fix trace debugging nits in write.c Clean up: fix a few dprintk messages that still need to show the RPC task ID correctly, and be sure we use the preferred %lld or %llu instead of %Ld or %Lu. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ee6fcdecb87..21d8a48b624 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -713,10 +713,10 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); - dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", + dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, - (long long)(page_offset(page) +offset)); + (long long)(page_offset(page) + offset)); /* If we're not using byte range locks, and we know the page * is up to date, it may be more efficient to extend the write @@ -736,7 +736,7 @@ int nfs_updatepage(struct file *file, struct page *page, else __set_page_dirty_nobuffers(page); - dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", + dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; } @@ -821,7 +821,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, NFS_PROTO(inode)->write_setup(data, &msg); dprintk("NFS: %5u initiated write call " - "(req %s/%Ld, %u bytes @ offset %Lu)\n", + "(req %s/%lld, %u bytes @ offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -965,13 +965,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_page *req = data->req; - dprintk("NFS: write (%s/%Ld %d@%Ld)", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), - req->wb_bytes, - (long long)req_offset(req)); + dprintk("NFS: %5u write(%s/%lld %d@%lld)", + task->tk_pid, + data->req->wb_context->path.dentry->d_inode->i_sb->s_id, + (long long) + NFS_FILEID(data->req->wb_context->path.dentry->d_inode), + data->req->wb_bytes, (long 
long)req_offset(data->req)); nfs_writeback_done(task, data); } @@ -1045,7 +1045,8 @@ static void nfs_writeback_release_full(void *calldata) nfs_list_remove_request(req); - dprintk("NFS: write (%s/%Ld %d@%Ld)", + dprintk("NFS: %5u write (%s/%lld %d@%lld)", + data->task.tk_pid, req->wb_context->path.dentry->d_inode->i_sb->s_id, (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, @@ -1118,7 +1119,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) static unsigned long complain; if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" + dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); @@ -1287,7 +1288,7 @@ static void nfs_commit_release(void *calldata) dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - dprintk("NFS: commit (%s/%Ld %d@%Ld)", + dprintk("NFS: commit (%s/%lld %d@%lld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, -- cgit v1.2.3-70-g09d2 From c2d946e55e1be20a7443918e3b7497b905e6b3f7 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 20 May 2008 01:08:00 +0300 Subject: fs/nfs/nfsroot.c: remove CVS keyword This patch removes a CVS keyword that wasn't updated for a long time from a comment. Signed-off-by: Adrian Bunk Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 531379d3682..15afe3a6c5b 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -1,6 +1,4 @@ /* - * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $ - * * Copyright (C) 1995, 1996 Gero Kuhlmann * * Allow an NFS filesystem to be mounted as root. The way this works is: -- cgit v1.2.3-70-g09d2 From 48b605f83c920d8daa50e43fc2c7f718e04c7bfa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Jun 2008 15:38:39 -0400 Subject: NFS: implement option checking when remounting NFS filesystems (resend) When remounting an NFS or NFS4 filesystem, the new NFS options are not respected, yet the remount will still return success. This patch adds a remount_fs sb op for NFS that checks any new nfs mount options against the existing ones and fails the mount if any have changed. This is only implemented for string-based mount options since doing this with binary options isn't really feasible. This is essentially the same as the original patch I sent out, but adds a check to see if the addr= option has changed. 
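As an illustration of how the new check is reached from user space (a hypothetical sketch, not part of this patch): mount(2) with MS_REMOUNT hands the option string to the new nfs_remount handler, which compares the parsed values against the existing mount and returns EINVAL if any differ. The mount point and the rsize value below are placeholder assumptions, and the program needs root plus an existing NFS mount to do anything interesting.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>

int main(void)
{
	/* Placeholder mount point; "rsize=8192" is assumed to differ from
	 * the value the filesystem was originally mounted with. */
	if (mount("none", "/mnt/nfs", NULL, MS_REMOUNT, "rsize=8192") == -1)
		fprintf(stderr, "remount rejected: %s\n", strerror(errno));
	else
		printf("remount accepted (options unchanged)\n");
	return 0;
}

With the patch applied, a changed string option surfaces here as EINVAL rather than being silently ignored.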
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 614efeed543..b550e921f66 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -207,6 +207,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); static void nfs_put_super(struct super_block *); +static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, @@ -234,6 +235,7 @@ static const struct super_operations nfs_sops = { .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, }; #ifdef CONFIG_NFS_V4 @@ -278,6 +280,7 @@ static const struct super_operations nfs4_sops = { .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, }; #endif @@ -1396,6 +1399,79 @@ out_invalid_fh: return -EINVAL; } +static int +nfs_compare_remount_data(struct nfs_server *nfss, + struct nfs_parsed_mount_data *data) +{ + if (data->flags != nfss->flags || + data->rsize != nfss->rsize || + data->wsize != nfss->wsize || + data->retrans != nfss->client->cl_timeout->to_retries || + data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || + data->acregmin != nfss->acregmin / HZ || + data->acregmax != nfss->acregmax / HZ || + data->acdirmin != nfss->acdirmin / HZ || + data->acdirmax != nfss->acdirmax / HZ || + data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen) != 0) + return -EINVAL; + + return 0; +} + +static int +nfs_remount(struct super_block *sb, int *flags, char *raw_data) +{ + int error; + struct nfs_server *nfss = sb->s_fs_info; + struct nfs_parsed_mount_data *data; + struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; + struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + + /* + * Userspace mount programs that send binary options generally send + * them populated with default values. We have no way to know which + * ones were explicitly specified. Fall back to legacy behavior and + * just return success. 
+ */ + if ((sb->s_type == &nfs4_fs_type && options4->version == 1) || + (sb->s_type == &nfs_fs_type && options->version >= 1 && + options->version <= 6)) + return 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* fill out struct with values from existing mount */ + data->flags = nfss->flags; + data->rsize = nfss->rsize; + data->wsize = nfss->wsize; + data->retrans = nfss->client->cl_timeout->to_retries; + data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; + data->acregmin = nfss->acregmin / HZ; + data->acregmax = nfss->acregmax / HZ; + data->acdirmin = nfss->acdirmin / HZ; + data->acdirmax = nfss->acdirmax / HZ; + data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen); + + /* overwrite those values with any that were specified */ + error = nfs_parse_mount_options((char *)options, data); + if (error < 0) + goto out; + + /* compare new mount options with old ones */ + error = nfs_compare_remount_data(nfss, data); +out: + kfree(data); + return error; +} + /* * Initialise the common bits of the superblock */ -- cgit v1.2.3-70-g09d2 From b6b6152c46861dd914d0e6cea9c27df057d6e235 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 9 Jun 2008 16:51:31 -0400 Subject: rpc: bring back cl_chatty The cl_chatty flag allows us to control whether a given rpc client leaves "server X not responding, timed out" messages in the syslog. Such messages make sense for ordinary nfs clients (where an unresponsive server means applications on the mountpoint are probably hanging), but not for the callback client (which can fail more commonly, with the only result being that some optimizations are disabled). Previously cl_chatty was removed, due to lack of users; reinstate it, and use it for the nfsd's callback client. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfsd/nfs4callback.c | 2 +- include/linux/sunrpc/clnt.h | 4 +++- net/sunrpc/clnt.c | 16 +++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4d4760e687c..702fa577aa6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -381,7 +381,7 @@ static int do_probe_callback(void *data) .program = &cb_program, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS...
*/ - .flags = (RPC_CLNT_CREATE_NOPING), + .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), }; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 6fff7f82ef1..764fd4c286e 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -42,7 +42,8 @@ struct rpc_clnt { unsigned int cl_softrtry : 1,/* soft timeouts */ cl_discrtry : 1,/* disconnect before retry */ - cl_autobind : 1;/* use getport() */ + cl_autobind : 1,/* use getport() */ + cl_chatty : 1;/* be verbose */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ @@ -114,6 +115,7 @@ struct rpc_create_args { #define RPC_CLNT_CREATE_NONPRIVPORT (1UL << 3) #define RPC_CLNT_CREATE_NOPING (1UL << 4) #define RPC_CLNT_CREATE_DISCRTRY (1UL << 5) +#define RPC_CLNT_CREATE_QUIET (1UL << 6) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0530eea37b5..09631f6e30e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -324,6 +324,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) clnt->cl_autobind = 1; if (args->flags & RPC_CLNT_CREATE_DISCRTRY) clnt->cl_discrtry = 1; + if (!(args->flags & RPC_CLNT_CREATE_QUIET)) + clnt->cl_chatty = 1; return clnt; } @@ -1149,7 +1151,8 @@ call_status(struct rpc_task *task) rpc_exit(task, status); break; default: - printk("%s: RPC call returned error %d\n", + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", clnt->cl_protname, -status); rpc_exit(task, status); } @@ -1174,7 +1177,8 @@ call_timeout(struct rpc_task *task) task->tk_timeouts++; if (RPC_IS_SOFT(task)) { - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, timed out\n", clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); return; @@ -1182,7 +1186,8 @@ call_timeout(struct rpc_task *task) if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, still trying\n", clnt->cl_protname, clnt->cl_server); } rpc_force_rebind(clnt); @@ -1213,8 +1218,9 @@ call_decode(struct rpc_task *task) task->tk_pid, task->tk_status); if (task->tk_flags & RPC_CALL_MAJORSEEN) { - printk(KERN_NOTICE "%s: server %s OK\n", - clnt->cl_protname, clnt->cl_server); + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); task->tk_flags &= ~RPC_CALL_MAJORSEEN; } -- cgit v1.2.3-70-g09d2 From 659bfcd6dd88919a5ad453f62afbeffcb3106847 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Jun 2008 19:39:41 -0400 Subject: NFS: Fix the ftruncate() credential problem ftruncate() access checking is supposed to be performed at open() time, just like reads and writes. 
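For context, a small user-space sketch of the POSIX rule being restored (illustrative only; the path is a placeholder on an assumed NFS mount): once a descriptor has been opened for writing, ftruncate() on that descriptor relies on the access check made at open() time, so the NFS client should issue the SETATTR with the credential attached to that open file rather than looking one up afresh.

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(void)
{
	/* Placeholder path on an NFS mount. */
	int fd = open("/mnt/nfs/scratch", O_WRONLY | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Even if write permission disappears after the open, truncation
	 * through this descriptor should still be allowed, because write
	 * access was checked when the file was opened. */
	fchmod(fd, 0444);
	if (ftruncate(fd, 0) < 0)
		perror("ftruncate");
	else
		printf("truncated using the open file's credentials\n");
	close(fd);
	return 0;
}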
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ++-- fs/nfs/nfs3proc.c | 2 ++ fs/nfs/nfs4proc.c | 47 +++++++++++++++++++++++------------------------ fs/nfs/proc.c | 2 ++ 4 files changed, 29 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 596c5d8e86f..2e4ab4a5e10 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -347,7 +347,7 @@ out_no_inode: goto out; } -#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET) +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) int nfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -369,7 +369,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) /* Optimization: if the end result is no change, don't RPC */ attr->ia_valid &= NFS_VALID_ATTRS; - if (attr->ia_valid == 0) + if ((attr->ia_valid & ~ATTR_FILE) == 0) return 0; lock_kernel(); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index cf7d4e5927d..b9c2d995332 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26fbeb3467c..31a7e4c54a1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1139,8 +1139,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int return res; } -static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_setattrargs arg = { @@ -1154,9 +1155,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, .server = server, }; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred, }; unsigned long timestamp = jiffies; int status; @@ -1166,7 +1168,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { /* Use that stateid */ } else if (state != NULL) { - msg.rpc_cred = state->owner->so_cred; nfs4_copy_stateid(&arg.stateid, state, current->files); } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); @@ -1177,15 +1178,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, return status; } -static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_do_setattr(inode, fattr, sattr, state), + _nfs4_do_setattr(inode, cred, fattr, sattr, state), &exception); } while (exception.retry); return err; @@ -1647,29 +1649,25 @@ static int nfs4_proc_setattr(struct dentry 
*dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct rpc_cred *cred; struct inode *inode = dentry->d_inode; - struct nfs_open_context *ctx; + struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; int status; nfs_fattr_init(fattr); - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - /* Search for an existing open(O_WRITE) file */ - ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); - if (ctx != NULL) + if (sattr->ia_valid & ATTR_FILE) { + struct nfs_open_context *ctx; + + ctx = nfs_file_open_context(sattr->ia_file); + cred = ctx->cred; state = ctx->state; + } - status = nfs4_do_setattr(inode, fattr, sattr, state); + status = nfs4_do_setattr(inode, cred, fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(inode, sattr); - if (ctx != NULL) - put_nfs_open_context(ctx); - put_rpccred(cred); return status; } @@ -1897,17 +1895,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } state = nfs4_do_open(dir, &path, flags, sattr, cred); - put_rpccred(cred); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); - goto out; + goto out_putcred; } d_add(dentry, igrab(state->inode)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (flags & O_EXCL) { struct nfs_fattr fattr; - status = nfs4_do_setattr(state->inode, &fattr, sattr, state); + status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(state->inode, sattr); nfs_post_op_update_inode(state->inode, &fattr); @@ -1916,6 +1913,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs4_intent_set_file(nd, &path, state); else nfs4_close_sync(&path, state, flags); +out_putcred: + put_rpccred(cred); out: return status; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 5c35b02857f..c7605587d0e 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, sattr->ia_mode &= S_IALLUGO; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) -- cgit v1.2.3-70-g09d2 From 46cb650c224bb8e64a749090105d74b9e8eda669 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 16:32:46 -0400 Subject: NFS: Remove the redundant file_open entry from struct nfs_rpc_ops All instances are set to nfs_open(), so we should just remove the redundant indirection. 
Ditto for the file_release op Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ++-- fs/nfs/nfs3proc.c | 2 -- fs/nfs/nfs4proc.c | 2 -- fs/nfs/proc.c | 2 -- include/linux/nfs_xdr.h | 2 -- 5 files changed, 2 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 78b26f875ee..509dcb58959 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -129,7 +129,7 @@ nfs_file_open(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); lock_kernel(); - res = NFS_PROTO(inode)->file_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } @@ -147,7 +147,7 @@ nfs_file_release(struct inode *inode, struct file *filp) if (filp->f_mode & FMODE_WRITE) nfs_wb_all(dentry->d_inode); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return NFS_PROTO(inode)->file_release(inode, filp); + return nfs_release(inode, filp); } /** diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index b9c2d995332..1e750e4574a 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -816,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_done = nfs3_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 31a7e4c54a1..dfdd19a6d17 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3714,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_done = nfs4_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, }; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c7605587d0e..4dbb84df1b6 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -655,8 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .write_setup = nfs_proc_write_setup, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8d780de371f..8c77c11224d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -829,8 +829,6 @@ struct nfs_rpc_ops { int (*write_done) (struct rpc_task *, struct nfs_write_data *); void (*commit_setup) (struct nfs_write_data *, struct rpc_message *); int (*commit_done) (struct rpc_task *, struct nfs_write_data *); - int (*file_open) (struct inode *, struct file *); - int (*file_release) (struct inode *, struct file *); int (*lock)(struct file *, int, struct file_lock *); int (*lock_check_bounds)(const struct file_lock *); void (*clear_acl_cache)(struct inode *); -- cgit v1.2.3-70-g09d2 From 34e8f92831cb5c40b3137e47a3daf4c09016ef02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:32:25 -0400 Subject: NFS: Move fs/nfs/iostat.h to include/linux The fs/nfs/iostat.h header has definitions that were designed to be exposed to user space. Move these definitions under include/linux so user space can use the definitions in applications that read /proc/self/mountstats. Also address a handful of coding style issues called out by checkpatch.pl in fs/nfs/iostat.h. 
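To show why user-space visibility matters, here is a minimal sketch of a mountstats reader. Two assumptions are made for illustration: the new header is visible to user space as <linux/nfs_iostat.h> (for example via installed kernel headers or a local copy), and the "bytes:" line of /proc/self/mountstats lists the eight byte counters in the same order as enum nfs_stat_bytecounters; neither point is spelled out in this patch itself.

#include <stdio.h>
#include <linux/nfs_iostat.h>	/* NFSIOS_* index constants */

int main(void)
{
	unsigned long long b[__NFSIOS_BYTESMAX];
	char line[1024];
	FILE *f = fopen("/proc/self/mountstats", "r");

	if (f == NULL) {
		perror("/proc/self/mountstats");
		return 1;
	}
	while (fgets(line, sizeof(line), f) != NULL) {
		/* One "bytes:" line is emitted per NFS mount, assumed to be
		 * in enum order. */
		if (sscanf(line, " bytes: %llu %llu %llu %llu %llu %llu %llu %llu",
			   &b[0], &b[1], &b[2], &b[3],
			   &b[4], &b[5], &b[6], &b[7]) != 8)
			continue;
		printf("app read %llu, app wrote %llu, wire read %llu, wire wrote %llu\n",
		       b[NFSIOS_NORMALREADBYTES], b[NFSIOS_NORMALWRITTENBYTES],
		       b[NFSIOS_SERVERREADBYTES], b[NFSIOS_SERVERWRITTENBYTES]);
	}
	fclose(f);
	return 0;
}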
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/iostat.h | 119 +++++---------------------------------------- include/linux/nfs_iostat.h | 119 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 106 deletions(-) create mode 100644 include/linux/nfs_iostat.h (limited to 'fs') diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 6350ecbde58..2ec65e12bfe 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -5,135 +5,41 @@ * * Copyright (C) 2005, 2006 Chuck Lever * - * NFS client per-mount statistics provide information about the health of - * the NFS client and the health of each NFS mount point. Generally these - * are not for detailed problem diagnosis, but simply to indicate that there - * is a problem. - * - * These counters are not meant to be human-readable, but are meant to be - * integrated into system monitoring tools such as "sar" and "iostat". As - * such, the counters are sampled by the tools over time, and are never - * zeroed after a file system is mounted. Moving averages can be computed - * by the tools by taking the difference between two instantaneous samples - * and dividing that by the time between the samples. */ #ifndef _NFS_IOSTAT #define _NFS_IOSTAT -#define NFS_IOSTAT_VERS "1.0" - -/* - * NFS byte counters - * - * 1. SERVER - the number of payload bytes read from or written to the - * server by the NFS client via an NFS READ or WRITE request. - * - * 2. NORMAL - the number of bytes read or written by applications via - * the read(2) and write(2) system call interfaces. - * - * 3. DIRECT - the number of bytes read or written from files opened - * with the O_DIRECT flag. - * - * These counters give a view of the data throughput into and out of the NFS - * client. Comparing the number of bytes requested by an application with the - * number of bytes the client requests from the server can provide an - * indication of client efficiency (per-op, cache hits, etc). - * - * These counters can also help characterize which access methods are in - * use. DIRECT by itself shows whether there is any O_DIRECT traffic. - * NORMAL + DIRECT shows how much data is going through the system call - * interface. A large amount of SERVER traffic without much NORMAL or - * DIRECT traffic shows that applications are using mapped files. - * - * NFS page counters - * - * These count the number of pages read or written via nfs_readpage(), - * nfs_readpages(), or their write equivalents. - */ -enum nfs_stat_bytecounters { - NFSIOS_NORMALREADBYTES = 0, - NFSIOS_NORMALWRITTENBYTES, - NFSIOS_DIRECTREADBYTES, - NFSIOS_DIRECTWRITTENBYTES, - NFSIOS_SERVERREADBYTES, - NFSIOS_SERVERWRITTENBYTES, - NFSIOS_READPAGES, - NFSIOS_WRITEPAGES, - __NFSIOS_BYTESMAX, -}; - -/* - * NFS event counters - * - * These counters provide a low-overhead way of monitoring client activity - * without enabling NFS trace debugging. The counters show the rate at - * which VFS requests are made, and how often the client invalidates its - * data and attribute caches. This allows system administrators to monitor - * such things as how close-to-open is working, and answer questions such - * as "why are there so many GETATTR requests on the wire?" - * - * They also count anamolous events such as short reads and writes, silly - * renames due to close-after-delete, and operations that change the size - * of a file (such operations can often be the source of data corruption - * if applications aren't using file locking properly). 
- */ -enum nfs_stat_eventcounters { - NFSIOS_INODEREVALIDATE = 0, - NFSIOS_DENTRYREVALIDATE, - NFSIOS_DATAINVALIDATE, - NFSIOS_ATTRINVALIDATE, - NFSIOS_VFSOPEN, - NFSIOS_VFSLOOKUP, - NFSIOS_VFSACCESS, - NFSIOS_VFSUPDATEPAGE, - NFSIOS_VFSREADPAGE, - NFSIOS_VFSREADPAGES, - NFSIOS_VFSWRITEPAGE, - NFSIOS_VFSWRITEPAGES, - NFSIOS_VFSGETDENTS, - NFSIOS_VFSSETATTR, - NFSIOS_VFSFLUSH, - NFSIOS_VFSFSYNC, - NFSIOS_VFSLOCK, - NFSIOS_VFSRELEASE, - NFSIOS_CONGESTIONWAIT, - NFSIOS_SETATTRTRUNC, - NFSIOS_EXTENDWRITE, - NFSIOS_SILLYRENAME, - NFSIOS_SHORTREAD, - NFSIOS_SHORTWRITE, - NFSIOS_DELAY, - __NFSIOS_COUNTSMAX, -}; - -#ifdef __KERNEL__ - #include #include +#include struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; -static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_server_stats(struct nfs_server *server, + enum nfs_stat_eventcounters stat) { struct nfs_iostats *iostats; int cpu; cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->events[stat] ++; + iostats->events[stat]++; put_cpu_no_resched(); } -static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_stats(struct inode *inode, + enum nfs_stat_eventcounters stat) { nfs_inc_server_stats(NFS_SERVER(inode), stat); } -static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_server_stats(struct nfs_server *server, + enum nfs_stat_bytecounters stat, + unsigned long addend) { struct nfs_iostats *iostats; int cpu; @@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat put_cpu_no_resched(); } -static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_stats(struct inode *inode, + enum nfs_stat_bytecounters stat, + unsigned long addend) { nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } @@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats) free_percpu(stats); } -#endif -#endif +#endif /* _NFS_IOSTAT */ diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h new file mode 100644 index 00000000000..1cb9a3fed2b --- /dev/null +++ b/include/linux/nfs_iostat.h @@ -0,0 +1,119 @@ +/* + * User-space visible declarations for NFS client per-mount + * point statistics + * + * Copyright (C) 2005, 2006 Chuck Lever + * + * NFS client per-mount statistics provide information about the + * health of the NFS client and the health of each NFS mount point. + * Generally these are not for detailed problem diagnosis, but + * simply to indicate that there is a problem. + * + * These counters are not meant to be human-readable, but are meant + * to be integrated into system monitoring tools such as "sar" and + * "iostat". As such, the counters are sampled by the tools over + * time, and are never zeroed after a file system is mounted. + * Moving averages can be computed by the tools by taking the + * difference between two instantaneous samples and dividing that + * by the time between the samples. + */ + +#ifndef _LINUX_NFS_IOSTAT +#define _LINUX_NFS_IOSTAT + +#define NFS_IOSTAT_VERS "1.0" + +/* + * NFS byte counters + * + * 1. SERVER - the number of payload bytes read from or written + * to the server by the NFS client via an NFS READ or WRITE + * request. + * + * 2. 
NORMAL - the number of bytes read or written by applications + * via the read(2) and write(2) system call interfaces. + * + * 3. DIRECT - the number of bytes read or written from files + * opened with the O_DIRECT flag. + * + * These counters give a view of the data throughput into and out + * of the NFS client. Comparing the number of bytes requested by + * an application with the number of bytes the client requests from + * the server can provide an indication of client efficiency + * (per-op, cache hits, etc). + * + * These counters can also help characterize which access methods + * are in use. DIRECT by itself shows whether there is any O_DIRECT + * traffic. NORMAL + DIRECT shows how much data is going through + * the system call interface. A large amount of SERVER traffic + * without much NORMAL or DIRECT traffic shows that applications + * are using mapped files. + * + * NFS page counters + * + * These count the number of pages read or written via nfs_readpage(), + * nfs_readpages(), or their write equivalents. + * + * NB: When adding new byte counters, please include the measured + * units in the name of each byte counter to help users of this + * interface determine what exactly is being counted. + */ +enum nfs_stat_bytecounters { + NFSIOS_NORMALREADBYTES = 0, + NFSIOS_NORMALWRITTENBYTES, + NFSIOS_DIRECTREADBYTES, + NFSIOS_DIRECTWRITTENBYTES, + NFSIOS_SERVERREADBYTES, + NFSIOS_SERVERWRITTENBYTES, + NFSIOS_READPAGES, + NFSIOS_WRITEPAGES, + __NFSIOS_BYTESMAX, +}; + +/* + * NFS event counters + * + * These counters provide a low-overhead way of monitoring client + * activity without enabling NFS trace debugging. The counters + * show the rate at which VFS requests are made, and how often the + * client invalidates its data and attribute caches. This allows + * system administrators to monitor such things as how close-to-open + * is working, and answer questions such as "why are there so many + * GETATTR requests on the wire?" + * + * They also count anamolous events such as short reads and writes, + * silly renames due to close-after-delete, and operations that + * change the size of a file (such operations can often be the + * source of data corruption if applications aren't using file + * locking properly). + */ +enum nfs_stat_eventcounters { + NFSIOS_INODEREVALIDATE = 0, + NFSIOS_DENTRYREVALIDATE, + NFSIOS_DATAINVALIDATE, + NFSIOS_ATTRINVALIDATE, + NFSIOS_VFSOPEN, + NFSIOS_VFSLOOKUP, + NFSIOS_VFSACCESS, + NFSIOS_VFSUPDATEPAGE, + NFSIOS_VFSREADPAGE, + NFSIOS_VFSREADPAGES, + NFSIOS_VFSWRITEPAGE, + NFSIOS_VFSWRITEPAGES, + NFSIOS_VFSGETDENTS, + NFSIOS_VFSSETATTR, + NFSIOS_VFSFLUSH, + NFSIOS_VFSFSYNC, + NFSIOS_VFSLOCK, + NFSIOS_VFSRELEASE, + NFSIOS_CONGESTIONWAIT, + NFSIOS_SETATTRTRUNC, + NFSIOS_EXTENDWRITE, + NFSIOS_SILLYRENAME, + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, + __NFSIOS_COUNTSMAX, +}; + +#endif /* _LINUX_NFS_IOSTAT */ -- cgit v1.2.3-70-g09d2 From 2e96d2867245668dbdb973729288cf69b9fafa66 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 16:42:05 -0400 Subject: NFS: Fix a warning in nfs4_async_handle_error We're not modifying the nfs_server when we call nfs_inc_server_stats and friends, so allow the compiler to pass 'const' pointers too. 
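The underlying idea, reduced to a standalone sketch (not kernel code): once a read-only helper takes a pointer-to-const, callers that only hold a const pointer, such as nfs4_async_handle_error() with its const struct nfs_server * argument, no longer need a cast. The kernel helpers do update per-CPU counters reachable from the server, but never the nfs_server structure itself, which is what makes the const parameter safe.

#include <stdio.h>

struct stats {
	unsigned long events;
};

/* Taking const documents and enforces that the helper never writes
 * through the pointer, so const-qualified callers need no cast. */
static unsigned long read_events(const struct stats *s)
{
	return s->events;
}

int main(void)
{
	struct stats st = { .events = 42 };
	const struct stats *view = &st;	/* like a const function argument */

	printf("%lu\n", read_events(view));
	return 0;
}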
Signed-off-by: Trond Myklebust --- fs/nfs/iostat.h | 8 ++++---- fs/nfs/nfs4proc.c | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 2ec65e12bfe..a3695281003 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -19,7 +19,7 @@ struct nfs_iostats { unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; -static inline void nfs_inc_server_stats(struct nfs_server *server, +static inline void nfs_inc_server_stats(const struct nfs_server *server, enum nfs_stat_eventcounters stat) { struct nfs_iostats *iostats; @@ -31,13 +31,13 @@ static inline void nfs_inc_server_stats(struct nfs_server *server, put_cpu_no_resched(); } -static inline void nfs_inc_stats(struct inode *inode, +static inline void nfs_inc_stats(const struct inode *inode, enum nfs_stat_eventcounters stat) { nfs_inc_server_stats(NFS_SERVER(inode), stat); } -static inline void nfs_add_server_stats(struct nfs_server *server, +static inline void nfs_add_server_stats(const struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) { @@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(struct nfs_server *server, put_cpu_no_resched(); } -static inline void nfs_add_stats(struct inode *inode, +static inline void nfs_add_stats(const struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dfdd19a6d17..058723d9122 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2756,8 +2756,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) task->tk_status = 0; return -EAGAIN; case -NFS4ERR_DELAY: - nfs_inc_server_stats((struct nfs_server *) server, - NFSIOS_DELAY); + nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; -- cgit v1.2.3-70-g09d2 From f41f741838480aeaa3a189cff6e210503cf9c42d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 17:39:04 -0400 Subject: NFS: Ensure we zap only the access and acl caches when setting new acls ...and ensure that we obey the NFS_INO_INVALID_ACL flag when retrieving the acls. 
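The user-visible sequence this protects is an ACL update immediately followed by a read-back on the same file. A rough libacl sketch follows (illustrative assumptions only: the path and the uid in the ACL text are placeholders, and the program must be linked with -lacl). After the set, only the access and ACL caches are dropped, yet the re-read must still see the new ACL rather than a stale cached copy, while unrelated cached attributes and page data need not be thrown away.

#include <stdio.h>
#include <sys/types.h>
#include <sys/acl.h>

int main(void)
{
	const char *path = "/mnt/nfs/file";	/* placeholder path on an NFS mount */
	acl_t acl;
	char *text;

	/* Set a new access ACL from its short text form (uid 500 is arbitrary). */
	acl = acl_from_text("u::rw-,g::r--,o::r--,u:500:rw-,m::rw-");
	if (acl == NULL || acl_set_file(path, ACL_TYPE_ACCESS, acl) != 0) {
		perror("acl_set_file");
		return 1;
	}
	acl_free(acl);

	/* Re-read it: the client must not serve a stale cached ACL here. */
	acl = acl_get_file(path, ACL_TYPE_ACCESS);
	if (acl == NULL) {
		perror("acl_get_file");
		return 1;
	}
	text = acl_to_text(acl, NULL);
	printf("%s", text);
	acl_free(text);
	acl_free(acl);
	return 0;
}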
Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 +--- fs/nfs/internal.h | 1 + fs/nfs/nfs3acl.c | 9 ++++++--- fs/nfs/nfs4proc.c | 5 ++++- 4 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2e4ab4a5e10..2c23d067e2a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); -static void nfs_zap_acl_cache(struct inode *); - static struct kmem_cache * nfs_inode_cachep; static inline unsigned long @@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) } } -static void nfs_zap_acl_cache(struct inode *inode) +void nfs_zap_acl_cache(struct inode *inode) { void (*clear_acl_cache)(struct inode *); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 04ae867dddb..24241fcbb98 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); #endif +void nfs_zap_acl_cache(struct inode *inode); /* super.c */ extern struct file_system_type nfs_xdev_fs_type; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9b7362565c0..423842f51ac 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -5,6 +5,8 @@ #include #include +#include "internal.h" + #define NFSDBG_FACILITY NFSDBG_PROC ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); acl = nfs3_get_cached_acl(inode, type); if (acl != ERR_PTR(-EAGAIN)) return acl; @@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; status = rpc_call_sync(server->client_acl, &msg, 0); - spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; - spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); /* pages may have been allocated at the xdr layer. 
*/ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 058723d9122..10f01c05a4e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2695,6 +2695,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) ret = nfs_revalidate_inode(server, inode); if (ret < 0) return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) return ret; @@ -2722,7 +2724,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_zap_caches(inode); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); return ret; } -- cgit v1.2.3-70-g09d2 From ecbb3845dd678364148c1faad32adbc9dd833ef9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:37:33 -0400 Subject: NFS: Allow any value for the "retry" option The kernel NFS mount option parser should ignore the retry= mount option since it is meaningful only in user space. Today it expects a number rather than arbitrary text, so it ignores the option if the value is numeric, but chokes if there are other characters in the value. Change it to allow any text (except ",") as its value. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b550e921f66..5278eef6111 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -101,6 +101,8 @@ enum { static match_table_t nfs_mount_option_tokens = { { Opt_userspace, "bg" }, { Opt_userspace, "fg" }, + { Opt_userspace, "retry=%s" }, + { Opt_soft, "soft" }, { Opt_hard, "hard" }, { Opt_intr, "intr" }, @@ -136,7 +138,6 @@ static match_table_t nfs_mount_option_tokens = { { Opt_acdirmin, "acdirmin=%u" }, { Opt_acdirmax, "acdirmax=%u" }, { Opt_actimeo, "actimeo=%u" }, - { Opt_userspace, "retry=%u" }, { Opt_namelen, "namlen=%u" }, { Opt_mountport, "mountport=%u" }, { Opt_mountvers, "mountvers=%u" }, -- cgit v1.2.3-70-g09d2 From d33e4dfeab5eb0c3c1359cc1a399f2f8b49e18de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:37:41 -0400 Subject: NFS: Treat "intr" and "nointr" options as deprecated Clean up: the "intr" and "nointr" mount options were recently retired. Document this in the NFS mount option parser. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5278eef6111..f824c415c8a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -65,7 +65,6 @@ enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, - Opt_intr, Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, Opt_noac, @@ -105,8 +104,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_soft, "soft" }, { Opt_hard, "hard" }, - { Opt_intr, "intr" }, - { Opt_nointr, "nointr" }, + { Opt_deprecated, "intr" }, + { Opt_deprecated, "nointr" }, { Opt_posix, "posix" }, { Opt_noposix, "noposix" }, { Opt_cto, "cto" }, @@ -787,9 +786,6 @@ static int nfs_parse_mount_options(char *raw, case Opt_hard: mnt->flags &= ~NFS_MOUNT_SOFT; break; - case Opt_intr: - case Opt_nointr: - break; case Opt_posix: mnt->flags |= NFS_MOUNT_POSIX; break; @@ -1105,6 +1101,8 @@ static int nfs_parse_mount_options(char *raw, case Opt_userspace: case Opt_deprecated: + dfprintk(MOUNT, "NFS: ignoring mount option " + "'%s'\n", p); break; default: -- cgit v1.2.3-70-g09d2 From 396cee977f79590673ad51b04f1853e58bc30e7b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Jun 2008 12:37:49 -0400 Subject: NFS: missing newline in NFS mount debugging message Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f824c415c8a..a1065c1a314 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1190,7 +1190,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, if (status == 0) return 0; - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d", + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", hostname, status); return status; } -- cgit v1.2.3-70-g09d2 From e7d39069e387a12d4c57f4067d9f48c1d29ea900 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Jun 2008 12:12:32 -0400 Subject: NFS: Clean up nfs_update_request() Simplify the loop in nfs_update_request by moving into a separate function the code that attempts to update an existing cached NFS write. 
Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 201 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 103 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 21d8a48b624..04f51e52e18 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -34,9 +34,6 @@ /* * Local function declarations */ -static struct nfs_page * nfs_update_request(struct nfs_open_context*, - struct page *, - unsigned int, unsigned int); static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, struct inode *inode, int ioflags); static void nfs_redirty_request(struct nfs_page *req); @@ -169,30 +166,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int SetPageUptodate(page); } -static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) -{ - struct nfs_page *req; - int ret; - - for (;;) { - req = nfs_update_request(ctx, page, offset, count); - if (!IS_ERR(req)) - break; - ret = PTR_ERR(req); - if (ret != -EBUSY) - return ret; - ret = nfs_wb_page(page->mapping->host, page); - if (ret != 0) - return ret; - } - /* Update file length */ - nfs_grow_file(page, offset, count); - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); - nfs_clear_page_tag_locked(req); - return 0; -} - static int wb_priority(struct writeback_control *wbc) { if (wbc->for_reclaim) @@ -356,11 +329,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) /* * Insert a write request into an inode */ -static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); int error; + error = radix_tree_preload(GFP_NOFS); + if (error != 0) + goto out; + + /* Lock the request! */ + nfs_lock_request_dontget(req); + + spin_lock(&inode->i_lock); error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages) { @@ -374,6 +355,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) kref_get(&req->wb_kref); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + spin_unlock(&inode->i_lock); + radix_tree_preload_end(); +out: + return error; } /* @@ -565,101 +550,121 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg #endif /* - * Try to update any existing write request, or create one if there is none. - * In order to match, the request's credentials must match those of - * the calling process. + * Search for an existing write request, and attempt to update + * it to reflect a new dirty region on a given page. * - * Note: Should always be called with the Page Lock held! + * If the attempt fails, then the existing request is flushed out + * to disk. 
*/ -static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, + unsigned int bytes) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct nfs_page *req, *new = NULL; - pgoff_t rqend, end; + struct nfs_page *req; + unsigned int rqend; + unsigned int end; + int error; + + if (!PagePrivate(page)) + return NULL; end = offset + bytes; + spin_lock(&inode->i_lock); for (;;) { - /* Loop over all inode entries and see if we find - * A request for the page we wish to update - */ - spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); - if (req) { - if (!nfs_set_page_tag_locked(req)) { - int error; - - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error < 0) { - if (new) { - radix_tree_preload_end(); - nfs_release_request(new); - } - return ERR_PTR(error); - } - continue; - } - spin_unlock(&inode->i_lock); - if (new) { - radix_tree_preload_end(); - nfs_release_request(new); - } + if (req == NULL) + goto out_unlock; + + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (!nfs_dirty_request(req) + || offset > rqend + || end < req->wb_offset) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) break; - } - if (new) { - nfs_lock_request_dontget(new); - nfs_inode_add_request(inode, new); - spin_unlock(&inode->i_lock); - radix_tree_preload_end(); - req = new; - goto out; - } + /* The request is locked, so wait and then retry */ spin_unlock(&inode->i_lock); - - new = nfs_create_request(ctx, inode, page, offset, bytes); - if (IS_ERR(new)) - return new; - if (radix_tree_preload(GFP_NOFS)) { - nfs_release_request(new); - return ERR_PTR(-ENOMEM); - } - } - - /* We have a request for our page. - * If the creds don't match, or the - * page addresses don't match, - * tell the caller to wait on the conflicting - * request. - */ - rqend = req->wb_offset + req->wb_bytes; - if (req->wb_context != ctx - || req->wb_page != page - || !nfs_dirty_request(req) - || offset > rqend || end < req->wb_offset) { - nfs_clear_page_tag_locked(req); - return ERR_PTR(-EBUSY); + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error != 0) + goto out_err; + spin_lock(&inode->i_lock); } /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { req->wb_offset = offset; req->wb_pgbase = offset; - req->wb_bytes = max(end, rqend) - req->wb_offset; - goto out; } - if (end > rqend) req->wb_bytes = end - req->wb_offset; + else + req->wb_bytes = rqend - req->wb_offset; +out_unlock: + spin_unlock(&inode->i_lock); + return req; +out_flushme: + spin_unlock(&inode->i_lock); + nfs_release_request(req); + error = nfs_wb_page(inode, page); +out_err: + return ERR_PTR(error); +} +/* + * Try to update an existing write request, or create one if there is none. + * + * Note: Should always be called with the Page Lock held to prevent races + * if we have to add a new request. Also assumes that the caller has + * already called nfs_flush_incompatible() if necessary. 
+ */ +static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, + struct page *page, unsigned int offset, unsigned int bytes) +{ + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + + req = nfs_try_to_update_request(inode, page, offset, bytes); + if (req != NULL) + goto out; + req = nfs_create_request(ctx, inode, page, offset, bytes); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); + if (error != 0) { + nfs_release_request(req); + req = ERR_PTR(error); + } out: return req; } +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + req = nfs_setup_write_request(ctx, page, offset, count); + if (IS_ERR(req)) + return PTR_ERR(req); + /* Update file length */ + nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_clear_page_tag_locked(req); + return 0; +} + int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); -- cgit v1.2.3-70-g09d2 From e468bae97d243fe0e1515abaa1f7d0edf1476ad0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Jun 2008 13:25:22 -0400 Subject: NFS: Allow redirtying of a completed unstable write. Currently, if an unstable write completes, we cannot redirty the page in order to reflect a new change in the page data until after we've sent a COMMIT request. This patch allows a page rewrite to proceed without the unnecessary COMMIT step, putting it immediately back onto the dirty page list, undoing the VM unstable write accounting, and removing the NFS_PAGE_TAG_COMMIT tag from the NFS radix tree. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 65 ++++++++++++++++++++++++------------------------ include/linux/nfs_page.h | 9 ++++--- 2 files changed, 38 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 04f51e52e18..feca8c64876 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -242,12 +242,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, return ret; spin_lock(&inode->i_lock); } - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { - /* This request is marked for commit */ + if (test_bit(PG_CLEAN, &req->wb_flags)) { spin_unlock(&inode->i_lock); - nfs_clear_page_tag_locked(req); - nfs_pageio_complete(pgio); - return 0; + BUG(); } if (nfs_set_page_writeback(page) != 0) { spin_unlock(&inode->i_lock); @@ -391,19 +388,6 @@ nfs_mark_request_dirty(struct nfs_page *req) __set_page_dirty_nobuffers(req->wb_page); } -/* - * Check if a request is dirty - */ -static inline int -nfs_dirty_request(struct nfs_page *req) -{ - struct page *page = req->wb_page; - - if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags)) - return 0; - return !PageWriteback(page); -} - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * Add a request to the inode's commit list. 
@@ -416,7 +400,7 @@ nfs_mark_request_commit(struct nfs_page *req) spin_lock(&inode->i_lock); nfsi->ncommit++; - set_bit(PG_NEED_COMMIT, &(req)->wb_flags); + set_bit(PG_CLEAN, &(req)->wb_flags); radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_COMMIT); @@ -426,6 +410,19 @@ nfs_mark_request_commit(struct nfs_page *req) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } +static int +nfs_clear_request_commit(struct nfs_page *req) +{ + struct page *page = req->wb_page; + + if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + return 1; + } + return 0; +} + static inline int nfs_write_need_commit(struct nfs_write_data *data) { @@ -435,7 +432,7 @@ int nfs_write_need_commit(struct nfs_write_data *data) static inline int nfs_reschedule_unstable_write(struct nfs_page *req) { - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { + if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { nfs_mark_request_commit(req); return 1; } @@ -451,6 +448,12 @@ nfs_mark_request_commit(struct nfs_page *req) { } +static inline int +nfs_clear_request_commit(struct nfs_page *req) +{ + return 0; +} + static inline int nfs_write_need_commit(struct nfs_write_data *data) { @@ -508,11 +511,8 @@ static void nfs_cancel_commit_list(struct list_head *head) while(!list_empty(head)) { req = nfs_list_entry(head->next); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); + nfs_clear_request_commit(req); nfs_inode_remove_request(req); nfs_unlock_request(req); } @@ -584,8 +584,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, * Note: nfs_flush_incompatible() will already * have flushed out requests having wrong owners. */ - if (!nfs_dirty_request(req) - || offset > rqend + if (offset > rqend || end < req->wb_offset) goto out_flushme; @@ -601,6 +600,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); } + if (nfs_clear_request_commit(req)) + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT); + /* Okay, the request matches. 
Update the region */ if (offset < req->wb_offset) { req->wb_offset = offset; @@ -682,8 +685,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx - || !nfs_dirty_request(req); + do_flush = req->wb_page != page || req->wb_context != ctx; nfs_release_request(req); if (!do_flush) return 0; @@ -1288,10 +1290,7 @@ static void nfs_commit_release(void *calldata) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); + nfs_clear_request_commit(req); dprintk("NFS: commit (%s/%lld %d@%lld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, @@ -1467,7 +1466,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) req = nfs_page_find_request(page); if (req == NULL) goto out; - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { + if (test_bit(PG_CLEAN, &req->wb_flags)) { nfs_release_request(req); break; } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index a1676e19e49..3c60685d972 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -27,9 +27,12 @@ /* * Valid flags for a dirty buffer */ -#define PG_BUSY 0 -#define PG_NEED_COMMIT 1 -#define PG_NEED_RESCHED 2 +enum { + PG_BUSY = 0, + PG_CLEAN, + PG_NEED_COMMIT, + PG_NEED_RESCHED, +}; struct nfs_inode; struct nfs_page { -- cgit v1.2.3-70-g09d2 From cd100725620a8063fbc81bcf16d7c9e71a9583e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 17 Jun 2008 16:12:00 -0400 Subject: NFS: Fix a dependency on CONFIG_NFS_V4 in nfs_remount Fix the 'nfs4_fs_type' undeclared error in nfs_remount when compiling sans NFSv4... Signed-off-by: Trond Myklebust Cc: Jeff Layton --- fs/nfs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a1065c1a314..b880db18035 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1428,6 +1428,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) struct nfs_parsed_mount_data *data; struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + u32 nfsvers = nfss->nfs_client->rpc_ops->version; /* * Userspace mount programs that send binary options generally send @@ -1435,8 +1436,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) * ones were explicitly specified. Fall back to legacy behavior and * just return success. */ - if ((sb->s_type == &nfs4_fs_type && options4->version == 1) || - (sb->s_type == &nfs_fs_type && options->version >= 1 && + if ((nfsvers == 4 && options4->version == 1) || + (nfsvers <= 3 && options->version >= 1 && options->version <= 6)) return 0; -- cgit v1.2.3-70-g09d2 From dc04589827f7e1af12714af8bb00e3f3c4c48c62 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:36:37 -0400 Subject: NFS: Use common device name parsing logic for NFSv4 and NFSv2/v3 To support passing a raw IPv6 address as a server hostname, we need to expand the logic that handles splitting the passed-in device name into a server hostname and export path Start by pulling device name parsing out of the mount option validation functions and into separate helper functions. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 124 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b880db18035..c4ee8b3a27c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1195,6 +1195,67 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return status; } +/* + * Split "dev_name" into "hostname:export_path". + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *colon, *comma; + + colon = strchr(dev_name, ':'); + if (colon == NULL) + goto out_bad_devname; + + len = colon - dev_name; + if (len > maxnamlen) + goto out_hostname; + + /* N.B. caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (!*hostname) + goto out_nomem; + + /* kill possible hostname list: not supported */ + comma = strchr(*hostname, ','); + if (comma != NULL) { + if (comma == *hostname) + goto out_bad_devname; + *comma = '\0'; + } + + colon++; + len = strlen(colon); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(colon, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle @@ -1323,8 +1384,6 @@ static int nfs_validate_mount_data(void *options, break; default: { - unsigned int len; - char *c; int status; if (nfs_parse_mount_options((char *)options, args) == 0) @@ -1334,21 +1393,17 @@ static int nfs_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - len = c - dev_name; - /* N.B. caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!args->nfs_server.hostname) - goto out_nomem; + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + PAGE_SIZE, + &args->nfs_server.export_path, + NFS_MAXPATHLEN); + if (!status) + status = nfs_try_mount(args, mntfh); - c++; - if (strlen(c) > NFS_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = c; + kfree(args->nfs_server.export_path); + args->nfs_server.export_path = NULL; - status = nfs_try_mount(args, mntfh); if (status) return status; @@ -1958,7 +2013,7 @@ static int nfs4_validate_mount_data(void *options, break; default: { - unsigned int len; + int status; if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; @@ -1977,34 +2032,17 @@ static int nfs4_validate_mount_data(void *options, goto out_inval_auth; } - /* - * Split "dev_name" into "hostname:mntpath". - */ - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - /* while calculating len, pretend ':' is '\0' */ - len = c - dev_name; - if (len > NFS4_MAXNAMLEN) - return -ENAMETOOLONG; - /* N.B. 
caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!args->nfs_server.hostname) - goto out_nomem; - - c++; /* step over the ':' */ - len = strlen(c); - if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL); - if (!args->nfs_server.export_path) - goto out_nomem; - - dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); - if (args->client_address == NULL) goto out_no_client_address; + status = nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); + if (status < 0) + return status; + break; } } @@ -2020,10 +2058,6 @@ out_inval_auth: data->auth_flavourlen); return -EINVAL; -out_nomem: - dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n"); - return -ENOMEM; - out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; -- cgit v1.2.3-70-g09d2 From d1aa08257312f1439c1ab7c8a18e3856f9530f46 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:36:45 -0400 Subject: NFS: Support raw IPv6 address hostnames during NFS mount operation Traditionally the mount command has looked for a ":" to separate the server's hostname from the export path in the mounted on device name, like this: mount server:/export /mounted/on/dir The server's hostname is "server" and the export path is "/export". You can also substitute a specific IPv4 network address for the server hostname, like this: mount 192.168.0.55:/export /mounted/on/dir Raw IPv6 addresses present a problem, however, because they look something like this: fe80::200:5aff:fe00:30b Note the use of colons. To get around the presence of colons, copy the Solaris convention used for mounting IPv6 servers by address: wrap a raw IPv6 address with square brackets. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c4ee8b3a27c..ea4abd266a7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1195,14 +1195,9 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return status; } -/* - * Split "dev_name" into "hostname:export_path". - * - * Note: caller frees hostname and export path, even on error. - */ -static int nfs_parse_devname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) +static int nfs_parse_simple_hostname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) { size_t len; char *colon, *comma; @@ -1256,6 +1251,85 @@ out_path: return -ENAMETOOLONG; } +/* + * Hostname has square brackets around it because it contains one or + * more colons. We look for the first closing square bracket, and a + * colon must follow it. + */ +static int nfs_parse_protected_hostname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *start, *end; + + start = (char *)(dev_name + 1); + + end = strchr(start, ']'); + if (end == NULL) + goto out_bad_devname; + if (*(end + 1) != ':') + goto out_bad_devname; + + len = end - start; + if (len > maxnamlen) + goto out_hostname; + + /* N.B. 
caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(start, len, GFP_KERNEL); + if (*hostname == NULL) + goto out_nomem; + + end += 2; + len = strlen(end); + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(end, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; +} + +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + if (*dev_name == '[') + return nfs_parse_protected_hostname(dev_name, + hostname, maxnamlen, + export_path, maxpathlen); + + return nfs_parse_simple_hostname(dev_name, + hostname, maxnamlen, + export_path, maxpathlen); +} + /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle -- cgit v1.2.3-70-g09d2 From ce3b7e1906ebbe96753fe090b36de6ffb8e0e0e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:36:53 -0400 Subject: NFS: Add string length argument to nfs_parse_server_address To make nfs_parse_server_address() more generally useful, allow it to accept input strings that are not terminated with '\0'. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 100 +++++++++++++++++++++++++++++++++++---------------- include/linux/inet.h | 7 ++++ 2 files changed, 77 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ea4abd266a7..d27aa1db007 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -705,38 +705,76 @@ static int nfs_verify_server_address(struct sockaddr *addr) return 0; } -/* - * Parse string addresses passed in via a mount option, - * and construct a sockaddr based on the result. - * - * If address parsing fails, set the sockaddr's address - * family to AF_UNSPEC to force nfs_verify_server_address() - * to punt the mount. 
- */ -static void nfs_parse_server_address(char *value, - struct sockaddr *sap, - size_t *len) +static void nfs_parse_ipv4_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) { - if (strchr(value, ':')) { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&ap->sin6_addr.in6_u; + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + u8 *addr = (u8 *)&sin->sin_addr.s_addr; - ap->sin6_family = AF_INET6; - *len = sizeof(*ap); - if (in6_pton(value, -1, addr, '\0', NULL)) + if (str_len <= INET_ADDRSTRLEN) { + dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", + (int)str_len, string); + + sin->sin_family = AF_INET; + *addr_len = sizeof(*sin); + if (in4_pton(string, str_len, addr, '\0', NULL)) return; - } else { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&ap->sin_addr.s_addr; + } + + sap->sa_family = AF_UNSPEC; + *addr_len = 0; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void nfs_parse_ipv6_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; + + if (str_len <= INET6_ADDRSTRLEN) { + dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", + (int)str_len, string); - ap->sin_family = AF_INET; - *len = sizeof(*ap); - if (in4_pton(value, -1, addr, '\0', NULL)) + sin6->sin6_family = AF_INET6; + *addr_len = sizeof(*sin6); + if (in6_pton(string, str_len, addr, '\0', NULL)) return; } sap->sa_family = AF_UNSPEC; - *len = 0; + *addr_len = 0; +} +#else +static void nfs_parse_ipv6_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + sap->sa_family = AF_UNSPEC; + *addr_len = 0; +} +#endif + +/* + * Construct a sockaddr based on the contents of a string that contains + * an IP address in presentation format. + * + * If there is a problem constructing the new sockaddr, set the address + * family to AF_UNSPEC. + */ +static void nfs_parse_ip_address(char *string, size_t str_len, + struct sockaddr *sap, size_t *addr_len) +{ + unsigned int i, colons; + + colons = 0; + for (i = 0; i < str_len; i++) + if (string[i] == ':') + colons++; + + if (colons >= 2) + nfs_parse_ipv6_address(string, str_len, sap, addr_len); + else + nfs_parse_ipv4_address(string, str_len, sap, addr_len); } /* @@ -1070,9 +1108,10 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + nfs_parse_ip_address(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + &mnt->nfs_server.addrlen); kfree(string); break; case Opt_clientaddr: @@ -1093,9 +1132,10 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + nfs_parse_ip_address(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + &mnt->mount_server.addrlen); kfree(string); break; diff --git a/include/linux/inet.h b/include/linux/inet.h index 1354080cf8c..4cca05c9678 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -44,6 +44,13 @@ #include +/* + * These mimic similar macros defined in user-space for inet_ntop(3). + * See /usr/include/netinet/in.h . 
+ */ +#define INET_ADDRSTRLEN (16) +#define INET6_ADDRSTRLEN (48) + extern __be32 in_aton(const char *str); extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); -- cgit v1.2.3-70-g09d2 From d8e7748ab8322171ebfd78f6155ff35e7d57ac32 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 23 Jun 2008 12:37:01 -0400 Subject: NFS: handle interface identifiers in incoming IPv6 addresses Add support in the kernel NFS client's address parser for interface identifiers. IPv6 link-local addresses require an additional "interface identifier", which is a network device name or an integer that indexes the array of local network interfaces. They are suffixed to the address with a '%'. For example: fe80::215:c5ff:fe3b:e1b2%2 indicates an interface index of 2. Or fe80::215:c5ff:fe3b:e1b2%eth0 indicates that requests should be routed through the eth0 device. Without the interface ID, link-local addresses are not usable for NFS. Both the kernel NFS client mount option parser and the mount.nfs command can take either form. The mount.nfs command always passes the address through getnameinfo(3), which usually re-writes interface indices as device names. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d27aa1db007..73a8e5970f0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -725,12 +726,48 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, *addr_len = 0; } +#define IPV6_SCOPE_DELIMITER '%' + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) +{ + char *p; + size_t len; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return ; + if (*delim != IPV6_SCOPE_DELIMITER) + return; + + len = (string + str_len) - delim - 1; + p = kstrndup(delim + 1, len, GFP_KERNEL); + if (p) { + unsigned long scope_id = 0; + struct net_device *dev; + + dev = dev_get_by_name(&init_net, p); + if (dev != NULL) { + scope_id = dev->ifindex; + dev_put(dev); + } else { + /* scope_id is set to zero on error */ + strict_strtoul(p, 10, &scope_id); + } + + kfree(p); + sin6->sin6_scope_id = scope_id; + dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + } +} + static void nfs_parse_ipv6_address(char *string, size_t str_len, struct sockaddr *sap, size_t *addr_len) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; + const char *delim; if (str_len <= INET6_ADDRSTRLEN) { dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", @@ -738,8 +775,10 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, sin6->sin6_family = AF_INET6; *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, '\0', NULL)) + if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { + nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); return; + } } sap->sa_family = AF_UNSPEC; -- cgit v1.2.3-70-g09d2 From 77e03677ac76d413fbb6d6caaffa1d64877a0c2a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 24 Jun 2008 20:25:57 +0300 Subject: nfs: initialize timeout variable in nfs4_proc_setclientid_confirm MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc (4.3.0) rightfully warns about this: /usr0/export/dev/bhalevy/git/linux-pnfs-bh-nfs41/fs/nfs/nfs4proc.c: In function nfs4_proc_setclientid_confirm: /usr0/export/dev/bhalevy/git/linux-pnfs-bh-nfs41/fs/nfs/nfs4proc.c:2936: warning: timeout may be used uninitialized in this function nfs4_delay that's passed a pointer to 'timeout' is looking at its value and sets it up to some value in the range: NFS4_POLL_RETRY_MIN..NFS4_POLL_RETRY_MAX if (*timeout <= 0) *timeout = NFS4_POLL_RETRY_MIN; if (*timeout > NFS4_POLL_RETRY_MAX) *timeout = NFS4_POLL_RETRY_MAX; Therefore it will end up set to some sane, though rather indeterministic, value. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 10f01c05a4e..4451287a81d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2924,7 +2924,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) { - long timeout; + long timeout = 0; int err; do { err = _nfs4_proc_setclientid_confirm(clp, cred); -- cgit v1.2.3-70-g09d2 From ee84dfc45467fd8e5ce04fa2813d98e0aebe465c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Jun 2008 10:03:10 -0400 Subject: nfs4: remove BKL from nfs_callback_up and nfs_callback_down The nfs_callback_mutex is sufficient protection. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c1e7c830062..9e713d2d5d7 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -105,7 +105,6 @@ int nfs_callback_up(void) struct svc_rqst *rqstp; int ret = 0; - lock_kernel(); mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; @@ -149,7 +148,6 @@ out: if (serv) svc_destroy(serv); mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); return ret; out_err: dprintk("Couldn't create callback socket or server thread; err = %d\n", @@ -163,13 +161,11 @@ out_err: */ void nfs_callback_down(void) { - lock_kernel(); mutex_lock(&nfs_callback_mutex); nfs_callback_info.users--; if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) kthread_stop(nfs_callback_info.task); mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); } static int nfs_callback_authenticate(struct svc_rqst *rqstp) -- cgit v1.2.3-70-g09d2 From 5afc597c5f0bd184457e49b9a330fcb37b69db11 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Jun 2008 10:03:11 -0400 Subject: nfs4: fix potential race with rapid nfs_callback_up/down cycle If the nfsv4 callback thread is rapidly brought up and down, it's possible that nfs_callback_svc might never get a chance to run. If this happens, the cleanup at thread exit might never occur, throwing the refcounting off and nfs_callback_info in an incorrect state. Move the clean functions into nfs_callback_down. Also change the nfs_callback_info struct to track the svc_rqst rather than svc_serv since we need to know that to call svc_exit_thread. 
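The shape of the fix, sketched generically in user space (hypothetical names; the real code is in the hunks below): a use count brings the service up for the first user and performs all cleanup for the last one, so nothing is left depending on the worker thread ever having run.

#include <stdio.h>
#include <stdlib.h>

/* Generic sketch only: 'resource' stands in for the svc_rqst/task pair,
 * and a mutex (nfs_callback_mutex in the real code) would guard 'state'. */
static struct {
	unsigned int users;
	void *resource;
} state;

static int service_up(void)
{
	if (state.users++ == 0) {
		state.resource = malloc(1);	/* placeholder for svc_prepare_thread() + kthread_run() */
		if (state.resource == NULL) {
			state.users--;
			return -1;
		}
	}
	return 0;
}

static void service_down(void)
{
	if (--state.users == 0 && state.resource != NULL) {
		free(state.resource);		/* placeholder for kthread_stop() + svc_exit_thread() */
		state.resource = NULL;
	}
}

int main(void)
{
	service_up();
	service_down();			/* cleanup happens here, not in the worker */
	printf("resource %s\n", state.resource ? "leaked" : "released");
	return 0;
}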
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 9e713d2d5d7..f447f4b4476 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -27,7 +27,7 @@ struct nfs_callback_data { unsigned int users; - struct svc_serv *serv; + struct svc_rqst *rqst; struct task_struct *task; }; @@ -91,18 +91,15 @@ nfs_callback_svc(void *vrqstp) svc_process(rqstp); } unlock_kernel(); - nfs_callback_info.task = NULL; - svc_exit_thread(rqstp); return 0; } /* - * Bring up the server process if it is not already up. + * Bring up the callback thread if it is not already up. */ int nfs_callback_up(void) { struct svc_serv *serv = NULL; - struct svc_rqst *rqstp; int ret = 0; mutex_lock(&nfs_callback_mutex); @@ -120,22 +117,23 @@ int nfs_callback_up(void) nfs_callback_tcpport = ret; dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); - if (IS_ERR(rqstp)) { - ret = PTR_ERR(rqstp); + nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(nfs_callback_info.rqst)) { + ret = PTR_ERR(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; goto out_err; } svc_sock_update_bufs(serv); - nfs_callback_info.serv = serv; - nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp, + nfs_callback_info.task = kthread_run(nfs_callback_svc, + nfs_callback_info.rqst, "nfsv4-svc"); if (IS_ERR(nfs_callback_info.task)) { ret = PTR_ERR(nfs_callback_info.task); - nfs_callback_info.serv = NULL; + svc_exit_thread(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; nfs_callback_info.task = NULL; - svc_exit_thread(rqstp); goto out_err; } out: @@ -157,14 +155,18 @@ out_err: } /* - * Kill the server process if it is not already down. + * Kill the callback thread if it's no longer being used. */ void nfs_callback_down(void) { mutex_lock(&nfs_callback_mutex); nfs_callback_info.users--; - if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) + if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) { kthread_stop(nfs_callback_info.task); + svc_exit_thread(nfs_callback_info.rqst); + nfs_callback_info.rqst = NULL; + nfs_callback_info.task = NULL; + } mutex_unlock(&nfs_callback_mutex); } -- cgit v1.2.3-70-g09d2 From 40fef8a649e5344bfb6a67a7cc3def3e0dad6448 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Jun 2008 17:24:54 -0400 Subject: SUNRPC: Use only rpcbind v2 for AF_INET requests Some server vendors support the higher versions of rpcbind only for AF_INET6. The kernel doesn't need to use v3 or v4 for AF_INET anyway, so change the kernel's rpcbind client to query AF_INET servers over rpcbind v2 only. This has a few interesting benefits: 1. If the rpcbind request is going over TCP, and the server doesn't support rpcbind versions 3 or 4, the client reduces by two the number of ephemeral ports left in TIME_WAIT for each rpcbind request. This will help during NFS mount storms. 2. The rpcbind interaction with servers that don't support rpcbind versions 3 or 4 will use less network traffic. Also helpful during mount storms. 3. We can eliminate the kernel build option that controls whether the kernel's rpcbind client uses rpcbind version 3 and 4 for AF_INET servers. Less complicated kernel configuration... 
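For illustration only, the fallback tables touched by this patch end in a NULL-procedure sentinel and are walked entry by entry, trying each rpcbind version in turn; a stand-alone sketch of that walk (not the SUNRPC implementation itself):

#include <stdio.h>

struct rpcb_info {
	unsigned int rpc_vers;
	const char *rpc_proc;	/* stands in for the rpc_procinfo pointer */
};

/* After this patch the AF_INET table carries only rpcbind v2 GETPORT;
 * the AF_INET6 table keeps its v4 and v3 GETADDR entries. */
static const struct rpcb_info rpcb_next_version[] = {
	{ .rpc_vers = 2, .rpc_proc = "GETPORT" },
	{ .rpc_proc = NULL },	/* sentinel */
};

int main(void)
{
	const struct rpcb_info *info;

	for (info = rpcb_next_version; info->rpc_proc != NULL; info++)
		printf("query rpcbind v%u via %s\n", info->rpc_vers, info->rpc_proc);
	return 0;
}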
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/Kconfig | 21 --------------------- net/sunrpc/rpcb_clnt.c | 12 ------------ 2 files changed, 33 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 1c16de9611e..0ce72dcd6b9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1799,27 +1799,6 @@ config SUNRPC_XPRT_RDMA If unsure, say N. -config SUNRPC_BIND34 - bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL - default n - help - RPC requests over IPv6 networks require support for larger - addresses when performing an RPC bind. Sun added support for - IPv6 addressing by creating two new versions of the rpcbind - protocol (RFC 1833). - - This option enables support in the kernel RPC client for - querying rpcbind servers via versions 3 and 4 of the rpcbind - protocol. The kernel automatically falls back to version 2 - if a remote rpcbind service does not support versions 3 or 4. - By themselves, these new versions do not provide support for - RPC over IPv6, but the new protocol versions are necessary to - support it. - - If unsure, say N to get traditional behavior (version 2 rpcbind - requests only). - config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 625ba72e624..c62e446723a 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -592,16 +592,6 @@ static struct rpc_procinfo rpcb_procedures4[] = { }; static struct rpcb_info rpcb_next_version[] = { -#ifdef CONFIG_SUNRPC_BIND34 - { - .rpc_vers = RPCBVERS_4, - .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], - }, - { - .rpc_vers = RPCBVERS_3, - .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], - }, -#endif { .rpc_vers = RPCBVERS_2, .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT], @@ -612,7 +602,6 @@ static struct rpcb_info rpcb_next_version[] = { }; static struct rpcb_info rpcb_next_version6[] = { -#ifdef CONFIG_SUNRPC_BIND34 { .rpc_vers = RPCBVERS_4, .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR], @@ -621,7 +610,6 @@ static struct rpcb_info rpcb_next_version6[] = { .rpc_vers = RPCBVERS_3, .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR], }, -#endif { .rpc_proc = NULL, }, -- cgit v1.2.3-70-g09d2 From 259875efed06d6936f54c9a264e868937f1bc217 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 2 Jul 2008 14:43:47 -0400 Subject: NFS: set transport defaults after mount option parsing is finished Move the UDP/TCP default timeo/retrans settings for text mounts to nfs_init_timeout_values(), which was were they were always being initialised (and sanity checked) for binary mounts. Document the default timeout values using appropriate #defines. Ensure that we initialise and sanity check the transport protocols that may have been specified by the user. 
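One detail worth keeping in mind when reading the new defaults: timeo is expressed in tenths of a second (the existing to_initval = timeo * HZ / 10 conversion), so the values below mean 1.1 s for UDP and 60 s for TCP. A quick stand-alone check, reusing the constant names and values from the patch:

#include <stdio.h>

/* Values copied from the NFS_DEF_* definitions added in this patch. */
#define NFS_DEF_UDP_TIMEO	(11)	/* tenths of a second */
#define NFS_DEF_UDP_RETRANS	(3)
#define NFS_DEF_TCP_TIMEO	(600)
#define NFS_DEF_TCP_RETRANS	(2)

int main(void)
{
	printf("UDP: initial timeout %.1f s, %d retries\n",
	       NFS_DEF_UDP_TIMEO / 10.0, NFS_DEF_UDP_RETRANS);
	printf("TCP: initial timeout %.1f s, %d retries\n",
	       NFS_DEF_TCP_TIMEO / 10.0, NFS_DEF_TCP_RETRANS);
	return 0;
}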
Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 13 ++++++----- fs/nfs/super.c | 60 +++++++++++++++++++++++++++++++++++--------------- include/linux/nfs_fs.h | 5 +++++ 3 files changed, 55 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f2a092ca69b..5ee23e7058b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; - if (!to->to_retries) - to->to_retries = 2; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; if (to->to_initval == 0) - to->to_initval = 60 * HZ; + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; @@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - default: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; if (!to->to_initval) - to->to_initval = 11 * HZ / 10; + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; + default: + BUG(); } } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 73a8e5970f0..9c1a960f5b9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -816,6 +816,43 @@ static void nfs_parse_ip_address(char *string, size_t str_len, nfs_parse_ipv4_address(string, str_len, sap, addr_len); } +/* + * Sanity check the NFS transport protocol. + * + */ +static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. 
+ */ +static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + nfs_validate_transport_protocol(mnt); + + if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || + mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) + return; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; + } +} + /* * Error-check and convert a string of mount options from user space into * a data structure @@ -896,20 +933,14 @@ static int nfs_parse_mount_options(char *raw, case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; case Opt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -1103,21 +1134,15 @@ static int nfs_parse_mount_options(char *raw, case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; break; default: goto out_unrec_xprt; @@ -1438,14 +1463,11 @@ static int nfs_validate_mount_data(void *options, args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; args->acregmin = 3; args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; args->mount_server.port = 0; /* autobind unless user sets port */ - args->mount_server.protocol = XPRT_TRANSPORT_UDP; args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; @@ -1546,6 +1568,8 @@ static int nfs_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; + nfs_set_mount_transport_protocol(args); + status = nfs_parse_devname(dev_name, &args->nfs_server.hostname, PAGE_SIZE, @@ -2095,14 +2119,11 @@ static int nfs4_validate_mount_data(void *options, args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; args->acregmin = 3; args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; switch (data->version) { case 1: @@ -2163,6 +2184,7 @@ static int nfs4_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; args->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(args); break; default: { @@ -2175,6 +2197,8 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) return -EINVAL; + nfs_validate_transport_protocol(args); + switch (args->auth_flavor_len) { case 0: args->auth_flavors[0] = RPC_AUTH_UNIX; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 27d6a8d98ce..830d9cc8cdc 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ 
-12,6 +12,11 @@ #include /* Default timeout values */ +#define NFS_DEF_UDP_TIMEO (11) +#define NFS_DEF_UDP_RETRANS (3) +#define NFS_DEF_TCP_TIMEO (600) +#define NFS_DEF_TCP_RETRANS (2) + #define NFS_MAX_UDP_TIMEOUT (60*HZ) #define NFS_MAX_TCP_TIMEOUT (600*HZ) -- cgit v1.2.3-70-g09d2 From ed596a8adb7964cf9d2f7682f9cf2c37322a775d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Jun 2008 17:47:05 -0400 Subject: NFS: Move the nfs_set_port() call out of nfs_parse_mount_options() The remount path does not need to set the port in the server address. Since it's not really a part of option parsing, move the nfs_set_port() call to nfs_parse_mount_options()'s callers. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9c1a960f5b9..de424d2f315 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1214,9 +1214,6 @@ static int nfs_parse_mount_options(char *raw, } } - nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, - mnt->nfs_server.port); - return 1; out_nomem: @@ -1568,6 +1565,9 @@ static int nfs_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; + nfs_set_port((struct sockaddr *)&args->nfs_server.address, + args->nfs_server.port); + nfs_set_mount_transport_protocol(args); status = nfs_parse_devname(dev_name, @@ -2197,6 +2197,9 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) return -EINVAL; + nfs_set_port((struct sockaddr *)&args->nfs_server.address, + args->nfs_server.port); + nfs_validate_transport_protocol(args); switch (args->auth_flavor_len) { -- cgit v1.2.3-70-g09d2 From 0e0cab744b17a70ef0f08d818d66935feade7cad Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 26 Jun 2008 17:47:12 -0400 Subject: NFS: use documenting macro constants for initializing ac{reg, dir}{min, max} Clean up. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 8 ++++---- fs/nfs/super.c | 24 ++++++++++++------------ include/linux/nfs_fs.h | 5 +++++ 3 files changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 15afe3a6c5b..46763d1cd39 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -295,10 +295,10 @@ static int __init root_nfs_name(char *name) nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.acregmin = 3; - nfs_data.acregmax = 60; - nfs_data.acdirmin = 30; - nfs_data.acdirmax = 60; + nfs_data.acregmin = NFS_DEF_ACREGMIN; + nfs_data.acregmax = NFS_DEF_ACREGMAX; + nfs_data.acdirmin = NFS_DEF_ACDIRMIN; + nfs_data.acdirmax = NFS_DEF_ACDIRMAX; strcpy(buf, NFS_ROOT); /* Process options received from the remote server */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index de424d2f315..ebed63e0ff8 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -518,13 +518,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->bsize != 0) seq_printf(m, ",bsize=%u", nfss->bsize); seq_printf(m, ",namlen=%u", nfss->namelen); - if (nfss->acregmin != 3*HZ || showdefaults) + if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); - if (nfss->acregmax != 60*HZ || showdefaults) + if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); - if (nfss->acdirmin != 30*HZ || showdefaults) + if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); - if (nfss->acdirmax != 60*HZ || showdefaults) + if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) @@ -1460,10 +1460,10 @@ static int nfs_validate_mount_data(void *options, args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; + args->acregmin = NFS_DEF_ACREGMIN; + args->acregmax = NFS_DEF_ACREGMAX; + args->acdirmin = NFS_DEF_ACDIRMIN; + args->acdirmax = NFS_DEF_ACDIRMAX; args->mount_server.port = 0; /* autobind unless user sets port */ args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; @@ -2119,10 +2119,10 @@ static int nfs4_validate_mount_data(void *options, args->rsize = NFS_MAX_FILE_IO_SIZE; args->wsize = NFS_MAX_FILE_IO_SIZE; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; + args->acregmin = NFS_DEF_ACREGMIN; + args->acregmax = NFS_DEF_ACREGMAX; + args->acdirmin = NFS_DEF_ACDIRMIN; + args->acdirmax = NFS_DEF_ACDIRMAX; args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ switch (data->version) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 830d9cc8cdc..29d26191873 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -20,6 +20,11 @@ #define NFS_MAX_UDP_TIMEOUT (60*HZ) #define NFS_MAX_TCP_TIMEOUT (600*HZ) +#define NFS_DEF_ACREGMIN (3) +#define NFS_DEF_ACREGMAX (60) +#define NFS_DEF_ACDIRMIN (30) +#define NFS_DEF_ACDIRMAX (60) + /* * When flushing a cluster of dirty pages, there can be different * strategies: -- cgit v1.2.3-70-g09d2 From 
01060c896e3e1ef53dcb914301c186932cd31b81 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 16:33:38 -0400 Subject: NFS: Refactor logic for parsing NFS security flavor mount options Clean up: Refactor the NFS mount option parsing function to extract the security flavor parsing logic into a separate function. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 143 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 78 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ebed63e0ff8..6eb145ea71a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -853,6 +853,82 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) } } +/* + * Parse the value of the 'sec=' option. + * + * The flags setting is for v2/v3. The flavor_len setting is for v4. + * v2/v3 also need to know the difference between NULL and UNIX. + */ +static int nfs_parse_security_flavors(char *value, + struct nfs_parsed_mount_data *mnt) +{ + substring_t args[MAX_OPT_ARGS]; + + dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); + + switch (match_token(value, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 0; + mnt->auth_flavors[0] = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + mnt->flags |= NFS_MOUNT_SECFLAVOUR; + mnt->auth_flavor_len = 1; + mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; + break; + default: + return 0; + } + + return 1; +} + /* * Error-check and convert a string of mount options from user space into * a data structure @@ -1054,73 +1130,10 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - token = match_token(string, nfs_secflavor_tokens, args); + rc = nfs_parse_security_flavors(string, mnt); kfree(string); - - /* - * The flags setting is for v2/v3. The flavor_len - * setting is for v4. v2/v3 also need to know the - * difference between NULL and UNIX. 
- */ - switch (token) { - case Opt_sec_none: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; - break; - default: + if (!rc) goto out_unrec_sec; - } break; case Opt_proto: string = match_strdup(args); -- cgit v1.2.3-70-g09d2 From dd07c94750cb1ee4449fb0db06623e1865b3e26e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 16:33:46 -0400 Subject: NFS: Set security flavor default for NFSv2/3 mounts like other defaults Set the default security flavor when we set the other mount option default values. After this change, only the legacy user-space mount path needs to set the NFS_MOUNT_SECFLAVOUR flag. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6eb145ea71a..2f7e7f20c91 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -856,8 +856,7 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) /* * Parse the value of the 'sec=' option. * - * The flags setting is for v2/v3. The flavor_len setting is for v4. - * v2/v3 also need to know the difference between NULL and UNIX. + * The flavor_len setting is for v4 mounts. 
*/ static int nfs_parse_security_flavors(char *value, struct nfs_parsed_mount_data *mnt) @@ -868,57 +867,46 @@ static int nfs_parse_security_flavors(char *value, switch (match_token(value, nfs_secflavor_tokens, args)) { case Opt_sec_none: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_NULL; break; case Opt_sec_sys: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_UNIX; break; case Opt_sec_krb5: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; break; @@ -1480,6 +1468,7 @@ static int nfs_validate_mount_data(void *options, args->mount_server.port = 0; /* autobind unless user sets port */ args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; + args->auth_flavors[0] = RPC_AUTH_UNIX; switch (data->version) { case 1: @@ -1537,7 +1526,9 @@ static int nfs_validate_mount_data(void *options, args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; args->bsize = data->bsize; - args->auth_flavors[0] = data->pseudoflavor; + + if (data->flags & NFS_MOUNT_SECFLAVOUR) + args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) goto out_nomem; @@ -1601,9 +1592,6 @@ static int nfs_validate_mount_data(void *options, } } - if (!(args->flags & NFS_MOUNT_SECFLAVOUR)) - args->auth_flavors[0] = RPC_AUTH_UNIX; - #ifndef CONFIG_NFS_V3 if (args->flags & NFS_MOUNT_VER3) goto out_v3_not_compiled; -- cgit v1.2.3-70-g09d2 From 6738b2512bdf93ffbefe108b38a9165d5a718c3f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 16:33:54 -0400 Subject: NFS4: Set security flavor default for NFSv4 mounts like other defaults Set the default security flavor when we set the other mount option default values for NFSv4. This cleans up the NFSv4 mount option parsing path to look like the NFSv2/v3 one. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f7e7f20c91..4bbdbf6de41 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2125,6 +2125,8 @@ static int nfs4_validate_mount_data(void *options, args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ + args->auth_flavors[0] = RPC_AUTH_UNIX; + args->auth_flavor_len = 0; switch (data->version) { case 1: @@ -2140,18 +2142,13 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - switch (data->auth_flavourlen) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: + if (data->auth_flavourlen) { + if (data->auth_flavourlen > 1) + goto out_inval_auth; if (copy_from_user(&args->auth_flavors[0], data->auth_flavours, sizeof(args->auth_flavors[0]))) return -EFAULT; - break; - default: - goto out_inval_auth; } c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); @@ -2203,15 +2200,8 @@ static int nfs4_validate_mount_data(void *options, nfs_validate_transport_protocol(args); - switch (args->auth_flavor_len) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: - break; - default: + if (args->auth_flavor_len > 1) goto out_inval_auth; - } if (args->client_address == NULL) goto out_no_client_address; -- cgit v1.2.3-70-g09d2 From f45663ce5fb30f76a3414ab3ac69f4dd320e760a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Jun 2008 19:28:02 -0400 Subject: NFS: Allow either strict or sloppy mount option parsing The kernel's NFS client mount option parser currently doesn't allow unrecognized or incorrect mount options. This prevents misspellings or incorrectly specified mount options from possibly causing silent data corruption. However, NFS mount options are not standardized, so different operating systems can use differently spelled mount options to support similar features, or can support mount options which no other operating system supports. "Sloppy" mount option parsing, which allows the parser to ignore any option it doesn't recognize, is needed to support automounters that often use maps that are shared between heterogenous operating systems. The legacy mount command ignores the validity of the values of mount options entirely, except for the "sec=" and "proto=" options. If an incorrect value is specified, the out-of-range value is passed to the kernel; if a value is specified that contains non-numeric characters, it appears as though the legacy mount command sets that option to zero (probably incorrect behavior in general). In any case, this sets a precedent which we will partially follow for the kernel mount option parser: + if "sloppy" is not set, the parser will be strict about both unrecognized options (same as legacy) and invalid option values (stricter than legacy) + if "sloppy" is set, the parser will ignore unrecognized options and invalid option values (same as legacy) An "invalid" option value in this case means that either the type (integer, short, or string) or sign (for integer values) of the specified value is incorrect. This patch does two things: it changes the NFS client's mount option parsing loop so that it parses the whole string instead of failing at the first unrecognized option or invalid option value. An unrecognized option or an invalid option value cause the option to be skipped. 
Then, the patch adds a "sloppy" mount option that allows the parsing to succeed anyway if there were any problems during parsing. When parsing a set of options is complete, if there are errors and "sloppy" was specified, return success anyway. Otherwise, only return success if there are no errors. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 203 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 128 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4bbdbf6de41..47cf83e917b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -92,8 +92,8 @@ enum { Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, - /* Mount options that are ignored */ - Opt_userspace, Opt_deprecated, + /* Special mount options */ + Opt_userspace, Opt_deprecated, Opt_sloppy, Opt_err }; @@ -103,6 +103,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_userspace, "fg" }, { Opt_userspace, "retry=%s" }, + { Opt_sloppy, "sloppy" }, + { Opt_soft, "soft" }, { Opt_hard, "hard" }, { Opt_deprecated, "intr" }, @@ -917,15 +919,22 @@ static int nfs_parse_security_flavors(char *value, return 1; } +static void nfs_parse_invalid_value(const char *option) +{ + dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option); +} + /* * Error-check and convert a string of mount options from user space into - * a data structure + * a data structure. The whole mount string is processed; bad options are + * skipped as they are encountered. If there were no errors, return 1; + * otherwise return 0 (zero). */ static int nfs_parse_mount_options(char *raw, struct nfs_parsed_mount_data *mnt) { char *p, *string, *secdata; - int rc; + int rc, sloppy = 0, errors = 0; if (!raw) { dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); @@ -958,6 +967,10 @@ static int nfs_parse_mount_options(char *raw, token = match_token(p, nfs_mount_option_tokens, args); switch (token) { + + /* + * boolean options: foo/nofoo + */ case Opt_soft: mnt->flags |= NFS_MOUNT_SOFT; break; @@ -1025,103 +1038,145 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_UNSHARED; break; + /* + * options that take numeric values + */ case Opt_port: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; - mnt->nfs_server.port = option; + if (match_int(args, &option) || + option < 0 || option > USHORT_MAX) { + errors++; + nfs_parse_invalid_value("port"); + } else + mnt->nfs_server.port = option; break; case Opt_rsize: - if (match_int(args, &mnt->rsize)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("rsize"); + } else + mnt->rsize = option; break; case Opt_wsize: - if (match_int(args, &mnt->wsize)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("wsize"); + } else + mnt->wsize = option; break; case Opt_bsize: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->bsize = option; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("bsize"); + } else + mnt->bsize = option; break; case Opt_timeo: - if (match_int(args, &mnt->timeo)) - return 0; + if (match_int(args, &option) || option <= 0) { + errors++; + nfs_parse_invalid_value("timeo"); + } else + mnt->timeo = option; break; case Opt_retrans: - if (match_int(args, &mnt->retrans)) - return 0; + if (match_int(args, &option) || option <= 0) { + errors++; + 
nfs_parse_invalid_value("retrans"); + } else + mnt->retrans = option; break; case Opt_acregmin: - if (match_int(args, &mnt->acregmin)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acregmin"); + } else + mnt->acregmin = option; break; case Opt_acregmax: - if (match_int(args, &mnt->acregmax)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acregmax"); + } else + mnt->acregmax = option; break; case Opt_acdirmin: - if (match_int(args, &mnt->acdirmin)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acdirmin"); + } else + mnt->acdirmin = option; break; case Opt_acdirmax: - if (match_int(args, &mnt->acdirmax)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("acdirmax"); + } else + mnt->acdirmax = option; break; case Opt_actimeo: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->acregmin = - mnt->acregmax = - mnt->acdirmin = - mnt->acdirmax = option; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("actimeo"); + } else + mnt->acregmin = mnt->acregmax = + mnt->acdirmin = mnt->acdirmax = option; break; case Opt_namelen: - if (match_int(args, &mnt->namlen)) - return 0; + if (match_int(args, &option) || option < 0) { + errors++; + nfs_parse_invalid_value("namlen"); + } else + mnt->namlen = option; break; case Opt_mountport: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; - mnt->mount_server.port = option; + if (match_int(args, &option) || + option < 0 || option > USHORT_MAX) { + errors++; + nfs_parse_invalid_value("mountport"); + } else + mnt->mount_server.port = option; break; case Opt_mountvers: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->mount_server.version = option; + if (match_int(args, &option) || + option < NFS_MNT_VERSION || + option > NFS_MNT3_VERSION) { + errors++; + nfs_parse_invalid_value("mountvers"); + } else + mnt->mount_server.version = option; break; case Opt_nfsvers: - if (match_int(args, &option)) - return 0; + if (match_int(args, &option)) { + errors++; + nfs_parse_invalid_value("nfsvers"); + break; + } switch (option) { - case 2: + case NFS2_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; break; - case 3: + case NFS3_VERSION: mnt->flags |= NFS_MOUNT_VER3; break; default: - goto out_unrec_vers; + errors++; + nfs_parse_invalid_value("nfsvers"); } break; + /* + * options that take text values + */ case Opt_sec: string = match_strdup(args); if (string == NULL) goto out_nomem; rc = nfs_parse_security_flavors(string, mnt); kfree(string); - if (!rc) - goto out_unrec_sec; + if (!rc) { + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "security flavor\n"); + } break; case Opt_proto: string = match_strdup(args); @@ -1146,7 +1201,9 @@ static int nfs_parse_mount_options(char *raw, mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; break; default: - goto out_unrec_xprt; + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); } break; case Opt_mountproto: @@ -1166,7 +1223,9 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_xprt_rdma: /* not used for side protocols */ default: - goto out_unrec_xprt; + errors++; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); } break; case Opt_addr: @@ -1204,6 +1263,13 @@ static int nfs_parse_mount_options(char *raw, kfree(string); break; + /* + * Special options + 
*/ + case Opt_sloppy: + sloppy = 1; + dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); + break; case Opt_userspace: case Opt_deprecated: dfprintk(MOUNT, "NFS: ignoring mount option " @@ -1211,7 +1277,9 @@ static int nfs_parse_mount_options(char *raw, break; default: - goto out_unknown; + errors++; + dfprintk(MOUNT, "NFS: unrecognized mount option " + "'%s'\n", p); } } @@ -1224,21 +1292,6 @@ out_security_failure: free_secdata(secdata); printk(KERN_INFO "NFS: security options invalid: %d\n", rc); return 0; -out_unrec_vers: - printk(KERN_INFO "NFS: unrecognized NFS version number\n"); - return 0; - -out_unrec_xprt: - printk(KERN_INFO "NFS: unrecognized transport protocol\n"); - return 0; - -out_unrec_sec: - printk(KERN_INFO "NFS: unrecognized security flavor\n"); - return 0; - -out_unknown: - printk(KERN_INFO "NFS: unknown mount option: %s\n", p); - return 0; } /* -- cgit v1.2.3-70-g09d2 From 9cabcdbd4638cf884839ee4cd15780800c223b90 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 10 Jul 2008 15:54:12 +0100 Subject: [GFS2] Replace rgrp "recent list" with mru list This patch removes the "recent list" which is used during allocation and replaces it with the (already existing) mru list used during deletion. The "recent list" was not a true mru list leading to a number of inefficiencies including a "next" function which made scanning the list an order N^2 operation wrt to the number of list elements. This should increase allocation performance with large numbers of rgrps. Its also a useful preparation and cleanup before some further changes which are planned in this area. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 2 - fs/gfs2/ops_fstype.c | 1 - fs/gfs2/rgrp.c | 108 ++++++--------------------------------------------- 3 files changed, 12 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4b734c6e34f..4ab3c3a4a96 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -77,7 +77,6 @@ struct gfs2_rgrp_host { struct gfs2_rgrpd { struct list_head rd_list; /* Link with superblock */ struct list_head rd_list_mru; - struct list_head rd_recent; /* Recently used rgrps */ struct gfs2_glock *rd_gl; /* Glock for this rgrp */ u64 rd_addr; /* grp block disk address */ u64 rd_data0; /* first data location */ @@ -529,7 +528,6 @@ struct gfs2_sbd { struct mutex sd_rindex_mutex; struct list_head sd_rindex_list; struct list_head sd_rindex_mru_list; - struct list_head sd_rindex_recent_list; struct gfs2_rgrpd *sd_rindex_forward; unsigned int sd_rgrps; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6ba69dd1a72..b4d1d649063 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mutex_init(&sdp->sd_rindex_mutex); INIT_LIST_HEAD(&sdp->sd_rindex_list); INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); - INIT_LIST_HEAD(&sdp->sd_rindex_recent_list); INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3401628d742..2d90fb25350 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) spin_lock(&sdp->sd_rindex_spin); sdp->sd_rindex_forward = NULL; - head = &sdp->sd_rindex_recent_list; - while (!list_empty(head)) { - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); - list_del(&rgd->rd_recent); - } spin_unlock(&sdp->sd_rindex_spin); head = &sdp->sd_rindex_list; @@ -944,107 +939,30 @@ static struct inode *try_rgrp_unlink(struct 
gfs2_rgrpd *rgd, u64 *last_unlinked) return NULL; } -/** - * recent_rgrp_first - get first RG from "recent" list - * @sdp: The GFS2 superblock - * @rglast: address of the rgrp used last - * - * Returns: The first rgrp in the recent list - */ - -static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, - u64 rglast) -{ - struct gfs2_rgrpd *rgd; - - spin_lock(&sdp->sd_rindex_spin); - - if (rglast) { - list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgrp_contains_block(rgd, rglast)) - goto out; - } - } - rgd = NULL; - if (!list_empty(&sdp->sd_rindex_recent_list)) - rgd = list_entry(sdp->sd_rindex_recent_list.next, - struct gfs2_rgrpd, rd_recent); -out: - spin_unlock(&sdp->sd_rindex_spin); - return rgd; -} - /** * recent_rgrp_next - get next RG from "recent" list * @cur_rgd: current rgrp - * @remove: * * Returns: The next rgrp in the recent list */ -static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, - int remove) +static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) { struct gfs2_sbd *sdp = cur_rgd->rd_sbd; struct list_head *head; struct gfs2_rgrpd *rgd; spin_lock(&sdp->sd_rindex_spin); - - head = &sdp->sd_rindex_recent_list; - - list_for_each_entry(rgd, head, rd_recent) { - if (rgd == cur_rgd) { - if (cur_rgd->rd_recent.next != head) - rgd = list_entry(cur_rgd->rd_recent.next, - struct gfs2_rgrpd, rd_recent); - else - rgd = NULL; - - if (remove) - list_del(&cur_rgd->rd_recent); - - goto out; - } + head = &sdp->sd_rindex_mru_list; + if (unlikely(cur_rgd->rd_list_mru.next == head)) { + spin_unlock(&sdp->sd_rindex_spin); + return NULL; } - - rgd = NULL; - if (!list_empty(head)) - rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); - -out: + rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); spin_unlock(&sdp->sd_rindex_spin); return rgd; } -/** - * recent_rgrp_add - add an RG to tail of "recent" list - * @new_rgd: The rgrp to add - * - */ - -static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd) -{ - struct gfs2_sbd *sdp = new_rgd->rd_sbd; - struct gfs2_rgrpd *rgd; - unsigned int count = 0; - unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp); - - spin_lock(&sdp->sd_rindex_spin); - - list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { - if (rgd == new_rgd) - goto out; - - if (++count >= max) - goto out; - } - list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list); - -out: - spin_unlock(&sdp->sd_rindex_spin); -} - /** * forward_rgrp_get - get an rgrp to try next from full list * @sdp: The GFS2 superblock @@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) int loops = 0; int error, rg_locked; - /* Try recently successful rgrps */ - - rgd = recent_rgrp_first(sdp, ip->i_goal); + rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { rg_locked = 0; @@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) return inode; - rgd = recent_rgrp_next(rgd, 1); - break; - + /* fall through */ case GLR_TRYFAILED: - rgd = recent_rgrp_next(rgd, 0); + rgd = recent_rgrp_next(rgd); break; default: @@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) out: if (begin) { - recent_rgrp_add(rgd); + spin_lock(&sdp->sd_rindex_spin); + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); rgd = gfs2_rgrpd_get_next(rgd); if (!rgd) rgd = gfs2_rgrpd_get_first(sdp); -- 
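A minimal sketch of the MRU behaviour this patch relies on, using hypothetical names (struct item, mark_used, next_used) rather than anything from the GFS2 code: an element is moved to the front of the list each time it is used, and the next candidate is simply the following list entry, so no quadratic scan is needed.

#include <linux/list.h>
#include <linux/spinlock.h>

struct item {
        struct list_head mru;   /* corresponds loosely to rd_list_mru */
};

/* Move an item to the front of the MRU list when it is used. */
static void mark_used(struct item *it, struct list_head *mru_list,
                      spinlock_t *lock)
{
        spin_lock(lock);
        list_move(&it->mru, mru_list);
        spin_unlock(lock);
}

/* Walk to the next most-recently-used item, or NULL at the end. */
static struct item *next_used(struct item *it, struct list_head *mru_list,
                              spinlock_t *lock)
{
        struct item *next = NULL;

        spin_lock(lock);
        if (it->mru.next != mru_list)
                next = list_entry(it->mru.next, struct item, mru);
        spin_unlock(lock);
        return next;
}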
cgit v1.2.3-70-g09d2 From c9f6a6bbc284ba87337876086f7e2e6e0b0d50dd Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 10 Jul 2008 16:09:29 +0100 Subject: [GFS2] Remove support for unused and pointless flag The ability to mark files for direct i/o access when opened normally is both unused and pointless, so this patch removes support for that feature. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 - fs/gfs2/inode.c | 5 ----- fs/gfs2/ops_file.c | 19 ++----------------- fs/gfs2/super.c | 1 - fs/gfs2/sys.c | 2 -- 5 files changed, 2 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4ab3c3a4a96..448697a5c46 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -421,7 +421,6 @@ struct gfs2_tune { unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ unsigned int gt_atime_quantum; /* Min secs between atime updates */ unsigned int gt_new_files_jdata; - unsigned int gt_new_files_directio; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ unsigned int gt_stall_secs; /* Detects trouble! */ unsigned int gt_complain_secs; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index caf40908335..6da0ab355b8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -789,12 +789,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || gfs2_tune_get(sdp, gt_new_files_jdata)) di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) || - gfs2_tune_get(sdp, gt_new_files_directio)) - di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO); } else if (S_ISDIR(mode)) { - di->di_flags |= cpu_to_be32(dip->i_di.di_flags & - GFS2_DIF_INHERIT_DIRECTIO); di->di_flags |= cpu_to_be32(dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA); } diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 1737af98a42..21b397d0a29 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -134,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = { [7] = GFS2_DIF_NOATIME, [12] = GFS2_DIF_EXHASH, [14] = GFS2_DIF_INHERIT_JDATA, - [20] = GFS2_DIF_INHERIT_DIRECTIO, }; static const u32 gfs2_to_fsflags[32] = { @@ -143,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = { [gfs2fl_AppendOnly] = FS_APPEND_FL, [gfs2fl_NoAtime] = FS_NOATIME_FL, [gfs2fl_ExHash] = FS_INDEX_FL, - [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL, [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, }; @@ -161,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) return error; fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); - if (!S_ISDIR(inode->i_mode)) { - if (ip->i_di.di_flags & GFS2_DIF_JDATA) - fsflags |= FS_JOURNAL_DATA_FL; - if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) - fsflags |= FS_DIRECTIO_FL; - } + if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; if (put_user(fsflags, ptr)) error = -EFAULT; @@ -195,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode) /* Flags that can be set by user space */ #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ - GFS2_DIF_DIRECTIO| \ GFS2_DIF_IMMUTABLE| \ GFS2_DIF_APPENDONLY| \ GFS2_DIF_NOATIME| \ GFS2_DIF_SYNC| \ GFS2_DIF_SYSTEM| \ - GFS2_DIF_INHERIT_DIRECTIO| \ GFS2_DIF_INHERIT_JDATA) /** @@ -292,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) if (!S_ISDIR(inode->i_mode)) { if (gfsflags & GFS2_DIF_INHERIT_JDATA) gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO) - gfsflags ^= 
(GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO); return do_gfs2_set_flags(filp, gfsflags, ~0); } return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); @@ -494,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file) goto fail_gunlock; } - /* Listen to the Direct I/O flag */ - - if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) - file->f_flags |= O_DIRECT; - gfs2_glock_dq_uninit(&i_gh); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 12fe38fe498..63a8a902d9d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_quantum = 60; gt->gt_atime_quantum = 3600; gt->gt_new_files_jdata = 0; - gt->gt_new_files_directio = 0; gt->gt_max_readahead = 1 << 18; gt->gt_stall_secs = 600; gt->gt_complain_secs = 10; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 6f7e2e5858e..74846559fc3 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -412,7 +412,6 @@ TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); TUNE_ATTR(new_files_jdata, 0); -TUNE_ATTR(new_files_directio, 0); TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(quota_cache_secs, 1); TUNE_ATTR(stall_secs, 1); @@ -441,7 +440,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_quotad_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, - &tune_attr_new_files_directio.attr, NULL, }; -- cgit v1.2.3-70-g09d2 From a93a6ce24215c69126c88f9c488afa50a168e0ca Mon Sep 17 00:00:00 2001 From: Li Xiaodong Date: Mon, 7 Jul 2008 18:04:09 +0800 Subject: [GFS2] Remove unused declaration The implementation of gfs2_inode_attr_in is removed. So remove its declaration. Signed-off-by: Li Xiaodong Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 04e9fef3f99..6074c2506f7 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, } -void gfs2_inode_attr_in(struct gfs2_inode *ip); void gfs2_set_iop(struct inode *inode); struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino, -- cgit v1.2.3-70-g09d2 From e988cf1cfed4ed80bf40528e655fe18bed6a38b6 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 10 Jul 2008 09:25:39 -0700 Subject: ocfs2: Fix flags in ocfs2_file_lock The stack-glue merge changed the way we use flags in dlmglue in that we now use the fs/dlm equivalents. Unfortunately, a merge error left the new flock code only partially updated. This took a while to show up though, because the lock level constants are actually identical between o2dlm and fs/dlm. The *_CONVERT and *_NOQUEUE flags have different values though, which is eventually causing a crash in flags_to_o2dlm(). Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 394d25a131a..80e20d9f278 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1554,8 +1554,8 @@ out: */ int ocfs2_file_lock(struct file *file, int ex, int trylock) { - int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; - unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; + int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned int lkm_flags = trylock ? 
DLM_LKF_NOQUEUE : 0; unsigned long flags; struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; @@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) * Get the lock at NLMODE to start - that way we * can cancel the upconvert request if need be. */ - ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); + ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0); if (ret < 0) { mlog_errno(ret); goto out; @@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) } lockres->l_action = OCFS2_AST_CONVERT; - lkm_flags |= LKM_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); @@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file) if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) return; - if (lockres->l_level == LKM_NLMODE) + if (lockres->l_level == DLM_LOCK_NL) return; mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", @@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file) lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); lockres->l_blocking = DLM_LOCK_EX; - gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); + gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL); lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); + ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen); if (ret) { mlog_errno(ret); return; -- cgit v1.2.3-70-g09d2 From 96a8e13ed44e380fc2bb6c711d74d5ba698c00b2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 10 Jul 2008 21:19:20 +0100 Subject: exec: fix stack excutability without PT_GNU_STACK Kernel Bugzilla #11063 points out that on some architectures (e.g. x86_32) exec'ing an ELF without a PT_GNU_STACK program header should default to an executable stack; but this got broken by the unlimited argv feature because stack vma is now created before the right personality has been established: so breaking old binaries using nested function trampolines. Therefore re-evaluate VM_STACK_FLAGS in setup_arg_pages, where stack vm_flags used to be set, before the mprotect_fixup. Checking through our existing VM_flags, none would have changed since insert_vm_struct: so this seems safer than finding a way through the personality labyrinth. Reported-by: pageexec@freemail.hu Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index da94a6f05df..fd9234379e8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -610,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->exec -= stack_shift; down_write(&mm->mmap_sem); - vm_flags = vma->vm_flags; + vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for -- cgit v1.2.3-70-g09d2 From 4abaca17e758e3326c96ced88b2cd9b7b84922f6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Jul 2008 14:39:56 +0100 Subject: [GFS2] Fix GFS2's use of do_div() in its quota calculations Fix GFS2's need_sync()'s use of do_div() on an s64 by using div_s64() instead. This does assume that gt_quota_scale_den can be cast to an s32. This was introduced by patch b3b94faa5fe5968827ba0640ee9fba4b3e7f736e. 
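A minimal sketch (not taken from the patch) of why the substitution matters: do_div() operates on an unsigned 64-bit dividend that it modifies in place, so it is not safe on a signed value, whereas div_s64() performs the signed division and returns the quotient, given a divisor that fits in an s32, which is the same assumption the patch makes about gt_quota_scale_den.

#include <linux/math64.h>
#include <linux/types.h>

/* Hypothetical helper, for illustration only. */
static s64 scale_and_divide(s64 value, s32 num, s32 den)
{
        value *= num;                   /* scale the signed value up */
        return div_s64(value, den);     /* then divide, preserving the sign */
}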
Signed-off-by: David Howells Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 56aaf915c59..3e073f5144f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd) do_sync = 0; else { value *= gfs2_jindex_size(sdp) * num; - do_div(value, den); + value = div_s64(value, den); value += (s64)be64_to_cpu(qd->qd_qb.qb_value); if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) do_sync = 0; -- cgit v1.2.3-70-g09d2 From 0533400b7813df6c22a171499434d30bd57e799c Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Mon, 7 Jul 2008 07:45:59 -0500 Subject: [JFFS2] Use .unlocked_ioctl This changes the .ioctl to the .unlocked_ioctl version. Signed-off-by: Stoyan Gaydarov Signed-off-by: David Woodhouse --- fs/jffs2/dir.c | 2 +- fs/jffs2/file.c | 2 +- fs/jffs2/ioctl.c | 3 +-- fs/jffs2/os-linux.h | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c0c141f6fde..cd219ef5525 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -38,7 +38,7 @@ const struct file_operations jffs2_dir_operations = { .read = generic_read_dir, .readdir = jffs2_readdir, - .ioctl = jffs2_ioctl, + .unlocked_ioctl=jffs2_ioctl, .fsync = jffs2_fsync }; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5e920343b2c..5a98aa87c85 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -46,7 +46,7 @@ const struct file_operations jffs2_file_operations = .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = generic_file_aio_write, - .ioctl = jffs2_ioctl, + .unlocked_ioctl=jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c index e2177210f62..9d41f43e47b 100644 --- a/fs/jffs2/ioctl.c +++ b/fs/jffs2/ioctl.c @@ -12,8 +12,7 @@ #include #include "nodelist.h" -int jffs2_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg) +long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { /* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which will include compression support etc. */ diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 2cc866cf134..5e194a5c8e2 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -167,7 +167,7 @@ int jffs2_fsync(struct file *, struct dentry *, int); int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); /* ioctl.c */ -int jffs2_ioctl(struct inode *, struct file *, unsigned int, unsigned long); +long jffs2_ioctl(struct file *, unsigned int, unsigned long); /* symlink.c */ extern const struct inode_operations jffs2_symlink_inode_operations; -- cgit v1.2.3-70-g09d2 From 49641f1acfdfd437ed9b0a70b86bf36626c02afe Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 11 Jul 2008 17:43:55 +1000 Subject: Fix reference counting race on log buffers When we release the iclog, we do an atomic_dec_and_lock to determine if we are the last reference and need to trigger update of log headers and writeout. However, in xlog_state_get_iclog_space() we also need to check if we have the last reference count there. If we do, we release the log buffer, otherwise we decrement the reference count. But the compare and decrement in xlog_state_get_iclog_space() is not atomic, so both places can see a reference count of 2 and neither will release the iclog. 
That leads to a filesystem hang. Close the race by replacing the atomic_read() and atomic_dec() pair with atomic_add_unless() to ensure that they are executed atomically. Signed-off-by: Dave Chinner Reviewed-by: Tim Shimmin Tested-by: Eric Sandeen Signed-off-by: Linus Torvalds --- fs/xfs/xfs_log.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index afaee301b0e..ad3d26ddfe3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2427,13 +2427,20 @@ restart: if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - /* If I'm the only one writing to this iclog, sync it to disk */ - if (atomic_read(&iclog->ic_refcnt) == 1) { + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ spin_unlock(&log->l_icloglock); - if ((error = xlog_state_release_iclog(log, iclog))) + error = xlog_state_release_iclog(log, iclog); + if (error) return error; } else { - atomic_dec(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); } goto restart; -- cgit v1.2.3-70-g09d2 From 91ef4caf800030fa6e5224b8a41f8c74787b303d Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: handle corrupted orphan list at mount If the orphan node list includes valid, untruncatable nodes with nlink > 0 the ext4_orphan_cleanup loop which attempts to delete them will not do so, causing it to loop forever. Fix by checking for such nodes in the ext4_orphan_get function. This patch fixes the second case (image hdb.20000009.softlockup.gz) reported in http://bugzilla.kernel.org/show_bug.cgi?id=10882. Signed-off-by: Duane Griffin Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/ialloc.c | 9 +++++++++ fs/ext4/inode.c | 20 ++++++++++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8158083f7ac..f7a0758f468 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1039,6 +1039,7 @@ extern void ext4_discard_reservation (struct inode *); extern void ext4_dirty_inode(struct inode *); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate (struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c6efbab0c80..11cafe14aa2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -817,6 +817,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. 
+ */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -838,6 +846,7 @@ bad_orphan: printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d970774641..269763b6636 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2305,6 +2305,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } +int ext4_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + /* * ext4_truncate() * @@ -2349,12 +2362,7 @@ void ext4_truncate(struct inode *inode) unsigned blocksize = inode->i_sb->s_blocksize; struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext4_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!ext4_can_truncate(inode)) return; /* -- cgit v1.2.3-70-g09d2 From 71dc8fbcf5f6363342bd636a646eeac7cfef25c3 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: handle deleting corrupted indirect blocks While freeing indirect blocks we attach a journal head to the parent buffer head, free the blocks, then journal the parent. If the indirect block list is corrupted and points to the parent the journal head will be detached when the block is cleared, causing an OOPS. Check for that explicitly and handle it gracefully. This patch fixes the third case (image hdb.20000057.nullderef.gz) reported in http://bugzilla.kernel.org/show_bug.cgi?id=10882. Signed-off-by: Duane Griffin Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 269763b6636..7cce96a6935 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2179,7 +2179,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext4_journal_dirty_metadata(handle, this_bh); + else + ext4_error(inode->i_sb, __func__, + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long) this_bh->b_blocknr); } } -- cgit v1.2.3-70-g09d2 From f3b35f063e9a795495fe2f7a2fe55fab11f8ab12 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: validate directory entry data before use ext4_dx_find_entry uses ext4_next_entry without verifying that the entry is valid. If its rec_len == 0 this causes an infinite loop. Refactor the loop to check the validity of entries before checking whether they match and moving onto the next one. 
There are other uses of ext4_next_entry in this file which also look problematic. They should be reviewed and fixed if/when we have a test-case that triggers them. This patch fixes the first case (image hdb.25.softlockup.gz) reported in http://bugzilla.kernel.org/show_bug.cgi?id=10882. Signed-off-by: Duane Griffin Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ab16beaa830..384f1222602 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -993,19 +993,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) - if (ext4_match (namelen, name, de)) { - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, - (block<b_data))) { - brelse (bh); + for (; de < top; de = ext4_next_entry(de)) { + int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); + + if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { + brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; } - *res_dir = de; - dx_release (frames); - return bh; + + if (ext4_match(namelen, name, de)) { + *res_dir = de; + dx_release(frames); + return bh; + } } brelse (bh); /* Check to see if we should continue to search */ -- cgit v1.2.3-70-g09d2 From e7dfb2463e3c1b10c38372023e0186d25dec1fa6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Fix mb_find_next_bit not to return larger than max Some architectures implement ext4_find_next_bit and ext4_find_next_zero_bit in such a way that they return greater than max for some input values. Make sure mb_find_next_bit and mb_find_next_zero_bit return the right values. On 2.6.25 we have include/asm-x86/bitops_32.h static inline unsigned find_first_bit(const unsigned long *addr, unsigned size) { unsigned x = 0; while (x < size) { unsigned long val = *addr++; if (val) return __ffs(val) + x; x += (sizeof(*addr)<<3); } return x; } This can return value greater than size. 
Reported and fixed here for lustre https://bugzilla.lustre.org/show_bug.cgi?id=15932 https://bugzilla.lustre.org/attachment.cgi?id=17205 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c9900aade15..ba3aad27f44 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) static inline int mb_find_next_zero_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_zero_bit(addr, max, start) - fix; + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_bit(addr, max, start) - fix; + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) @@ -3473,8 +3479,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - if (next > end) - next = end; start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + le32_to_cpu(sbi->s_es->s_first_data_block); mb_debug(" free preallocated %u/%u in group %u\n", -- cgit v1.2.3-70-g09d2 From 8a35694e1181a5c6d6496dca09407fc7fa4c86c2 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: fix comments to say "ext4" Change second/third to fourth. Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/ext4_i.h | 2 +- fs/ext4/ext4_sb.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f7a0758f468..ea8e4bc097c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -22,7 +22,7 @@ #include "ext4_i.h" /* - * The second extended filesystem constants/structures + * The fourth extended filesystem constants/structures */ /* diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 26a4ae255d7..abf2744164e 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -79,7 +79,7 @@ struct ext4_ext_cache { }; /* - * third extended file system inode data in memory + * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 5802e69f219..4de9a75ca6a 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -25,7 +25,7 @@ #include /* - * third extended-fs super-block data in memory + * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ -- cgit v1.2.3-70-g09d2 From ed8f9c751feb3aebf7c0dd25e61481a16412bd6e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: start searching for the right extent from the goal group. With mballoc we search for the best extent using different criteria. 
We should always use the goal group when we are starting with a new criteria. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ba3aad27f44..bdb9f299157 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1736,10 +1736,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } - - /* searching for the right group start from the goal value specified */ - group = ac->ac_g_ex.fe_group; - /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -1749,6 +1745,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; -- cgit v1.2.3-70-g09d2 From 07d45f126712fea3a9f44068bf65e0a26a162286 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Use BUG_ON() instead of BUG() if (...) BUG(); should be replaced with BUG_ON(...) when the test has no side-effects to allow a definition of BUG_ON that drops the code completely. The semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @ disable unlikely @ expression E,f; @@ ( if (<... f(...) ...>) { BUG(); } | - if (unlikely(E)) { BUG(); } + BUG_ON(E); ) @@ expression E,f; @@ ( if (<... f(...) 
...>) { BUG(); } | - if (E) { BUG(); } + BUG_ON(E); ) // Signed-off-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9cc80b9cc8d..c29d774abe5 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -409,8 +409,7 @@ restart: prev = rsv; } printk("Window map complete.\n"); - if (bad) - BUG(); + BUG_ON(bad); } #define rsv_window_dump(root, verbose) \ __rsv_window_dump((root), (verbose), __func__) -- cgit v1.2.3-70-g09d2 From 91d99827791fdd5f9424458ad5ae870f89dbcadf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: switch to seq_files Signed-off-by: Alexey Dobriyan Cc: Mingming Cao Signed-off-by: "Theodore Ts'o" Signed-off-by: Andrew Morton --- fs/ext4/mballoc.c | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bdb9f299157..6280ad3829d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2583,25 +2583,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb) -#define MB_PROC_VALUE_READ(name) \ -static int ext4_mb_read_##name(char *page, char **start, \ - off_t off, int count, int *eof, void *data) \ +#define MB_PROC_FOPS(name) \ +static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ { \ - struct ext4_sb_info *sbi = data; \ - int len; \ - *eof = 1; \ - if (off != 0) \ - return 0; \ - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ - *start = page; \ - return len; \ -} - -#define MB_PROC_VALUE_WRITE(name) \ -static int ext4_mb_write_##name(struct file *file, \ - const char __user *buf, unsigned long cnt, void *data) \ + struct ext4_sb_info *sbi = m->private; \ + \ + seq_printf(m, "%ld\n", sbi->s_mb_##name); \ + return 0; \ +} \ + \ +static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ +{ \ + return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ +} \ + \ +static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ + const char __user *buf, size_t cnt, loff_t *ppos) \ { \ - struct ext4_sb_info *sbi = data; \ + struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ char str[32]; \ long value; \ if (cnt >= sizeof(str)) \ @@ -2613,31 +2612,32 @@ static int ext4_mb_write_##name(struct file *file, \ return -ERANGE; \ sbi->s_mb_##name = value; \ return cnt; \ -} +} \ + \ +static const struct file_operations ext4_mb_##name##_proc_fops = { \ + .owner = THIS_MODULE, \ + .open = ext4_mb_##name##_proc_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + .write = ext4_mb_##name##_proc_write, \ +}; -MB_PROC_VALUE_READ(stats); -MB_PROC_VALUE_WRITE(stats); -MB_PROC_VALUE_READ(max_to_scan); -MB_PROC_VALUE_WRITE(max_to_scan); -MB_PROC_VALUE_READ(min_to_scan); -MB_PROC_VALUE_WRITE(min_to_scan); -MB_PROC_VALUE_READ(order2_reqs); -MB_PROC_VALUE_WRITE(order2_reqs); -MB_PROC_VALUE_READ(stream_request); -MB_PROC_VALUE_WRITE(stream_request); -MB_PROC_VALUE_READ(group_prealloc); -MB_PROC_VALUE_WRITE(group_prealloc); +MB_PROC_FOPS(stats); +MB_PROC_FOPS(max_to_scan); +MB_PROC_FOPS(min_to_scan); +MB_PROC_FOPS(order2_reqs); +MB_PROC_FOPS(stream_request); +MB_PROC_FOPS(group_prealloc); #define MB_PROC_HANDLER(name, var) \ do { \ - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ + proc = proc_create_data(name, mode, sbi->s_mb_proc, \ + 
&ext4_mb_##var##_proc_fops, sbi); \ if (proc == NULL) { \ printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ goto err_out; \ } \ - proc->data = sbi; \ - proc->read_proc = ext4_mb_read_##var ; \ - proc->write_proc = ext4_mb_write_##var; \ } while (0) static int ext4_mb_init_per_dev_proc(struct super_block *sb) -- cgit v1.2.3-70-g09d2 From 69baee062a044ef1588e423e52131710e7584d1a Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: improve some code in rb tree part of dir.c * remove unnecessary code in free_rb_tree_fname * rename free_rb_tree_fname to ext4_htree_create_dir_info since it and ext4_htree_free_dir_info are a pair * replace kmalloc with kzalloc in ext4_htree_free_dir_info All these make the code more readable and simple. PS: this patch is also suitable for ext3. Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2bf0331ea19..5ed5108766c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -272,7 +272,7 @@ static void free_rb_tree_fname(struct rb_root *root) while (n) { /* Do the node's children first */ - if ((n)->rb_left) { + if (n->rb_left) { n = n->rb_left; continue; } @@ -301,24 +301,18 @@ static void free_rb_tree_fname(struct rb_root *root) parent->rb_right = NULL; n = parent; } - root->rb_node = NULL; } -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; p->curr_hash = pos2maj_hash(pos); p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; return p; } @@ -433,7 +427,7 @@ static int ext4_dx_readdir(struct file * filp, int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext4_htree_create_dir_info(filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; -- cgit v1.2.3-70-g09d2 From 31b481dc7c249eac0a108ec5dfc0d4aef2217e39 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Fix ext4_mb_init_cache return error ext4_mb_init_cache() incorrectly always return EIO on success. This causes the caller of ext4_mb_init_cache() fail when it checks the return value. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6280ad3829d..d429014071c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -809,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (!buffer_uptodate(bh[i])) goto out; + err = 0; first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; -- cgit v1.2.3-70-g09d2 From fdf6c7a7683c6272e953a33358920e98a4d93cf0 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: add error processing when calling ext4_mb_init_cache in mballoc Add error processing for ext4_mb_load_buddy when it calls ext4_mb_init_cache. 
Signed-off-by: Shen Feng Reviewed-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d429014071c..b64600be206 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -890,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, int pnum; int poff; struct page *page; + int ret; mb_debug("load group %lu\n", group); @@ -921,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -945,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) - ext4_mb_init_cache(page, e4b->bd_bitmap); - + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -969,7 +982,7 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - return -EIO; + return ret; } static void ext4_mb_release_desc(struct ext4_buddy *e4b) -- cgit v1.2.3-70-g09d2 From 74767c5a2dca0a60676d60d36377a41f60ca42ba Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: miscellaneous error checks and coding cleanups for mballoc ext4_mb_seq_history_open(): check if sbi->s_mb_history is NULL ext4_mb_history_init(): replace kmalloc and memset with kzalloc ext4_mb_init_backend(): remove memset since kzalloc is used ext4_mb_init(): the return value of ext4_mb_init_backend is int, but i is unsigned, replace it with a new int variable. 
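As a brief, hypothetical illustration of the last point (none of these names come from the patch): a negative error code stored in an unsigned variable no longer tests as negative, so keeping the return value in an int preserves the usual error checks.

#include <linux/errno.h>
#include <linux/kernel.h>

static int may_fail(void)
{
        return -EIO;                    /* hypothetical failing helper */
}

static void caller(void)
{
        unsigned int u = may_fail();    /* -EIO wraps to a large positive value */
        int ret = may_fail();           /* keeps its sign */

        if (ret < 0)                    /* behaves as intended */
                printk(KERN_ERR "helper failed: %d\n", ret);

        /* An equivalent "u < 0" test can never be true, since an
         * unsigned value is never negative. */
        (void)u;
}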
Signed-off-by: Shen Feng Reviewed-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b64600be206..6d69dd92aad 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1985,6 +1985,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) int rc; int size; + if (unlikely(sbi->s_mb_history == NULL)) + return -ENOMEM; s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) return -ENOMEM; @@ -2187,9 +2189,7 @@ static void ext4_mb_history_init(struct super_block *sb) sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); - if (likely(sbi->s_mb_history != NULL)) - memset(sbi->s_mb_history, 0, i); + sbi->s_mb_history = kzalloc(i, GFP_KERNEL); /* if we can't allocate history, then we simple won't use it */ } @@ -2303,7 +2303,6 @@ static int ext4_mb_init_backend(struct super_block *sb) i++; goto err_freebuddy; } - memset(meta_group_info[j], 0, len); set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[j]->bb_state)); @@ -2358,6 +2357,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned i; unsigned offset; unsigned max; + int ret; if (!test_opt(sb, MBALLOC)) return 0; @@ -2392,12 +2392,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } while (i <= sb->s_blocksize_bits + 1); /* init file for buddy data */ - i = ext4_mb_init_backend(sb); - if (i) { + ret = ext4_mb_init_backend(sb); + if (ret != 0) { clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - return i; + return ret; } spin_lock_init(&sbi->s_md_lock); -- cgit v1.2.3-70-g09d2 From 3537576a707c6df98e883b77b854c6083f844602 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: remove double definitions of xattr macros remove the definitions of macros XATTR_TRUSTED_PREFIX and XATTR_USER_PREFIX since they are defined in linux/xattr.h Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr_trusted.c | 4 +--- fs/ext4/xattr_user.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index fff33382cad..ac1a52cf2a3 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -13,13 +13,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 67be723fcc4..d91aa61b42a 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -12,13 +12,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." 
- static size_t ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) -- cgit v1.2.3-70-g09d2 From 574ca174c97f790086e3e6f2251381420ad38fd0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Rename read_block_bitmap() to ext4_read_block_bitmap() Since this a non-static function, make it be ext4 specific to avoid conflicts with potentially other filesystems. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 12 ++++++------ fs/ext4/group.h | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/mballoc.c | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c29d774abe5..ba411233cc2 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -295,7 +295,7 @@ err_out: return 0; } /** - * read_block_bitmap() + * ext4_read_block_bitmap() * @sb: super block * @block_group: given block group * @@ -305,7 +305,7 @@ err_out: * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * -read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc * desc; struct buffer_head * bh = NULL; @@ -693,7 +693,7 @@ do_more: count -= overflow; } brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, block_group); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; desc = ext4_get_group_desc (sb, block_group, &gd_bh); @@ -1733,7 +1733,7 @@ retry_alloc: my_rsv = NULL; if (free_blocks > 0) { - bitmap_bh = read_block_bitmap(sb, group_no); + bitmap_bh = ext4_read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, @@ -1769,7 +1769,7 @@ retry_alloc: continue; brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, group_no); + bitmap_bh = ext4_read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; /* @@ -1985,7 +1985,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) continue; desc_count += le16_to_cpu(gdp->bg_free_blocks_count); brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, i); + bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; diff --git a/fs/ext4/group.h b/fs/ext4/group.h index 7eb0604e7ee..c2c0a8d06d0 100644 --- a/fs/ext4/group.h +++ b/fs/ext4/group.h @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); -struct buffer_head *read_block_bitmap(struct super_block *sb, +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 11cafe14aa2..b30cc79b9fc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -600,7 +600,7 @@ got: /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = read_block_bitmap(sb, group); + struct buffer_head *block_bh = 
ext4_read_block_bitmap(sb, group); BUFFER_TRACE(block_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, block_bh); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6d69dd92aad..21ee6d42ee7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2326,7 +2326,7 @@ static int ext4_mb_init_backend(struct super_block *sb) meta_group_info[j]->bb_bitmap = kmalloc(sb->s_blocksize, GFP_KERNEL); BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = read_block_bitmap(sb, i); + bh = ext4_read_block_bitmap(sb, i); BUG_ON(bh == NULL); memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, sb->s_blocksize); @@ -2769,7 +2769,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, err = -EIO; - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; @@ -3589,7 +3589,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, if (list_empty(&grp->bb_prealloc_list)) return 0; - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -3763,7 +3763,7 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); BUG_ON(err != 0); /* error handling here */ - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -4262,7 +4262,7 @@ do_more: overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - bitmap_bh = read_block_bitmap(sb, block_group); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; gdp = ext4_get_group_desc(sb, block_group, &gd_bh); -- cgit v1.2.3-70-g09d2 From 7ad72ca60b6be3c79f90a81ce5883e12a2c6e667 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Remove unused variable from ext4_show_options Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 02bf2434397..588cfb40864 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -671,7 +671,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) unsigned long def_mount_opts; struct super_block *sb = vfs->mnt_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - journal_t *journal = sbi->s_journal; struct ext4_super_block *es = sbi->s_es; def_mount_opts = le32_to_cpu(es->s_default_mount_opts); -- cgit v1.2.3-70-g09d2 From f795e1407343ebe6597653f4a6a7f770676e84c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: fix build failure if DX_DEBUG is enabled ext4_next_entry() is used by the debugging function dx_show_leaf(), so it must be defined before that function. 
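A small sketch of the underlying C rule, using hypothetical names and a hypothetical config symbol: an inline helper must be defined (or at least declared) before the point where it is called, and when the only caller is compiled conditionally the breakage only shows up with that option enabled, which is why the helper is moved above dx_show_leaf().

/* Defined first, so any later caller sees a proper declaration. */
static inline int next_val(int x)
{
        return x + 1;
}

#ifdef SOME_DEBUG                       /* hypothetical config symbol */
/* Debug-only caller: compiles only when SOME_DEBUG is set, so a
 * missing declaration of next_val() would only break such builds. */
static void dump_values(void)
{
        int v = next_val(1);

        (void)v;
}
#endif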
Signed-off-by: Li Zefan Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 384f1222602..c7bf01261c7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -182,6 +182,16 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len)); +} + /* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. @@ -553,15 +563,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, } -/* - * p is at least 6 bytes before the end of page - */ -static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) -{ - return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len)); -} - /* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded -- cgit v1.2.3-70-g09d2 From cfbe7e4f5e4a0e1fc2ff23b167bfb3fa992f623d Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Sun, 13 Jul 2008 21:03:31 -0400 Subject: ext4: error proc entry creation when the fs/ext4 is not correctly created When the directory fs/ext4 is not correctly created under proc, the entry under this directory should not be created. Signed-off-by: Shen Feng Signed-off-by: Andrew Morton Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21ee6d42ee7..771a1d60852 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2661,6 +2661,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb) struct proc_dir_entry *proc; char devname[64]; + if (proc_root_ext4 == NULL) { + sbi->s_mb_proc = NULL; + return -EINVAL; + } bdevname(sb->s_bdev, devname); sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); -- cgit v1.2.3-70-g09d2 From 7e5a8cdd843b7af8d6d38a9bf96306145edb66e0 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Sun, 13 Jul 2008 21:03:31 -0400 Subject: ext4: fix error processing in mb_free_blocks The error processing of the return value of mb_free_blocks is meanless because it only returns 0. 
This fix includes - make mb_free_blocks return void - remove the error processing part in callers - unlock group before calling ext4_error in mb_free_blocks Signed-off-by: Shen Feng Cc: Mingming Cao Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 771a1d60852..b882868f466 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1051,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) } } -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int block = 0; @@ -1091,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr += block; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - + ext4_unlock_group(sb, e4b->bd_group); ext4_error(sb, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %lu)\n", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); + ext4_lock_group(sb, e4b->bd_group); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1133,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, } while (1); } mb_check_buddy(e4b); - - return 0; } static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, @@ -2570,8 +2569,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) ext4_lock_group(sb, md->group); for (i = 0; i < md->num; i++) { mb_debug(" %u", md->blocks[i]); - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); - BUG_ON(err != 0); + mb_free_blocks(NULL, &e4b, md->blocks[i], 1); } mb_debug("\n"); ext4_unlock_group(sb, md->group); @@ -4333,10 +4331,9 @@ do_more: ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); } else { ext4_lock_group(sb, block_group); - err = mb_free_blocks(inode, &e4b, bit, count); + mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); ext4_unlock_group(sb, block_group); - BUG_ON(err != 0); } spin_lock(sb_bgl_lock(sbi, block_group)); -- cgit v1.2.3-70-g09d2 From 4db9c54a53135b7c1c1f403f1aeaf9fc0d7738b8 Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Sun, 13 Jul 2008 21:03:29 -0400 Subject: ext4: replace __FUNCTION__ occurrences __FUNCTION__ is gcc-specific, use __func__ instead Signed-off-by: Stoyan Gaydarov Cc: Theodore Ts'o Cc: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/ext4_jbd2.h | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ea8e4bc097c..109c7d4c19a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -45,7 +45,7 @@ #define ext4_debug(f, a...) 
\ do { \ printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ - __FILE__, __LINE__, __FUNCTION__); \ + __FILE__, __LINE__, __func__); \ printk (KERN_DEBUG f, ## a); \ } while (0) #else @@ -1163,7 +1163,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ - __ext4_std_error((sb), __FUNCTION__, (errno)); \ + __ext4_std_error((sb), __func__, (errno)); \ } while (0) /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 9255a7d28b2..d0aa9ee20f8 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -142,17 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where, handle_t *handle, struct buffer_head *bh); #define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_undo_access(__func__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_write_access(__func__, (handle), (bh)) #define ext4_journal_revoke(handle, blocknr, bh) \ - __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) + __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) #define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_create_access(__func__, (handle), (bh)) #define ext4_journal_dirty_metadata(handle, bh) \ - __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) + __ext4_journal_dirty_metadata(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ - __ext4_journal_forget(__FUNCTION__, (handle), (bh)) + __ext4_journal_forget(__func__, (handle), (bh)) int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); @@ -165,7 +165,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) } #define ext4_journal_stop(handle) \ - __ext4_journal_stop(__FUNCTION__, (handle)) + __ext4_journal_stop(__func__, (handle)) static inline handle_t *ext4_journal_current_handle(void) { -- cgit v1.2.3-70-g09d2 From 736603ab297506f4396cb5af592004499950fcfd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Add commit time into the commit block Carlo Wood has demonstrated that it's possible to recover deleted files from the journal. Something that will make this easier is if we can put the time of the commit into commit block. 
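As a rough sketch of how the new fields could be consumed (the helper below is hypothetical and not part of the patch, and it ignores time_t truncation on 32-bit hosts), a recovery or debugging tool reading a commit block would decode the big-endian values back into a timespec:

#include <linux/jbd2.h>
#include <linux/time.h>

static struct timespec jbd2_commit_time(const struct commit_header *h)
{
        struct timespec ts;

        ts.tv_sec  = be64_to_cpu(h->h_commit_sec);   /* seconds, big endian on disk */
        ts.tv_nsec = be32_to_cpu(h->h_commit_nsec);  /* nanoseconds */
        return ts;
}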
Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 +++ include/linux/jbd2.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7cee..92b6ac3df8a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -112,6 +112,7 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +127,8 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d147f0f9036..ec9cadf5822 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -168,6 +168,8 @@ struct commit_header { unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; + __be64 h_commit_sec; + __be32 h_commit_nsec; }; /* -- cgit v1.2.3-70-g09d2 From 772cb7c83ba256a11c7bf99a11bef3858d23767c Mon Sep 17 00:00:00 2001 From: "Jose R. Santos" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: New inode allocation for FLEX_BG meta-data groups. This patch mostly controls the way inode are allocated in order to make ialloc aware of flex_bg block group grouping. It achieves this by bypassing the Orlov allocator when block group meta-data are packed toghether through mke2fs. Since the impact on the block allocator is minimal, this patch should have little or no effect on other block allocation algorithms. By controlling the inode allocation, it can basically control where the initial search for new block begins and thus indirectly manipulate the block allocator. This allocator favors data and meta-data locality so the disk will gradually be filled from block group zero upward. This helps improve performance by reducing seek time. Since the group of inode tables within one flex_bg are treated as one giant inode table, uninitialized block groups would not need to partially initialize as many inode table as with Orlov which would help fsck time as the filesystem usage goes up. Signed-off-by: Jose R. 
Santos Signed-off-by: Valerie Clement Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 14 ++++++++ fs/ext4/ext4.h | 25 ++++++++++++++- fs/ext4/ext4_sb.h | 3 ++ fs/ext4/ialloc.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/mballoc.c | 15 +++++++++ fs/ext4/super.c | 57 +++++++++++++++++++++++++++++++++ 6 files changed, 209 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index ba411233cc2..0b2b7549ac6 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -809,6 +809,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_journal_dirty_metadata(handle, bitmap_bh); @@ -1883,6 +1890,13 @@ allocated: spin_unlock(sb_bgl_lock(sbi, group_no)); percpu_counter_sub(&sbi->s_freeblocks_counter, num); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group_no); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= num; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); err = ext4_journal_dirty_metadata(handle, gdp_bh); if (!fatal) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 109c7d4c19a..0bfeae18f1a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -170,6 +170,15 @@ struct ext4_group_desc __u32 bg_reserved2[3]; }; +/* + * Structure of a flex block group info + */ + +struct flex_groups { + __u32 free_inodes; + __u32 free_blocks; +}; + #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ @@ -647,7 +656,10 @@ struct ext4_super_block { __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u32 s_reserved[163]; /* Padding to the end of the block */ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -1160,6 +1172,17 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, } +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 4de9a75ca6a..6300226d553 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -143,6 +143,9 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; }; #endif /* _EXT4_SB */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b30cc79b9fc..8b0a10acd70 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, 
struct inode * inode) struct ext4_super_block * es; struct ext4_sb_info *sbi; int fatal = 0, err; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk ("ext4_free_inode: inode has count=%d\n", @@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } } BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); err = ext4_journal_dirty_metadata(handle, bh2); @@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, return ret; } +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct buffer_head *bh; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = sbi->s_groups_count; + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks = flex_group[best_flex].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (flex_group[best_flex].free_inodes && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = flex_group[i].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + flex_group[i].free_inodes) { + best_flex = i; + goto found_flexbg; + } + + if (best_flex < 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) + best_flex = i; + } + + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, &bh); + if (le16_to_cpu(desc->bg_free_inodes_count)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + /* * Orlov's allocator for directories. 
* @@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + ext4_group_t flex_group; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; + + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; + } + if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); @@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) } else ret2 = find_group_other(sb, dir, &group); +got_group: err = -ENOSPC; if (ret2 == -1) goto out; @@ -676,6 +765,13 @@ got: percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + inode->i_uid = current->fsuid; if (test_opt (sb, GRPID)) inode->i_gid = dir->i_gid; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b882868f466..5dcb826401b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2842,6 +2842,14 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + err = ext4_journal_dirty_metadata(handle, bitmap_bh); if (err) goto out_err; @@ -4342,6 +4350,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + ext4_mb_release_desc(&e4b); *freed += count; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 588cfb40864..b9ad3d85206 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -517,6 +517,7 @@ static void ext4_put_super (struct super_block * sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1442,6 +1443,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, return res; } +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *bh; + ext4_group_t flex_group_count; + ext4_group_t flex_group; + int groups_per_flex = 0; + __u64 block_bitmap = 0; + int i; + + if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / + groups_per_flex; + sbi->s_flex_groups = kmalloc(flex_group_count * + sizeof(struct flex_groups), 
GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + printk(KERN_ERR "EXT4-fs: not enough memory\n"); + goto failed; + } + memset(sbi->s_flex_groups, 0, flex_group_count * + sizeof(struct flex_groups)); + + gdp = ext4_get_group_desc(sb, 1, &bh); + block_bitmap = ext4_block_bitmap(sb, gdp) - 1; + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, &bh); + + flex_group = ext4_flex_group(sbi, i); + sbi->s_flex_groups[flex_group].free_inodes += + le16_to_cpu(gdp->bg_free_inodes_count); + sbi->s_flex_groups[flex_group].free_blocks += + le16_to_cpu(gdp->bg_free_blocks_count); + } + + return 1; +failed: + return 0; +} + __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, struct ext4_group_desc *gdp) { @@ -2137,6 +2186,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); goto failed_mount2; } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + printk(KERN_ERR + "EXT4-fs: unable to initialize " + "flex_bg meta info!\n"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); -- cgit v1.2.3-70-g09d2 From 530576bbf379fc45cfb34f246257d8526db44567 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Sun, 13 Jul 2008 21:06:39 -0400 Subject: jbd2: fix race between jbd2_journal_try_to_free_buffers() and jbd2 commit transaction journal_try_to_free_buffers() could race with jbd commit transaction when the later is holding the buffer reference while waiting for the data buffer to flush to disk. If the caller of journal_try_to_free_buffers() request tries hard to release the buffers, it will treat the failure as error and return back to the caller. We have seen the directo IO failed due to this race. Some of the caller of releasepage() also expecting the buffer to be dropped when passed with GFP_KERNEL mask to the releasepage()->journal_try_to_free_buffers(). With this patch, if the caller is passing the GFP_KERNEL to indicating this call could wait, in case of try_to_free_buffers() failed, let's waiting for journal_commit_transaction() to finish commit the current committing transaction , then try to free those buffers again with journal locked. Signed-off-by: Mingming Cao Reviewed-by: Badari Pulavarty Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6e006e6780..ba620c4493d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); * new transaction and we can't block without protecting against other * processes trying to touch the journal while it is in transition. * - * Called under j_state_lock */ static transaction_t * @@ -1656,12 +1655,43 @@ out: return; } +/* + * jbd2_journal_try_to_free_buffers() could race with + * jbd2_journal_commit_transaction(). The later might still hold the + * reference count to the buffers when inspecting them on + * t_syncdata_list or t_locked_list. + * + * jbd2_journal_try_to_free_buffers() will call this function to + * wait for the current transaction to finish syncing data buffers, before + * try to free that buffer. + * + * Called with journal->j_state_lock hold. 
+ */ +static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) +{ + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return; + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); +} /** * int jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. * * * For all the buffers on this page, @@ -1690,9 +1720,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success */ int jbd2_journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1708,7 +1740,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). + * jbd2_journal_remove_journal_head() and + * jbd2_journal_put_journal_head(). */ jh = jbd2_journal_grab_journal_head(bh); if (!jh) @@ -1721,7 +1754,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + + /* + * There are a number of places where jbd2_journal_try_to_free_buffers() + * could race with jbd2_journal_commit_transaction(), the later still + * holds the reference to the buffers to free while processing them. + * try_to_free_buffers() failed to free those buffers. Some of the + * caller of releasepage() request page buffers to be dropped, otherwise + * treat the fail-to-free as errors (such as generic_file_direct_IO()) + * + * So, if the caller of try_to_release_page() wants the synchronous + * behaviour(i.e make sure buffers are dropped upon return), + * let's wait for the current transaction to finish flush of + * dirty data buffers, then try to free those buffers again, + * with the journal locked. + */ + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { + jbd2_journal_wait_for_transaction_sync_data(journal); + ret = try_to_free_buffers(page); + } + busy: return ret; } -- cgit v1.2.3-70-g09d2 From f9a8ac99fd79eb961a8573ccf058439bc17b4d3a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: remove redundant code in ext4_fill_super() The previous sb_min_blocksize() has already set the block size. 
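For readers wondering why the dropped call was redundant, a simplified sketch of the helper's usual behaviour is shown below. This paraphrases the generic block-layer code from memory rather than quoting it, and the helper used to read the device's minimum block size varies by kernel version; the point is only that choosing the minimum block size already installs it on the superblock.

/* Simplified sketch of sb_min_blocksize(): it clamps the requested
 * size to the device's minimum and installs it itself, so a later
 * sb_set_blocksize() with the same value adds nothing. */
int sb_min_blocksize_sketch(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev); /* name varies by version */

	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}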
Signed-off-by: Li Zefan Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9ad3d85206..6c80f37433e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1899,11 +1899,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto out_fail; } - if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); - goto out_fail; - } - /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. -- cgit v1.2.3-70-g09d2 From 363d4251d4bd984c304e0989789f6494343660fd Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: remove quota allocation when ext4_mb_new_blocks fails Quota allocation is not removed when ext4_mb_new_blocks calls kmem_cache_alloc failed. Also make sure the allocation context is freed on the error path. Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5dcb826401b..cde232bdaed 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4058,8 +4058,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { + ar->len = 0; *errp = -ENOMEM; - return 0; + goto out1; } ext4_mb_poll_new_transaction(sb, handle); @@ -4067,7 +4068,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out; + goto out2; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; @@ -4115,11 +4116,12 @@ repeat: ext4_mb_release_context(ac); -out: +out2: + kmem_cache_free(ext4_ac_cachep, ac); +out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); - kmem_cache_free(ext4_ac_cachep, ac); return block; } static void ext4_mb_poll_new_transaction(struct super_block *sb, -- cgit v1.2.3-70-g09d2 From a379cd1d6bb00f9f5d2759d4a5621a884df5914e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Update i_disksize properly when allocating from fallocate area. When allocating unitialized space at the end of file which had been preallocated with the FALLOC_FL_KEEP_SIZE option, the file size is not updated at that time. But the later we are not updating the file size when writing to that preallocated space. These changes are for code correctness. This patch allows us to update the i_disksize at the write_end() callback of filesystem properly. 
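Conceptually the fix is just a reordering; a sketch of the intended sequence, using the helpers that appear in the hunk below rather than a verbatim copy of the function, is:

/* Sketch: only after the extent has actually been inserted do we
 * know the allocation succeeded, so extend the on-disk size there. */
newblock  = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
	EXT4_I(inode)->i_disksize = inode->i_size;   /* safe to extend now */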
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 47929c4e3da..f7a746b5b7b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2716,13 +2716,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: + if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = inode->i_size; + __set_bit(BH_New, &bh_result->b_state); /* Cache only when it is _not_ an uninitialized extent */ -- cgit v1.2.3-70-g09d2 From 787e0981fad97a5ca3d07c7afe115a7e345b2861 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: return error when calling ext4_ext_split failed ext4_ext_create_new_leaf must return error when its calling to ext4_ext_split failed. Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f7a746b5b7b..bc17ed74284 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -981,6 +981,8 @@ repeat: /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, path, newext, i); + if (err) + goto out; /* refill path */ ext4_ext_drop_refs(path); -- cgit v1.2.3-70-g09d2 From 1973adcba570c226de840299056e055a3614185e Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Make ext4_ext_find_extent fills ext_path completely When pos=0 or depth, the fields of ext4_ext_path is are not completely filled. This patch also removes some unnecessary code. Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bc17ed74284..3e966daaade 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -524,6 +524,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, alloc = 1; } path[0].p_hdr = eh; + path[0].p_bh = NULL; i = depth; /* walk through the tree */ @@ -552,12 +553,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } path[ppos].p_depth = i; - path[ppos].p_hdr = eh; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); -- cgit v1.2.3-70-g09d2 From 9102e4fa8016af8bf1a263df913ee8fdafd4dfb0 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Fix ext4_ext_journal_restart() to reflect errors up to the caller Fix ext4_ext_journal_restart() so it returns any errors reported by ext4_journal_extend() and ext4_journal_restart(). 
Signed-off-by: Shen Feng Reviewed-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e966daaade..17ea8bb12aa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_journal_restart(handle_t *handle, int needed) { int err; if (handle->h_buffer_credits > needed) - return handle; - if (!ext4_journal_extend(handle, needed)) - return handle; - err = ext4_journal_restart(handle, needed); - - return handle; + return 0; + err = ext4_journal_extend(handle, needed); + if (err) + return err; + return ext4_journal_restart(handle, needed); } /* @@ -1888,11 +1887,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); #endif - handle = ext4_ext_journal_restart(handle, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); + err = ext4_ext_journal_restart(handle, credits); + if (err) goto out; - } err = ext4_ext_get_access(handle, inode, path + depth); if (err) -- cgit v1.2.3-70-g09d2 From d9c769b769a8bcd70371c71797fc4e407b37ba75 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: cleanup never-used magic numbers from htree code dx_root_limit() will had some dead code which forced it to always return 20, and dx_node_limit to always return 22 for debugging purposes. Remove it. Acked-by: Andreas Dilger Signed-off-by: Li Zefan Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c7bf01261c7..387ad98350c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -241,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; - return 0? 20: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit (struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - return 0? 
22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* -- cgit v1.2.3-70-g09d2 From 7477827f6686ce29b6216d9eb26f6b9027742dcc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Fix sparse warning Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- fs/ext4/super.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0b2b7549ac6..6dcbec9b256 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, 0); + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); if (actual_group == block_group) return 1; return 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6c80f37433e..80f06159ee9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1858,8 +1858,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) } static int ext4_fill_super (struct super_block *sb, void *data, int silent) - __releases(kernel_sem) - __acquires(kernel_sem) + __releases(kernel_lock) + __acquires(kernel_lock) { struct buffer_head * bh; -- cgit v1.2.3-70-g09d2 From 6afd670713c9e7d5c5550e379dfedca8ffab4c90 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: fix ext4_init_block_bitmap() for metablock block group When meta_bg feature is enabled and s_first_meta_bg != 0, ext4_init_block_bitmap() miscalculates the number of block used by the group descriptor table (0 or 1 for metablock block group) This patch fixes this by using ext4_bg_num_gdb() Signed-off-by: Akinobu Mita Cc: Andrew Morton Cc: Stephen Tweedie Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" Acked-by: Andreas Dilger --- fs/ext4/balloc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6dcbec9b256..1327ac3a04d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - int group_rel = (block_group - - le32_to_cpu(sbi->s_es->s_first_meta_bg)) % - EXT4_DESC_PER_BLOCK(sb); - if (group_rel == 0 || group_rel == 1 || - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) - bit_max += 1; + bit_max += ext4_bg_num_gdb(sb, block_group); } if (block_group == sbi->s_groups_count - 1) { -- cgit v1.2.3-70-g09d2 From 7061eba75ceb0835ba61e7cbd757a6f9c1e4af92 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Use inode preallocation with -o noextents When mballoc is enabled, block allocation for old block-based files are allocated using mballoc allocator instead of old block-based allocator. The old ext3 block reservation is turned off when mballoc is turned on. However, the in-core preallocation is not enabled for block-based/ non-extent based file block allocation. This result in performance regression, as now we don't have "reservation" ore in-core preallocation to prevent interleaved fragmentation in multiple writes workload. This patch fix this by enable per inode in-core preallocation for non extent files when mballoc is used. 
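The mechanism is that block-mapped data allocations are now routed through mballoc with a filled-in allocation request, roughly as follows (a condensed sketch of the pattern added in the diff below):

struct ext4_allocation_request ar;

memset(&ar, 0, sizeof(ar));
ar.inode   = inode;
ar.goal    = goal;
ar.len     = *count;
ar.logical = iblock;     /* logical block: lets mballoc track per-inode
			    locality and keep in-core preallocations */
ar.flags   = S_ISREG(inode->i_mode) ? EXT4_MB_HINT_DATA : 0;

ret = ext4_mb_new_blocks(handle, &ar, errp);
*count = ar.len;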
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 36 ++++++++++++++++++++++++++-- fs/ext4/ext4.h | 7 ++++-- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 72 +++++++++++++++++++++++++++++++++++++++++-------------- fs/ext4/xattr.c | 2 +- 5 files changed, 95 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1327ac3a04d..816f1dbaeb3 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1923,7 +1923,7 @@ out: return 0; } -ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, int *errp) { struct ext4_allocation_request ar; @@ -1942,9 +1942,29 @@ ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, ret = ext4_mb_new_blocks(handle, &ar, errp); return ret; } +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + if (!test_opt(inode->i_sb, MBALLOC)) { + ret = ext4_new_blocks_old(handle, inode, goal, count, errp); + return ret; + } + + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = *count; + ret = ext4_mb_new_blocks(handle, &ar, errp); + *count = ar.len; + return ret; +} ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; @@ -1955,9 +1975,21 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, } memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.lleft = 0; + ar.pleft = 0; + ar.lright = 0; + ar.pright = 0; + ar.inode = inode; ar.goal = goal; ar.len = *count; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; ret = ext4_mb_new_blocks(handle, &ar, errp); *count = ar.len; return ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0bfeae18f1a..d44a7fb6a7b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -970,10 +970,13 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, int *errp); -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp); extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern void ext4_free_blocks (handle_t *handle, struct inode *inode, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 17ea8bb12aa..c729b3507b4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -187,7 +187,7 @@ ext4_ext_new_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, newblock; goal = 
ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_block(handle, inode, goal, err); + newblock = ext4_new_meta_block(handle, inode, goal, err); return newblock; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7cce96a6935..bc950562b5b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -508,11 +508,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { int target, i; - unsigned long count = 0; + unsigned long count = 0, blk_allocated = 0; int index = 0; ext4_fsblk_t current_block = 0; int ret = 0; @@ -525,12 +526,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, * the first direct block of this branch. That's the * minimum number of blocks need to allocate(required) */ - target = blks + indirect_blks; - - while (1) { + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_blocks(handle,inode,goal,&count,err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -540,16 +542,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, new_blocks[index++] = current_block++; count--; } - - if (count > 0) + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); break; + } } - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_blocks(handle, inode, iblock, + goal, &count, err); + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += count; + } +allocated: /* total number of blocks allocated for direct blocks */ - ret = count; + ret = blk_allocated; *err = 0; return ret; failed_out: @@ -584,8 +618,9 @@ failed_out: * as described above and return 0. 
*/ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -595,7 +630,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, ext4_fsblk_t new_blocks[4]; ext4_fsblk_t current_block; - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, *blks, new_blocks, &err); if (err) return err; @@ -855,8 +890,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ff08633f398..93c5fdcdad2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -810,7 +810,7 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_block(handle, inode, + ext4_fsblk_t block = ext4_new_meta_block(handle, inode, goal, &error); if (error) goto cleanup; -- cgit v1.2.3-70-g09d2 From 654b4908bc17a6318d18f3036fecc5155de92f55 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: cleanup block allocator Move the code related to block allocation to a single function and add helper funtions to differient allocation for data and meta data blocks Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 127 +++++++++++++++++++++++++++++++----------------------- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 10 +++-- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 2 +- 5 files changed, 84 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 816f1dbaeb3..48787e86d43 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1640,20 +1640,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) } /** - * ext4_new_blocks_old() -- core block(s) allocation function + * ext4_old_new_blocks() -- core block bitmap based block allocation function + * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: target number of blocks to allocate * @errp: error code * - * ext4_new_blocks uses a goal block to assist allocation. It tries to - * allocate block(s) from the block group contains the goal block first. If that - * fails, it will try to allocate block(s) from other block groups without - * any specific goal block. + * ext4_old_new_blocks uses a goal block to assist allocation and look up + * the block bitmap directly to do block allocation. It tries to + * allocate block(s) from the block group contains the goal block first. If + * that fails, it will try to allocate block(s) from other block groups + * without any specific goal block. 
+ * + * This function is called when -o nomballoc mount option is enabled * */ -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, +ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { struct buffer_head *bitmap_bh = NULL; @@ -1923,78 +1927,95 @@ out: return 0; } -ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) -{ - struct ext4_allocation_request ar; - ext4_fsblk_t ret; - - if (!test_opt(inode->i_sb, MBALLOC)) { - unsigned long count = 1; - ret = ext4_new_blocks_old(handle, inode, goal, &count, errp); - return ret; - } +#define EXT4_META_BLOCK 0x1 - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = 1; - ret = ext4_mb_new_blocks(handle, &ar, errp); - return ret; -} -ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) -{ - struct ext4_allocation_request ar; - ext4_fsblk_t ret; - - if (!test_opt(inode->i_sb, MBALLOC)) { - ret = ext4_new_blocks_old(handle, inode, goal, count, errp); - return ret; - } - - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = *count; - ret = ext4_mb_new_blocks(handle, &ar, errp); - *count = ar.len; - return ret; -} - -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, +static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp) + unsigned long *count, int *errp, int flags) { struct ext4_allocation_request ar; ext4_fsblk_t ret; if (!test_opt(inode->i_sb, MBALLOC)) { - ret = ext4_new_blocks_old(handle, inode, goal, count, errp); - return ret; + return ext4_old_new_blocks(handle, inode, goal, count, errp); } memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ - ar.lleft = 0; - ar.pleft = 0; - ar.lright = 0; - ar.pright = 0; ar.inode = inode; ar.goal = goal; ar.len = *count; ar.logical = iblock; - if (S_ISREG(inode->i_mode)) + + if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) + /* enable in-core preallocation for data block allocation */ ar.flags = EXT4_MB_HINT_DATA; else /* disable in-core preallocation for non-regular files */ ar.flags = 0; + ret = ext4_mb_new_blocks(handle, &ar, errp); *count = ar.len; return ret; } +/* + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @errp: error code + * + * Return allocated block number on success + */ +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + return do_blk_alloc(handle, inode, 0, goal, + &count, errp, EXT4_META_BLOCK); +} + +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp) +{ + return do_blk_alloc(handle, inode, 0, goal, + count, errp, EXT4_META_BLOCK); +} + +/* + * ext4_new_blocks() -- allocate data blocks + * + * @handle: handle to this 
transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ + +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp) +{ + return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); +} /** * ext4_count_free_blocks() -- count filesystem free blocks diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d44a7fb6a7b..e622ade69d5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -977,7 +977,7 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern void ext4_free_blocks (handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c729b3507b4..555155d239d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -179,8 +179,11 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, return bg_start + colour + block; } +/* + * Allocation for a meta data block + */ static ext4_fsblk_t -ext4_ext_new_block(handle_t *handle, struct inode *inode, +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err) { @@ -690,7 +693,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* allocate all needed blocks */ ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -886,7 +890,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); if (newblock == 0) return err; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bc950562b5b..c5d506dce8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -561,7 +561,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, goto allocated; /* Now allocate data blocks */ count = target; - /* allocating blocks for indirect blocks and direct blocks */ + /* allocating blocks for data blocks */ current_block = ext4_new_blocks(handle, inode, iblock, goal, &count, err); if (*err && (target == blks)) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cde232bdaed..816ba8cce79 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4041,7 +4041,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, sbi = EXT4_SB(sb); if (!test_opt(sb, MBALLOC)) { - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, + block = ext4_old_new_blocks(handle, ar->inode, ar->goal, &(ar->len), errp); return block; } -- cgit v1.2.3-70-g09d2 From d755fb384250d6bd7fd18a0930e71965acc8e72e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: 
call blkdev_issue_flush on fsync To ensure that bits are truly on-disk after an fsync, we should call blkdev_issue_flush if barriers are supported. Inspired by an old thread on barriers, by reiserfs & xfs which do the same, and by a patch SuSE ships with their kernel Signed-off-by: Eric Sandeen Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/fsync.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 1c8ba48d4f8..a45c3737ad3 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" @@ -45,6 +46,7 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) .nr_to_write = 0, /* sys_fsync did this */ }; ret = sync_inode(inode, &wbc); + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); } out: return ret; -- cgit v1.2.3-70-g09d2 From 07031431072ece801d53d2c03d5e5bb21f4f64a4 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: mballoc avoid use root reserved blocks for non root allocation mballoc allocation missed check for blocks reserved for root users. Add ext4_has_free_blocks() check before allocation. Also modified ext4_has_free_blocks() to support multiple block allocation request. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 51 +++++++++++++++++++++++++++++++++------------------ fs/ext4/ext4.h | 2 ++ fs/ext4/mballoc.c | 7 ++++++- 3 files changed, 41 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 48787e86d43..25f63d8c1b3 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1599,23 +1599,35 @@ out: /** * ext4_has_free_blocks() - * @sbi: in-core super block structure. + * @sbi: in-core super block structure. + * @nblocks: number of neeed blocks * - * Check if filesystem has at least 1 free block available for allocation. + * Check if filesystem has free blocks available for allocation. 
+ * Return the number of blocks avaible for allocation for this request + * On success, return nblocks */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi) +ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks) { - ext4_fsblk_t free_blocks, root_blocks; + ext4_fsblk_t free_blocks; + ext4_fsblk_t root_blocks = 0; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = ext4_r_blocks_count(sbi->s_es); - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + + if (!capable(CAP_SYS_RESOURCE) && sbi->s_resuid != current->fsuid && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { - return 0; - } - return 1; -} + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); +#ifdef CONFIG_SMP + if (free_blocks - root_blocks < FBC_BATCH) + free_blocks = + percpu_counter_sum_positive(&sbi->s_freeblocks_counter); +#endif + if (free_blocks - root_blocks < nblocks) + return free_blocks - root_blocks; + return nblocks; + } + /** * ext4_should_retry_alloc() @@ -1631,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi) */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1681,13 +1693,21 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_group_t ngroups; unsigned long num = *count; - *errp = -ENOSPC; sb = inode->i_sb; if (!sb) { + *errp = -ENODEV; printk("ext4_new_block: nonexistent device"); return 0; } + sbi = EXT4_SB(sb); + *count = ext4_has_free_blocks(sbi, *count); + if (*count == 0) { + *errp = -ENOSPC; + return 0; /*return with ENOSPC error */ + } + num = *count; + /* * Check quota for allocation of this block. */ @@ -1711,11 +1731,6 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext4_has_free_blocks(sbi)) { - *errp = -ENOSPC; - goto out; - } - /* * First, test whether the goal block is free. 
*/ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e622ade69d5..2c4b48519c8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -979,6 +979,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, unsigned long *count, int *errp); extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks); extern void ext4_free_blocks (handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 816ba8cce79..1666ac184e3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4045,6 +4045,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, &(ar->len), errp); return block; } + ar->len = ext4_has_free_blocks(sbi, ar->len); + + if (ar->len == 0) { + *errp = -ENOSPC; + return 0; + } while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; @@ -4073,7 +4079,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); repeat: -- cgit v1.2.3-70-g09d2 From 47b4a50bebfd34b5e1fa2a9c673c8f31fa231cc1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Set journal pointer to NULL when journal is released Set sbi->s_journal to NULL after we call journal_destroy(). This will be later needed because after journal_destroy() is called, ext4_clear_inode() can still be called for some inodes (e.g. root inode) and we'll need to detect there that journal doesn't exists anymore. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80f06159ee9..1b330cd71ca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb) ext4_ext_release(sb); ext4_xattr_put_super(sb); jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -2423,6 +2424,7 @@ cantfind_ext4: failed_mount4: jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); -- cgit v1.2.3-70-g09d2 From 953e622b601f58b7cc0f29fe644457fa40a18456 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: use atomic functions to set bh_state Use the BUFFER_FNS functions (set_buffer_foo) to set buffer head state atomically instead of nonatomic __set_bit(). 
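For background, the set_buffer_foo() helpers come from the BUFFER_FNS() macro family in include/linux/buffer_head.h; a trimmed sketch of what it expands to (omitting the test-and-set/clear variants) is:

/* Trimmed sketch: the generated setter uses the atomic set_bit(),
 * which is what makes it safe where __set_bit() was not. */
#define BUFFER_FNS_SKETCH(bit, name)					\
static inline void set_buffer_##name(struct buffer_head *bh)		\
{									\
	set_bit(BH_##bit, &(bh)->b_state);				\
}									\
static inline void clear_buffer_##name(struct buffer_head *bh)		\
{									\
	clear_bit(BH_##bit, &(bh)->b_state);				\
}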
Signed-off-by: Eric Sandeen Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 555155d239d..bb36a28f8ff 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2622,8 +2622,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ if (allocated > max_blocks) allocated = max_blocks; - /* mark the buffer unwritten */ - __set_bit(BH_Unwritten, &bh_result->b_state); + set_buffer_unwritten(bh_result); goto out2; } @@ -2729,7 +2728,7 @@ outnew: if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = inode->i_size; - __set_bit(BH_New, &bh_result->b_state); + set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ if (create != EXT4_CREATE_UNINITIALIZED_EXT) @@ -2739,7 +2738,7 @@ out: if (allocated > max_blocks) allocated = max_blocks; ext4_ext_show_leaf(inode, path); - __set_bit(BH_Mapped, &bh_result->b_state); + set_buffer_mapped(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; out2: -- cgit v1.2.3-70-g09d2 From 5f21b0e642d7bf6fe4434c9ba12bc9cb96b17cf7 Mon Sep 17 00:00:00 2001 From: Frederic Bohe Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: fix online resize with mballoc Update group infos when updating a group's descriptor. Add group infos when adding a group's descriptor. Refresh cache pages used by mb_alloc when changes occur. This will probably need modifications when META_BG resizing will be allowed. Signed-off-by: Frederic Bohe Signed-off-by: Mingming Cao --- fs/ext4/ext4.h | 4 + fs/ext4/mballoc.c | 234 +++++++++++++++++++++++++++++++++++++++++------------- fs/ext4/resize.c | 52 +++++++++++- 3 files changed, 234 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2c4b48519c8..64edb09c481 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1033,6 +1033,10 @@ extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); +extern int ext4_mb_add_more_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_mb_update_group_info(struct ext4_group_info *grp, + ext4_grpblk_t add); /* inode.c */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1666ac184e3..8d254ca83d9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2236,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) #define ext4_mb_history_init(sb) #endif + +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i, len; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + + /* + * First check if this group is the first of a reserved block. 
+ * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + /* + * calculate needed size. if change bb_counters size, + * don't forget about ext4_mb_generate_buddy() + */ + len = offsetof(typeof(**meta_group_info), + bb_counters[sb->s_blocksize_bits + 2]); + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kzalloc(len, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +/* + * Add a group to the existing groups. + * This function is used for online resize + */ +int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + int err; + + /* Add group based on group descriptor*/ + err = ext4_mb_add_groupinfo(sb, group, desc); + if (err) + return err; + + /* + * Cache pages containing dynamic mb_alloc datas (buddy and bitmap + * datas) are set not up to date so that they will be re-initilaized + * during the next call to ext4_mb_load_buddy + */ + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + return 0; +} + +/* + * Update an existing group. 
+ * This function is used for online resize + */ +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) +{ + grp->bb_free += add; +} + static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t i; - int j, len, metalen; + int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); - int num_meta_group_infos = - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> - EXT4_DESC_PER_BLOCK_BITS(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; struct ext4_group_info **meta_group_info; + struct ext4_group_desc *desc; + + /* This is the number of blocks used by GDT */ + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + /* + * This is the total number of blocks used by GDT including + * the number of reserved blocks for GDT. + * The s_group_info array is allocated with this value + * to allow a clean online resize without a complex + * manipulation of pointer. + * The drawback is the unused memory when no resize + * occurs but it's very low in terms of pages + * (see comments below) + * Need to handle this properly when META_BG resizing is allowed + */ + num_meta_group_infos_max = num_meta_group_infos + + le16_to_cpu(es->s_reserved_gdt_blocks); + + /* + * array_size is the size of s_group_info array. We round it + * to the next power of two because this approximation is done + * internally by kmalloc so we can have some more memory + * for free here (e.g. may be used for META_BG resize). + */ + array_size = 1; + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * - num_meta_group_infos, GFP_KERNEL); + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); return -ENOMEM; @@ -2277,62 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_group_info[i] = meta_group_info; } - /* - * calculate needed size. 
if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = sizeof(struct ext4_group_info); - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *desc; - - meta_group_info = - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[j] = kzalloc(len, GFP_KERNEL); - if (meta_group_info[j] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - goto err_freebuddy; - } desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR "EXT4-fs: can't read descriptor %lu\n", i); - i++; goto err_freebuddy; } - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[j]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[j]->bb_free = - ext4_free_blocks_after_init(sb, i, desc); - } else { - meta_group_info[j]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); - } - - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[j]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = ext4_read_block_bitmap(sb, i); - BUG_ON(bh == NULL); - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; } return 0; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9ff7b1c0423..f000fbe2cd9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -865,6 +865,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + if (test_opt(sb, MBALLOC)) { + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + } /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, @@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, handle_t *handle; int err; unsigned long freed_blocks; + ext4_group_t group; + struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } /* Handle the remaining blocks in the last group only. */ - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { ext4_warning(sb, __func__, @@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) goto exit_put; + + /* + * Mark mballoc pages as not up to date so that they will be updated + * next time they are loaded by ext4_mb_load_buddy. 
+ */ + if (test_opt(sb, MBALLOC)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Get the info on the last group */ + grp = ext4_get_group_info(sb, group); + + /* Update free blocks in group info */ + ext4_mb_update_group_info(grp, add); + } + if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); -- cgit v1.2.3-70-g09d2 From 2e9ee850355593e311d9a26542290fe51e152f74 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Use page_mkwrite vma_operations to get mmap write notification. We would like to get notified when we are doing a write on mmap section. This is needed with respect to preallocated area. We split the preallocated area into initialzed extent and uninitialzed extent in the call back. This let us handle ENOSPC better. Otherwise we get ENOSPC in the writepage and that would result in data loss. The changes are also needed to handle ENOSPC when writing to an mmap section of files with holes. Acked-by: Jan Kara Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/file.c | 19 +++++++++++++++++- fs/ext4/inode.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 64edb09c481..98760d14e2c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1068,6 +1068,7 @@ extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4159be6366a..b9510ba66a2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -123,6 +123,23 @@ force_commit: return ret; } +static struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext4_file_mmap, .open = generic_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5d506dce8c..034fc544aa6 100644 --- 
a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3564,3 +3564,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + loff_t size; + unsigned long len; + int ret = -EINVAL; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. + */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + if (PageMappedToDisk(page)) + goto out_unlock; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* return if we have all the buffers mapped */ + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) + goto out_unlock; + } + /* + * OK, we need to fill the hole... Do write_begin write_end + * to do block allocation/reservation.We are not holding + * inode.i__mutex here. That allow * parallel write_begin, + * write_end call. lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, NULL); + if (ret < 0) + goto out_unlock; + ret = 0; +out_unlock: + up_read(&inode->i_alloc_sem); + return ret; +} -- cgit v1.2.3-70-g09d2 From c7d206b3379f7d6462e778b74f475c470ee3dcaf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: Move mark_inode_dirty() from under page lock in generic_write_end() There's no need to call mark_inode_dirty() under page lock in generic_write_end(). It unnecessarily makes hold time of page lock longer and more importantly it forces locking order of page lock and transaction start for journaling filesystems. Signed-off-by: Jan Kara Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 0f51c0f7c26..f4b033237a0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2061,6 +2061,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2073,12 +2074,21 @@ int generic_write_end(struct file *file, struct address_space *mapping, */ if (pos+copied > inode->i_size) { i_size_write(inode, pos+copied); - mark_inode_dirty(inode); + i_size_changed = 1; } unlock_page(page); page_cache_release(page); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. 
+ */ + if (i_size_changed) + mark_inode_dirty(inode); + return copied; } EXPORT_SYMBOL(generic_write_end); -- cgit v1.2.3-70-g09d2 From cf108bca465dde0c015f32dd453b99457d31c7c7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Invert the locking order of page_lock and transaction start This changes are needed to support data=ordered mode handling via inodes. This enables us to get rid of the journal heads and buffer heads for data buffers in the ordered mode. With the changes, during tranasaction commit we writeout the inode pages using the writepages()/writepage(). That implies we take page lock during transaction commit. This can cause a deadlock with the locking order page_lock -> jbd2_journal_start, since the jbd2_journal_start can wait for the journal_commit to happen and the journal_commit now needs to take the page lock. To avoid this dead lock reverse the locking order. Signed-off-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 4 +- fs/ext4/extents.c | 15 +-- fs/ext4/inode.c | 292 ++++++++++++++++++++++++++++-------------------------- 3 files changed, 155 insertions(+), 156 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 98760d14e2c..f65829bbe7a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1066,7 +1066,7 @@ extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_block_truncate_page(handle_t *handle, struct page *page, +extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); @@ -1225,7 +1225,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize); -extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bb36a28f8ff..b722bce7d66 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2749,7 +2749,7 @@ out2: return err ? 
err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2762,18 +2762,11 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) */ err = ext4_writepage_trans_blocks(inode) + 3; handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; - } - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 034fc544aa6..320acb6c35b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1239,19 +1239,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, to = from + len; retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); ret = PTR_ERR(handle); goto out; } + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1261,8 +1262,8 @@ retry: } if (ret) { - ext4_journal_stop(handle); unlock_page(page); + ext4_journal_stop(handle); page_cache_release(page); } @@ -1290,29 +1291,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) return ext4_journal_dirty_metadata(handle, bh); } -/* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; -} - /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). @@ -1326,7 +1304,7 @@ static int ext4_ordered_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; unsigned from, to; int ret = 0, ret2; @@ -1347,7 +1325,7 @@ static int ext4_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1356,8 +1334,6 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? 
ret : copied; } @@ -1368,7 +1344,7 @@ static int ext4_writeback_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; loff_t new_i_size; @@ -1376,7 +1352,7 @@ static int ext4_writeback_write_end(struct file *file, if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1385,8 +1361,6 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1425,10 +1399,10 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } + unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); page_cache_release(page); return ret ? ret : copied; @@ -1505,12 +1479,16 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) return 0; } +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh) || buffer_delay(bh); +} + /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext4_get_block() should be journalled - * along with the data so we don't crash and then get metadata which - * refers to old data. + * Note that we don't need to start a transaction unless we're journaling + * data because we should have holes filled from ext4_page_mkwrite(). If + * we are journaling data, we cannot start transaction directly because + * transaction start ranks above page lock so we have to do some magic... * * In all journalling modes block_write_full_page() will start the I/O. * @@ -1554,10 +1532,8 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) * disastrous. Any write() or metadata operation will sync the fs for * us. * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ -static int ext4_ordered_writepage(struct page *page, +static int __ext4_ordered_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -1566,22 +1542,6 @@ static int ext4_ordered_writepage(struct page *page, int ret = 0; int err; - J_ASSERT(PageLocked(page)); - - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext4_journal_current_handle()) - goto out_fail; - - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } - if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1605,54 +1565,135 @@ static int ext4_ordered_writepage(struct page *page, * and generally junk. 
*/ if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + handle = ext4_journal_start(inode, + ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_put; + } + + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, jbd2_journal_dirty_data_fn); + err = ext4_journal_stop(handle); if (!ret) ret = err; } - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; +out_put: + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bput_one); return ret; +} + +static int ext4_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; + + J_ASSERT(PageLocked(page)); + J_ASSERT(page_has_buffers(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + + /* + * We give up here if we're reentered, because it might be for a + * different filesystem. + */ + if (!ext4_journal_current_handle()) + return __ext4_ordered_writepage(page, wbc); -out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); - return ret; + return 0; } +static int __ext4_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + + if (test_opt(inode->i_sb, NOBH)) + return nobh_writepage(page, ext4_get_block, wbc); + else + return block_write_full_page(page, ext4_get_block, wbc); +} + + static int ext4_writeback_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; + + J_ASSERT(PageLocked(page)); + J_ASSERT(page_has_buffers(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + + if (!ext4_journal_current_handle()) + return __ext4_writeback_writepage(page, wbc); + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; int err; - if (ext4_journal_current_handle()) - goto out_fail; + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block); + if (ret != 0) + goto out_unlock; + + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_fail; + goto out; } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); - else - ret = block_write_full_page(page, ext4_get_block, wbc); + ret = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + err = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; err = 
ext4_journal_stop(handle); if (!ret) ret = err; - return ret; -out_fail: - redirty_page_for_writepage(wbc, page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + goto out; + +out_unlock: unlock_page(page); +out: return ret; } @@ -1660,59 +1701,40 @@ static int ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; + loff_t size = i_size_read(inode); + loff_t len; - if (ext4_journal_current_handle()) - goto no_write; + J_ASSERT(PageLocked(page)); + J_ASSERT(page_has_buffers(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); + if (ext4_journal_current_handle()) goto no_write; - } - if (!page_has_buffers(page) || PageChecked(page)) { + if (PageChecked(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_get_block); - if (ret != 0) { - ext4_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - unlock_page(page); + return __ext4_journalled_writepage(page, wbc); } else { /* * It may be a page full of checkpoint-mode buffers. We don't * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - ret = block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, ext4_get_block, wbc); } - err = ext4_journal_stop(handle); - if (!ret) - ret = err; -out: - return ret; - no_write: redirty_page_for_writepage(wbc, page); -out_unlock: unlock_page(page); - goto out; + return 0; } static int ext4_readpage(struct file *file, struct page *page) @@ -1909,7 +1931,7 @@ void ext4_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, struct page *page, +int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -1918,8 +1940,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + if (!page) + return -EINVAL; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2410,46 +2437,25 @@ void ext4_truncate(struct inode *inode) int n; ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; if (!ext4_can_truncate(inode)) return; - /* - * We have to lock the EOF page here, because lock_page() nests - * outside jbd2_journal_start(). 
- */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode, page); + ext4_ext_truncate(inode); return; } handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; /* AKPM: return what? */ - } last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) -- cgit v1.2.3-70-g09d2 From 9ddfc3dc75b5cc55ff3cfa586e962d252f1db9d3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Fix lock inversion in ext4_ext_truncate() We cannot call ext4_orphan_add() from under i_data_sem because that causes a lock ordering violation between i_data_sem and and the superblock lock. Updated with Aneesh's locking order fix Signed-off-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b722bce7d66..7844bbb2bac 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2768,6 +2768,9 @@ void ext4_ext_truncate(struct inode *inode) if (inode->i_size & (sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); + if (ext4_orphan_add(handle, inode)) + goto out_stop; + down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); @@ -2778,8 +2781,6 @@ void ext4_ext_truncate(struct inode *inode) * Probably we need not scan at all, * because page truncation is enough. */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; @@ -2796,6 +2797,7 @@ void ext4_ext_truncate(struct inode *inode) handle->h_sync = 1; out_stop: + up_write(&EXT4_I(inode)->i_data_sem); /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. @@ -2806,7 +2808,6 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - up_write(&EXT4_I(inode)->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); -- cgit v1.2.3-70-g09d2 From c851ed540173736e60d48b53b91a16ea5c903896 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Implement data=ordered mode handling via inodes This patch adds necessary framework into JBD2 to be able to track inodes with each transaction and write-out their dirty data during transaction commit time. This new ordered mode brings all sorts of advantages such as possibility to get rid of journal heads and buffer heads for data buffers in ordered mode, better ordering of writes on transaction commit, simplification of some JBD code, no more anonymous pages when truncate of data being committed happens. Also with this new ordered mode, delayed allocation on ordered mode is much simpler. 
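A hedged sketch (not part of the patch) of how a client filesystem is expected to drive the inode-list API added here; the "myfs" names are hypothetical, and ext4 is converted along these lines later in the series: initialize the jbd2_inode once per in-memory inode, file it on the running transaction whenever its data is dirtied under a handle, and release it when the inode is evicted.

struct myfs_inode_info {
        struct jbd2_inode jinode;       /* ordered-mode bookkeeping */
        struct inode      vfs_inode;
};

/* at inode allocation time */
static void myfs_init_jbd_inode(struct myfs_inode_info *mi)
{
        jbd2_journal_init_jbd_inode(&mi->jinode, &mi->vfs_inode);
}

/* with an open handle, after dirtying the inode's data */
static int myfs_order_data(handle_t *handle, struct myfs_inode_info *mi)
{
        return jbd2_journal_file_inode(handle, &mi->jinode);
}

/* from ->clear_inode(), before the in-memory inode goes away */
static void myfs_release_jbd_inode(journal_t *journal, struct myfs_inode_info *mi)
{
        jbd2_journal_release_jbd_inode(journal, &mi->jinode);
}
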
Signed-off-by: Jan Kara --- fs/jbd2/commit.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/jbd2/journal.c | 52 +++++++++++++++++++++++++++++ fs/jbd2/transaction.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 42 ++++++++++++++++++++++++ 4 files changed, 270 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 92b6ac3df8a..3ca107b5c86 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -355,6 +355,81 @@ write_out_data: journal_do_submit_data(wbuf, bufs); } +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawrite_range(mapping, 0, + i_size_read(jinode->i_vfs_inode)); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_inode_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { struct page *page = bh->b_page; @@ -529,6 +604,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ err = 0; journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); /* * Wait for all previously submitted IO to complete if commit @@ -760,6 +838,17 @@ start_journal_io: __jbd2_journal_abort_hard(journal); } + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. 
If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. + */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -880,6 +969,7 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a7..78cf7bd7f60 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -82,6 +82,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2194,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_unlock_bh_journal_head(bh); } +/* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. + */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + int writeout = 0; + + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + /* Do we need to wait for data writeback? */ + if (journal->j_committing_transaction == jinode->i_transaction) + writeout = 1; + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + /* * debugfs tunables */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ba620c4493d..98b596d2370 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -51,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. 
*/ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -2195,3 +2196,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) spin_unlock(&journal->j_list_lock); __brelse(bh); } + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + + if (is_handle_aborted(handle)) + return -EIO; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * This function must be called when inode is journaled in ordered mode + * before truncation happens. It starts writeout of truncated part in + * case it is in the committing transaction so that we stand to ordered + * mode consistency guarantees. + */ +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + journal_t *journal; + transaction_t *commit_trans; + int ret = 0; + + if (!inode->i_transaction && !inode->i_next_transaction) + goto out; + journal = inode->i_transaction->t_journal; + spin_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + spin_unlock(&journal->j_state_lock); + if (inode->i_transaction == commit_trans) { + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ec9cadf5822..622c3d8ca4e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -381,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) bit_spin_unlock(BH_JournalHead, &bh->b_state); } +/* Flags in jbd_inode->i_flags */ +#define __JI_COMMIT_RUNNING 0 +/* Commit of the inode data in progress. We use this flag to protect us from + * concurrent deletion of inode. 
We cannot use reference to inode for this + * since we cannot afford doing last iput() on behalf of kjournald + */ +#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) + +/** + * struct jbd_inode is the structure linking inodes in ordered mode + * present in a transaction so that we can sync them during commit. + */ +struct jbd2_inode { + /* Which transaction does this inode belong to? Either the running + * transaction or the committing one. [j_list_lock] */ + transaction_t *i_transaction; + + /* Pointer to the running transaction modifying inode's data in case + * there is already a committing transaction touching it. [j_list_lock] */ + transaction_t *i_next_transaction; + + /* List of inodes in the i_transaction [j_list_lock] */ + struct list_head i_list; + + /* VFS inode this inode belongs to [constant during the lifetime + * of the structure] */ + struct inode *i_vfs_inode; + + /* Flags of inode [j_list_lock] */ + unsigned int i_flags; +}; + struct jbd2_revoke_table_s; /** @@ -566,6 +598,12 @@ struct transaction_s */ struct journal_head *t_log_list; + /* + * List of inodes whose data we've modified in data=ordered mode. + * [j_list_lock] + */ + struct list_head t_inode_list; + /* * Protects info related to handles */ @@ -1046,6 +1084,10 @@ extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management -- cgit v1.2.3-70-g09d2 From 678aaf481496b01473b778685eca231d6784098b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Use new framework for data=ordered mode in JBD2 This patch makes ext4 use inode-based implementation of data=ordered mode in JBD2. It allows us to unify some data=ordered and data=writeback paths (especially writepage since we don't have to start a transaction anymore) and remove some buffer walking. Updated fix from Aneesh Kumar K.V to fix file system hang due to corrupt jinode values. 
Signed-off-by: Jan Kara Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_i.h | 1 + fs/ext4/ext4_jbd2.h | 7 ++- fs/ext4/inode.c | 158 ++++++++++++++++------------------------------------ fs/ext4/super.c | 5 +- 4 files changed, 59 insertions(+), 112 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index abf2744164e..c2903ef7215 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -150,6 +150,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; + struct jbd2_inode jinode; unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index d0aa9ee20f8..eb8bc3afe6e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -154,8 +154,6 @@ int __ext4_journal_dirty_metadata(const char *where, #define ext4_journal_forget(handle, bh) \ __ext4_journal_forget(__func__, (handle), (bh)) -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); - handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); @@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal) return jbd2_journal_force_commit(journal); } +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 320acb6c35b..7b9569179fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,6 +39,13 @@ #include "xattr.h" #include "acl.h" +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, + new_size); +} + /* * Test whether an inode is a fast symlink. */ @@ -181,6 +188,8 @@ void ext4_delete_inode (struct inode * inode) { handle_t *handle; + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1273,15 +1282,6 @@ out: return ret; } -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = jbd2_journal_dirty_data(handle, bh); - if (err) - ext4_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1311,8 +1311,7 @@ static int ext4_ordered_write_end(struct file *file, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext4_journal_dirty_data); + ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { /* @@ -1472,25 +1471,22 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - if (buffer_mapped(bh)) - return ext4_journal_dirty_data(handle, bh); - return 0; -} - static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) { return !buffer_mapped(bh) || buffer_delay(bh); } /* - * Note that we don't need to start a transaction unless we're journaling - * data because we should have holes filled from ext4_page_mkwrite(). If - * we are journaling data, we cannot start transaction directly because - * transaction start ranks above page lock so we have to do some magic... 
+ * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. * - * In all journalling modes block_write_full_page() will start the I/O. + * In all journaling modes block_write_full_page() will start the I/O. * * Problem: * @@ -1533,86 +1529,7 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) * us. * */ -static int __ext4_ordered_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - - ret = block_write_full_page(page, ext4_get_block, wbc); - - /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. - */ - - /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. - */ - if (ret == 0) { - handle = ext4_journal_start(inode, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_put; - } - - ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, jbd2_journal_dirty_data_fn); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - } -out_put: - walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, - bput_one); - return ret; -} - -static int ext4_ordered_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - J_ASSERT(PageLocked(page)); - J_ASSERT(page_has_buffers(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); - - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. 
- */ - if (!ext4_journal_current_handle()) - return __ext4_ordered_writepage(page, wbc); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - -static int __ext4_writeback_writepage(struct page *page, +static int __ext4_normal_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -1624,7 +1541,7 @@ static int __ext4_writeback_writepage(struct page *page, } -static int ext4_writeback_writepage(struct page *page, +static int ext4_normal_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -1641,7 +1558,7 @@ static int ext4_writeback_writepage(struct page *page, ext4_bh_unmapped_or_delay)); if (!ext4_journal_current_handle()) - return __ext4_writeback_writepage(page, wbc); + return __ext4_normal_writepage(page, wbc); redirty_page_for_writepage(wbc, page); unlock_page(page); @@ -1877,7 +1794,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_ordered_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -1891,7 +1808,7 @@ static const struct address_space_operations ext4_ordered_aops = { static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_writeback_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -2019,7 +1936,7 @@ int ext4_block_truncate_page(handle_t *handle, err = ext4_journal_dirty_metadata(handle, bh); } else { if (ext4_should_order_data(inode)) - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -3171,7 +3088,14 @@ int ext4_write_inode(struct inode *inode, int wait) * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * - * Called with inode->sem down. + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. 
*/ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { @@ -3237,6 +3161,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!error) error = rc; ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + goto err_out; + } + } } rc = inode_setattr(inode, attr); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1b330cd71ca..629d0fa27e3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -573,6 +573,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); return &ei->vfs_inode; } @@ -637,6 +638,8 @@ static void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode); } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) @@ -3378,7 +3381,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_journal_dirty_metadata(handle, bh); else { /* Always do at least ordered writes for quotas */ - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } brelse(bh); -- cgit v1.2.3-70-g09d2 From 87c89c232c8f7b3820c33c3b9bc803e9358027da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Remove data=ordered mode support using jbd buffer heads Signed-off-by: Jan Kara --- fs/jbd2/checkpoint.c | 1 - fs/jbd2/commit.c | 221 ++------------------------------------------------ fs/jbd2/journal.c | 1 - fs/jbd2/transaction.c | 217 ++----------------------------------------------- include/linux/jbd2.h | 29 ++----- 5 files changed, 21 insertions(+), 448 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022c..91389c8aee8 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3ca107b5c86..483183d15ed 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -37,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. 
These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -79,21 +79,6 @@ nope: __brelse(bh); } -/* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - /* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort @@ -199,162 +184,6 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) return ret; } -/* - * Wait for all submitted IO to complete. - */ -static int journal_wait_on_locked_list(journal_t *journal, - transaction_t *commit_transaction) -{ - int ret = 0; - struct journal_head *jh; - - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - ret = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } - return ret; - } - -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) -{ - int i; - - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); - } -} - -/* - * Submit all the data buffers to disk - */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; - - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. 
*/ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); -} - /* * Submit all the data buffers of inode associated with the transaction to * disk. @@ -602,24 +431,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); err = journal_submit_inode_data_buffers(journal, commit_transaction); - if (err) - jbd2_journal_abort(journal, err); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - if (err) jbd2_journal_abort(journal, err); @@ -627,16 +439,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 2\n"); - /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out @@ -655,6 +457,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -829,13 +632,6 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); } /* @@ -860,7 +656,7 @@ start_journal_io: so we incur less scheduling load. 
*/ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -919,7 +715,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -946,7 +742,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -966,9 +762,8 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); @@ -1090,7 +885,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 78cf7bd7f60..b26c6d9fe6a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); -EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 98b596d2370..4f7cadbb19f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -942,183 +942,6 @@ out: return err; } -/** - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. - * @handle: transaction - * @bh: bufferhead to mark - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - - if (is_handle_aborted(handle)) - return 0; - - jh = jbd2_journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. 
- * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls jbd2_journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. - * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { - JBUFFER_TRACE(jh, "unfile from commit"); - __jbd2_journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. 
*/ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __jbd2_journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; - JBUFFER_TRACE(jh, "file as data"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - jbd2_journal_put_journal_head(jh); - return 0; -} - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked. 
*/ @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) switch (jh->b_jlist) { case BJ_None: return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_del_buffer(list, jh); @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { - /* A written-back ordered data buffer */ - JBUFFER_TRACE(jh, "release data"); - __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); - } - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1878,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!buffer_jbd(bh)) goto zap_buffer_unlocked; + /* OK, we have data buffer in journaled mode */ spin_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1941,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); - if (jh->b_jlist == BJ_Locked) { - /* - * The buffer is on the committing transaction's locked - * list. We have the buffer locked, so I/O has - * completed. So we can nail the buffer now. - */ - may_free = __dispose_buffer(jh, transaction); - goto zap_buffer; - } /* * If it is committing, we simply cannot touch it. 
We * can remove it's next_transaction pointer from the @@ -2082,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; @@ -2104,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_add_buffer(list, jh); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 622c3d8ca4e..3dd20900709 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -542,24 +542,12 @@ struct transaction_s */ struct journal_head *t_reserved_list; - /* - * Doubly-linked circular list of all buffers under writeout during - * commit [j_list_lock] - */ - struct journal_head *t_locked_list; - /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock] */ struct journal_head *t_buffers; - /* - * Doubly-linked circular list of all data buffers still to be - * flushed before this transaction can be committed [j_list_lock] - */ - struct journal_head *t_sync_datalist; - /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) @@ -1044,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); @@ -1223,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal) /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ -#define BJ_SyncData 1 /* Normal data: flush before commit */ -#define BJ_Metadata 2 /* Normal journaled metadata */ -#define BJ_Forget 3 /* Buffer superseded by this transaction */ -#define BJ_IO 4 /* Buffer is for temporary IO use */ -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ -#define BJ_LogCtl 6 /* Buffer contains log descriptors */ -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */ -#define BJ_Locked 8 /* Locked for I/O during commit */ -#define BJ_Types 9 +#define BJ_Metadata 1 /* Normal journaled metadata */ +#define BJ_Forget 2 /* Buffer superseded by this transaction */ +#define BJ_IO 3 /* Buffer is for temporary IO use */ +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ +#define BJ_LogCtl 5 /* Buffer contains log descriptors */ +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */ +#define BJ_Types 7 extern int jbd_blocks_per_page(struct inode *inode); -- cgit v1.2.3-70-g09d2 From 29a814d2ee0e43c2980f33f91c1311ec06c0aa35 Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: add hooks for ext4's delayed allocation support Export mpage_bio_submit() and __mpage_writepage() for the benefit of ext4's delayed allocation support. 
Also change __block_write_full_page so that if buffers that have the BH_Delay flag set it will call get_block() to get the physical block allocated, just as in the !BH_Mapped case. Signed-off-by: Alex Tomas Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 7 +++++-- fs/mpage.c | 14 +++++--------- include/linux/mpage.h | 10 ++++++++++ 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index f4b033237a0..5fa1512cd9a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1774,7 +1776,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a..dbcc7af76a1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } +EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. 
*/ -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data) +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; @@ -651,6 +646,7 @@ out: mpd->bio = bio; return ret; } +EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 068a0c9946a..5c42821da2d 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,11 +11,21 @@ */ #ifdef CONFIG_BLOCK +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + struct writeback_control; +struct bio *mpage_bio_submit(int rw, struct bio *bio); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3-70-g09d2 From 64769240bd07f446f83660bb143bb609d8ab4910 Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Add delayed allocation support in data=writeback mode Updated with fixes from Mingming Cao to unlock and release the page from page cache if the delalloc write_begin failed, and properly handle preallocated blocks. Also added a fix to clear buffer_delay in block_write_full_page() after allocating a delayed buffer. Updated with fixes from Aneesh Kumar K.V to update i_disksize properly and to add bmap support for delayed allocation. Updated with a fix from Valerie Clement to avoid filesystem corruption when the filesystem is mounted with the delalloc option and blocksize < pagesize. Signed-off-by: Alex Tomas Signed-off-by: Mingming Cao Signed-off-by: Dave Kleikamp Signed-off-by: "Theodore Ts'o" Signed-off-by: Aneesh Kumar K.V --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 699 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/super.c | 6 +- 3 files changed, 700 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f65829bbe7a..ee9576dc0ba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -536,6 +536,7 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7b9569179fd..2bef4f879e4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,8 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } +static void ext4_invalidatepage(struct page *page, unsigned long offset); + /* * Test whether an inode is a fast symlink. 
*/ @@ -1407,6 +1410,669 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct buffer_head lbh; /* extent of blocks */ + unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; + struct writeback_control *wbc; +}; + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with __mpage_writepage() + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct mpage_data mpd_pp = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = mpd->get_block, + .use_writepage = 1, + }; + int ret = 0, err, nr_pages, i; + unsigned long index, end; + struct pagevec pvec; + + BUG_ON(mpd->next_page <= mpd->first_page); + + pagevec_init(&pvec, 0); + index = mpd->first_page; + end = mpd->next_page - 1; + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + err = __mpage_writepage(page, mpd->wbc, &mpd_pp); + + /* + * In error case, we have to continue because + * remaining pages are still locked + * XXX: unlock and re-dirty them? 
+ */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + if (mpd_pp.bio) + mpage_bio_submit(WRITE, mpd_pp.bio); + + return ret; +} + +/* + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers + * + * @mpd->inode - inode to walk through + * @exbh->b_blocknr - first block on a disk + * @exbh->b_size - amount of space in bytes + * @logical - first logical block to start assignment with + * + * the function goes through all passed space and put actual disk + * block numbers into buffer heads, dropping BH_Delay + */ +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, + struct buffer_head *exbh) +{ + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int blocks = exbh->b_size >> inode->i_blkbits; + sector_t pblock = exbh->b_blocknr, cur_logical; + struct buffer_head *head, *bh; + unsigned long index, end; + struct pagevec pvec; + int nr_pages, i; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(!page_has_buffers(page)); + + bh = page_buffers(page); + head = bh; + + /* skip blocks out of the range */ + do { + if (cur_logical >= logical) + break; + cur_logical++; + } while ((bh = bh->b_this_page) != head); + + do { + if (cur_logical >= logical + blocks) + break; + + if (buffer_delay(bh)) { + bh->b_blocknr = pblock; + clear_buffer_delay(bh); + } else if (buffer_mapped(bh)) { + BUG_ON(bh->b_blocknr != pblock); + } + + cur_logical++; + pblock++; + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + } +} + + +/* + * __unmap_underlying_blocks - just a helper function to unmap + * set of blocks described by @bh + */ +static inline void __unmap_underlying_blocks(struct inode *inode, + struct buffer_head *bh) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + int blocks, i; + + blocks = bh->b_size >> inode->i_blkbits; + for (i = 0; i < blocks; i++) + unmap_underlying_metadata(bdev, bh->b_blocknr + i); +} + +/* + * mpage_da_map_blocks - go through given space + * + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function + * + * The function skips space we know is already mapped to disk blocks. 
+ * + * The function ignores errors ->get_block() returns, thus real + * error handling is postponed to __mpage_writepage() + */ +static void mpage_da_map_blocks(struct mpage_da_data *mpd) +{ + struct buffer_head *lbh = &mpd->lbh; + int err = 0, remain = lbh->b_size; + sector_t next = lbh->b_blocknr; + struct buffer_head new; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return; + + while (remain) { + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = remain; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { + /* + * Rather than implement own error handling + * here, we just leave remaining blocks + * unallocated and try again with ->writepage() + */ + break; + } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, struct buffer_head *bh) +{ + struct buffer_head *lbh = &mpd->lbh; + sector_t next; + + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); + + /* + * First block in the extent + */ + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + return; + } + + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += bh->b_size; + return; + } + + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_blocks(mpd); + + /* + * Now start a new extent + */ + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + lbh->b_blocknr = logical; +} + +/* + * __mpage_da_writepage - finds extent of pages and blocks + * + * @page: page to consider + * @wbc: not used, we just follow rules + * @data: context + * + * The function finds extents of pages and scan them for all blocks. + */ +static int __mpage_da_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct mpage_da_data *mpd = data; + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head, fake; + sector_t logical; + + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. So, we map non-allocated blocks + * and start IO on them using __mpage_writepage() + */ + if (mpd->next_page != mpd->first_page) { + mpage_da_map_blocks(mpd); + mpage_da_submit_io(mpd); + } + + /* + * Start next extent of pages ... + */ + mpd->first_page = page->index; + + /* + * ... 
and blocks + */ + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + } + + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + /* + * There is no attached buffer heads yet (mmap?) + * we treat the page asfull of dirty blocks + */ + bh = &fake; + bh->b_size = PAGE_CACHE_SIZE; + bh->b_state = 0; + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + mpage_add_bh_to_extent(mpd, logical, bh); + } else { + /* + * Page with regular buffer heads, just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + if (buffer_dirty(bh)) + mpage_add_bh_to_extent(mpd, logical, bh); + logical++; + } while ((bh = bh->b_this_page) != head); + } + + return 0; +} + +/* + * mpage_da_writepages - walk the list of dirty pages of the given + * address space, allocates non-allocated blocks, maps newly-allocated + * blocks to existing bhs and issue IO them + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * In order to avoid duplication of logic that deals with partial pages, + * multiple bio per page, etc, we find non-allocated blocks, allocate + * them with minimal calls to ->get_block() and re-use __mpage_writepage() + * + * It's important that we call __mpage_writepage() only once for each + * involved page, otherwise we'd have to implement more complicated logic + * to deal with pages w/o PG_lock or w/ PG_writeback and so on. + * + * See comments to mpage_writepages() + */ +static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, + get_block_t get_block) +{ + struct mpage_da_data mpd; + int ret; + + if (!get_block) + return generic_writepages(mapping, wbc); + + mpd.wbc = wbc; + mpd.inode = mapping->host; + mpd.lbh.b_size = 0; + mpd.lbh.b_state = 0; + mpd.lbh.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.get_block = get_block; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); + + /* + * Handle last extent of pages + */ + if (mpd.next_page != mpd.first_page) { + mpage_da_map_blocks(&mpd); + mpage_da_submit_io(&mpd); + } + + return ret; +} + +/* + * this is a special callback for ->write_begin() only + * it's intention is to return mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0); + if (ret == 0) { + /* the block isn't allocated yet, let's reserve space */ + /* XXX: call reservation here */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? 
+ */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } else if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + + return ret; +} + +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + if (create) { + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + } + + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + up_write(&EXT4_I(inode)->i_data_sem); + + if (EXT4_I(inode)->i_disksize == disksize) { + if (handle == NULL) + handle = ext4_journal_start(inode, 1); + if (!IS_ERR(handle)) + ext4_mark_inode_dirty(handle, inode); + } + } + + ret = 0; + } + +out: + if (handle && !IS_ERR(handle)) + ext4_journal_stop(handle); + + return ret; +} +/* FIXME!! only support data=writeback mode */ +static int ext4_da_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = NULL; + int ret = 0; + int err; + + if (ext4_journal_current_handle()) + goto out_fail; + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_get_block, wbc); + else + ret = block_write_full_page(page, ext4_get_block, wbc); + + if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) { + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + } + + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return ret; +} + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct page *page; + pgoff_t index; + unsigned from, to; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. 
+ */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + } + +out: + return ret; +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh) || buffer_delay(bh); +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) + if (!walk_page_buffers(NULL, page_buffers(page), + 0, len, NULL, ext4_bh_unmapped_or_delay)){ + /* + * Updating i_disksize when extending file without + * needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off && buffer_delay(bh)) { + clear_buffer_delay(bh); + /* XXX: add real stuff here */ + } + curr_off = next_off; + bh = bh->b_this_page; + } while (bh != head); + +out: + ext4_invalidatepage(page, offset); + + return; +} + + /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. @@ -1427,6 +2093,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of @@ -1471,11 +2147,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - return !buffer_mapped(bh) || buffer_delay(bh); -} - /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). 
We even don't @@ -1832,10 +2503,28 @@ static const struct address_space_operations ext4_journalled_aops = { .releasepage = ext4_releasepage, }; +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, +}; + void ext4_set_aops(struct inode *inode) { if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 629d0fa27e3..de9d3d0eb20 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -898,7 +898,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, }; static match_table_t tokens = { @@ -957,6 +957,7 @@ static match_table_t tokens = { {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, {Opt_err, NULL}, }; @@ -1335,6 +1336,9 @@ set_qf_format: return 0; sbi->s_stripe = option; break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " -- cgit v1.2.3-70-g09d2 From e8ced39d5e8911c662d4d69a342b9d053eaaac4e Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: percpu_counter: new function percpu_counter_sum_and_set Delayed allocation need to check free blocks at every write time. percpu_counter_read_positive() is not quit accurate. delayed allocation need a more accurate accounting, but using percpu_counter_sum_positive() is frequently is quite expensive. This patch added a new function to update center counter when sum per-cpu counter, to increase the accurate rate for next percpu_counter_read() and require less calling expensive percpu_counter_sum(). 
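The intended calling pattern is the one used in the ext4_has_free_blocks() hunk below: trust the cheap, possibly stale percpu_counter_read_positive() value, and pay for the exact sum only when the result is within the per-cpu batching error of the decision threshold. A minimal sketch (example_enough_left and "needed" are illustrative names; the counter helpers and FBC_BATCH are the real ones used below):

	#include <linux/percpu_counter.h>

	/*
	 * Returns non-zero if at least "needed" units appear to be available.
	 * The exact sum also folds the per-cpu deltas back into fbc->count,
	 * so later percpu_counter_read() calls are more accurate.
	 */
	static int example_enough_left(struct percpu_counter *fbc, s64 needed)
	{
		s64 free = percpu_counter_read_positive(fbc);	/* fast, approximate */

	#ifdef CONFIG_SMP
		if (free - needed < FBC_BATCH)
			free = percpu_counter_sum_and_set(fbc);	/* exact, updates fbc->count */
	#endif
		return free >= needed;
	}
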
Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- include/linux/percpu_counter.h | 12 +++++++++--- lib/percpu_counter.c | 7 ++++++- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 25f63d8c1b3..6369bacf0dc 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1621,7 +1621,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, #ifdef CONFIG_SMP if (free_blocks - root_blocks < FBC_BATCH) free_blocks = - percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); #endif if (free_blocks - root_blocks < nblocks) return free_blocks - root_blocks; diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 9007ccdfc11..20838883535 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); -s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { - s64 ret = __percpu_counter_sum(fbc); + s64 ret = __percpu_counter_sum(fbc, 0); return ret < 0 ? 0 : ret; } +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) +{ + return __percpu_counter_sum(fbc, 1); +} + + static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum(fbc); + return __percpu_counter_sum(fbc, 0); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 119174494cb..4a8ba4bf5f6 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ -s64 __percpu_counter_sum(struct percpu_counter *fbc) +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) { s64 ret; int cpu; @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; + if (set) + *pcount = 0; } + if (set) + fbc->count = ret; + spin_unlock(&fbc->lock); return ret; } -- cgit v1.2.3-70-g09d2 From d2a1763791a634e315ec926b62829c1e88842c86 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Mon, 14 Jul 2008 17:52:37 -0400 Subject: ext4: delayed allocation ENOSPC handling This patch does block reservation for delayed allocation, to avoid ENOSPC later at page flush time. Blocks(data and metadata) are reserved at da_write_begin() time, the freeblocks counter is updated by then, and the number of reserved blocks is store in per inode counter. At the writepage time, the unused reserved meta blocks are returned back. At unlink/truncate time, reserved blocks are properly released. Updated fix from Aneesh Kumar K.V to fix the oldallocator block reservation accounting with delalloc, added lock to guard the counters and also fix the reservation for meta blocks. 
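The bookkeeping lives in the new per-inode fields (i_reserved_data_blocks, i_reserved_meta_blocks, i_allocated_meta_blocks) guarded by i_block_reservation_lock. In outline, the write_begin-time reservation works roughly like the sketch below; this is a simplified illustration of the accounting only, not the patch's literal code, and example_da_reserve_space is a hypothetical name (error handling and retries are omitted):

	#include "ext4.h"
	#include "ext4_extents.h"

	static int example_da_reserve_space(struct inode *inode, int nrblocks)
	{
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
		unsigned long md_needed, total;

		/* worst-case metadata (extent index/leaf blocks) for nrblocks */
		md_needed = ext4_ext_calc_metadata_amount(inode, nrblocks);
		total = md_needed + nrblocks;

		if (ext4_has_free_blocks(sbi, total) < total)
			return -ENOSPC;

		/* remember the reservation in the new per-inode counters */
		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
		EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
		EXT4_I(inode)->i_reserved_meta_blocks += md_needed;
		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

		/* claim the space now so a later page flush cannot hit ENOSPC */
		percpu_counter_sub(&sbi->s_freeblocks_counter, total);
		return 0;
	}

ext4_da_release_space(), declared in the ext4.h hunk, is the release-side counterpart: at writepage and truncate time it walks these counters back and returns any metadata blocks that were reserved but never actually allocated.
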
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 49 ++++++++----- fs/ext4/dir.c | 3 +- fs/ext4/ext4.h | 6 +- fs/ext4/ext4_extents.h | 1 + fs/ext4/ext4_i.h | 7 ++ fs/ext4/extents.c | 32 ++++++++- fs/ext4/inode.c | 184 +++++++++++++++++++++++++++++++++++++++++-------- fs/ext4/mballoc.c | 20 +++++- fs/ext4/super.c | 5 ++ 9 files changed, 257 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6369bacf0dc..495ab21b983 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1701,7 +1701,12 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, } sbi = EXT4_SB(sb); - *count = ext4_has_free_blocks(sbi, *count); + if (!EXT4_I(inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + *count = ext4_has_free_blocks(sbi, *count); + } if (*count == 0) { *errp = -ENOSPC; return 0; /*return with ENOSPC error */ @@ -1902,7 +1907,8 @@ allocated: le16_add_cpu(&gdp->bg_free_blocks_count, -num); gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); spin_unlock(sb_bgl_lock(sbi, group_no)); - percpu_counter_sub(&sbi->s_freeblocks_counter, num); + if (!EXT4_I(inode)->i_delalloc_reserved_flag) + percpu_counter_sub(&sbi->s_freeblocks_counter, num); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group_no); @@ -1976,40 +1982,49 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, } /* - * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) + * @count: total number of blocks need * @errp: error code * - * Return allocated block number on success + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer */ -ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp) { - unsigned long count = 1; - return do_blk_alloc(handle, inode, 0, goal, - &count, errp, EXT4_META_BLOCK); + ext4_fsblk_t ret; + ret = do_blk_alloc(handle, inode, 0, goal, + count, errp, EXT4_META_BLOCK); + /* + * Account for the allocated meta blocks + */ + if (!(*errp)) { + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + EXT4_I(inode)->i_allocated_meta_blocks += *count; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + } + return ret; } /* - * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: total number of blocks need * @errp: error code * - * Return 1st allocated block numberon success, *count stores total account - * error stores in errp pointer + * Return allocated block number on success */ -ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp) { - return do_blk_alloc(handle, inode, 0, goal, - count, errp, EXT4_META_BLOCK); + unsigned long count = 1; + return ext4_new_meta_blocks(handle, 
inode, goal, &count, errp); } /* diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5ed5108766c..d3d23d73c08 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, + 0, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ee9576dc0ba..0962f4e2657 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -74,6 +74,9 @@ #define EXT4_MB_HINT_GOAL_ONLY 256 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 512 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 1024 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -1041,6 +1044,7 @@ extern void ext4_mb_update_group_info(struct ext4_group_info *grp, /* inode.c */ +void ext4_da_release_space(struct inode *inode, int used, int to_free); int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, @@ -1234,7 +1238,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, - int extend_disksize); + int extend_disksize, int flag); #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 75333b595fa..6c166c0a54b 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index c2903ef7215..ef7409f0e7e 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -163,6 +163,13 @@ struct ext4_inode_info { /* mballoc */ struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + + /* allocation reservation info for delalloc */ + unsigned long i_reserved_data_blocks; + unsigned long i_reserved_meta_blocks; + unsigned long i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + spinlock_t i_block_reservation_lock; }; #endif /* _EXT4_I */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7844bbb2bac..dabc3b68d24 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -248,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) return size; } +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +{ + int lcap, icap, rcap, leafs, idxs, num; + int newextents = blocks; + + rcap = ext4_ext_space_root_idx(inode); + lcap = ext4_ext_space_block(inode); + icap = ext4_ext_space_block_idx(inode); + + /* number of new leaf blocks needed */ + num = leafs = (newextents + lcap - 1) / lcap; + + /* + * Worse case, we need separate index block(s) + * to link all new leaf blocks + */ + idxs = (leafs + icap - 1) / icap; + do { + num 
+= idxs; + idxs = (idxs + icap - 1) / icap; + } while (idxs > rcap); + + return num; +} + static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -2910,7 +2940,7 @@ retry: } ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0); + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2bef4f879e4..a6b800c5847 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,6 +39,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) @@ -982,7 +983,7 @@ out: */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) + int create, int extend_disksize, int flag) { int retval; @@ -1023,6 +1024,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -1044,6 +1054,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, ~EXT4_EXT_MIGRATE; } } + + if (flag) { + EXT4_I(inode)->i_delalloc_reserved_flag = 0; + /* + * Update reserved blocks/metadata blocks + * after successful block allocation + * which were deferred till now + */ + if ((retval > 0) && buffer_delay(bh)) + ext4_da_release_space(inode, retval, 0); + } + up_write((&EXT4_I(inode)->i_data_sem)); return retval; } @@ -1069,7 +1091,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1095,7 +1117,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create, 1, 0); /* * ext4_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1409,6 +1431,122 @@ static int ext4_journalled_write_end(struct file *file, return ret ? 
ret : copied; } +/* + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_has_free_blocks(sbi, total) < total) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return -ENOSPC; + } + + /* reduce fs free blocks counter */ + percpu_counter_sub(&sbi->s_freeblocks_counter, total); + + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +void ext4_da_release_space(struct inode *inode, int used, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + release = to_free + mdb_free; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = 
bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, 0, to_release); +} /* * Delayed allocation stuff @@ -1829,14 +1967,18 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0); - if (ret == 0) { - /* the block isn't allocated yet, let's reserve space */ - /* XXX: call reservation here */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + map_bh(bh_result, inode->i_sb, 0); set_buffer_new(bh_result); set_buffer_delay(bh_result); @@ -1847,7 +1989,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } - +#define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -1865,7 +2007,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0); + bh_result, create, 0, EXT4_DELALLOC_RSVED); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); @@ -1952,7 +2094,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret; + int ret, retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -1963,6 +2105,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; +retry: /* * With delayed allocation, we don't log the i_disksize update * if there is delayed block allocation. But we still need @@ -1988,6 +2131,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, page_cache_release(page); } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; out: return ret; } @@ -2040,9 +2185,6 @@ static int ext4_da_write_end(struct file *file, static void ext4_da_invalidatepage(struct page *page, unsigned long offset) { - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - /* * Drop reserved blocks */ @@ -2050,21 +2192,7 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) if (!page_has_buffers(page)) goto out; - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - /* - * is this block fully invalidated? 
- */ - if (offset <= curr_off && buffer_delay(bh)) { - clear_buffer_delay(bh); - /* XXX: add real stuff here */ - } - curr_off = next_off; - bh = bh->b_this_page; - } while (bh != head); + ext4_da_page_release_reservation(page, offset); out: ext4_invalidatepage(page, offset); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d254ca83d9..8d141a25bbe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2964,7 +2964,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + + /* + * free blocks account has already be reduced/reserved + * at write_begin() time for delayed allocation + * do not double accounting + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + percpu_counter_sub(&sbi->s_freeblocks_counter, + ac->ac_b_ex.fe_len); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -4169,7 +4177,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, &(ar->len), errp); return block; } - ar->len = ext4_has_free_blocks(sbi, ar->len); + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + ar->len = ext4_has_free_blocks(sbi, ar->len); + } if (ar->len == 0) { *errp = -ENOSPC; @@ -4186,6 +4199,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } inquota = ar->len; + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index de9d3d0eb20..25e2f2488cd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -574,6 +574,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); return &ei->vfs_inode; } -- cgit v1.2.3-70-g09d2 From 61628a3f3a37af2bf25daf8e26fd6b76a78c4f76 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Invert lock ordering of page_lock and transaction start in delalloc With the reverse locking, we need to start a transation before taking the page lock, so in ext4_da_writepages() we need to break the write-out into chunks, and restart the journal for each chunck to ensure the write-out fits in a single transaction. Updated patch from Aneesh Kumar K.V which fixes delalloc sync hang with journal lock inversion, and address the performance regression issue. 
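The loop shape can be pictured with a small schematic program. Everything below is a stand-in (journal_start/journal_stop/write_chunk only print what the real calls would do, and MAX_PAGES_PER_TXN plays the role of the writeback cap); the point is only the ordering: the transaction is opened before any page lock is taken, and the journal is restarted for every chunk so each chunk fits in one transaction.

    #include <stdio.h>

    #define MAX_PAGES_PER_TXN 64    /* stand-in for the per-transaction page cap */

    static void journal_start(void) { printf("  journal_start\n"); }
    static void journal_stop(void)  { printf("  journal_stop\n"); }

    /* Pretend to write 'pages' pages; page locks would be taken in here. */
    static long write_chunk(long pages)
    {
            printf("  write %ld pages under page locks\n", pages);
            return pages;
    }

    int main(void)
    {
            long to_write = 150;            /* plays the role of wbc->nr_to_write */

            while (to_write > 0) {
                    long chunk = to_write < MAX_PAGES_PER_TXN ?
                                 to_write : MAX_PAGES_PER_TXN;

                    journal_start();        /* transaction first ...        */
                    to_write -= write_chunk(chunk);
                    journal_stop();         /* ... and restarted per chunk  */
            }
            return 0;
    }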
Signed-off-by: Mingming Cao Signed-off-by: Aneesh Kumar K.V Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 10 ++- fs/ext4/inode.c | 201 ++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 152 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dabc3b68d24..42c4c0c892e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2565,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret; unsigned long allocated = 0; struct ext4_allocation_request ar; + loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2755,8 +2756,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; + if (extend_disksize) { + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + } set_buffer_new(bh_result); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6b800c5847..7923336ecf9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -847,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; + loff_t disksize; J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); @@ -922,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; + if (!err && extend_disksize) { + disksize = ((loff_t) iblock + count) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > ei->i_disksize) + ei->i_disksize = disksize; + } if (err) goto cleanup; @@ -1683,13 +1689,11 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, do { if (cur_logical >= logical + blocks) break; - if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); - } else if (buffer_mapped(bh)) { + } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); - } cur_logical++; pblock++; @@ -1764,10 +1768,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) if (buffer_delay(lbh)) mpage_put_bnr_to_bhs(mpd, next, &new); - /* go for the remaining blocks */ - next += new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; - } + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } } #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) @@ -1993,18 +1997,14 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + int ret; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; loff_t disksize = EXT4_I(inode)->i_disksize; handle_t *handle = NULL; - if (create) { - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - } + handle = 
ext4_journal_current_handle(); + BUG_ON(handle == NULL); + BUG_ON(create == 0); ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, bh_result, create, 0, EXT4_DELALLOC_RSVED); @@ -2029,65 +2029,157 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, up_write(&EXT4_I(inode)->i_data_sem); if (EXT4_I(inode)->i_disksize == disksize) { - if (handle == NULL) - handle = ext4_journal_start(inode, 1); - if (!IS_ERR(handle)) - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; } } - ret = 0; } - -out: - if (handle && !IS_ERR(handle)) - ext4_journal_stop(handle); - return ret; } + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh) || buffer_delay(bh); +} + /* FIXME!! only support data=writeback mode */ +/* + * get called vi ext4_da_writepages after taking page lock + * We may end up doing block allocation here in case + * mpage_da_map_blocks failed to allocate blocks. + */ static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; int ret = 0; - int err; + loff_t size; + unsigned long len; + handle_t *handle = NULL; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; - if (ext4_journal_current_handle()) - goto out_fail; + handle = ext4_journal_current_handle(); + if (!handle) { + /* + * This can happen when we aren't called via + * ext4_da_writepages() but directly (shrink_page_list). + * We cannot easily start a transaction here so we just skip + * writing the page in case we would have to do so. + */ + size = i_size_read(inode); - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; + page_bufs = page_buffers(page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (walk_page_buffers(NULL, page_bufs, 0, + len, NULL, ext4_bh_unmapped_or_delay)) { + /* + * We can't do block allocation under + * page lock without a handle . So redirty + * the page and return + */ + BUG_ON(wbc->sync_mode != WB_SYNC_NONE); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); + ret = nobh_writepage(page, ext4_da_get_block_write, wbc); else - ret = block_write_full_page(page, ext4_get_block, wbc); - - if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) { - EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); - } + ret = block_write_full_page(page, ext4_da_get_block_write, wbc); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; - -out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); return ret; } + +/* + * For now just follow the DIO way to estimate the max credits + * needed to write out EXT4_MAX_WRITEBACK_PAGES. + * todo: need to calculate the max credits need for + * extent based files, currently the DIO credits is based on + * indirect-blocks mapping way. 
+ * + * Probably should have a generic way to calculate credits + * for DIO, writepages, and truncate + */ +#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS +#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); + struct inode *inode = mapping->host; + handle_t *handle = NULL; + int needed_blocks; + int ret = 0; + long to_write; + loff_t range_start = 0; + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages) + return 0; + + /* + * Estimate the worse case needed credits to write out + * EXT4_MAX_BUF_BLOCKS pages + */ + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + + to_write = wbc->nr_to_write; + if (!wbc->range_cyclic) { + /* + * If range_cyclic is not set force range_cont + * and save the old writeback_index + */ + wbc->range_cont = 1; + range_start = wbc->range_start; + } + + while (!ret && to_write) { + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + /* + * set the max dirty pages could be write at a time + * to fit into the reserved transaction credits + */ + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; + + to_write -= wbc->nr_to_write; + ret = mpage_da_writepages(mapping, wbc, + ext4_da_get_block_write); + ext4_journal_stop(handle); + if (wbc->nr_to_write) { + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + to_write += wbc->nr_to_write; + break; + } + wbc->nr_to_write = to_write; + } + +out_writepages: + wbc->nr_to_write = to_write; + if (range_start) + wbc->range_start = range_start; + return ret; } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, @@ -2137,11 +2229,6 @@ out: return ret; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - return !buffer_mapped(bh) || buffer_delay(bh); -} - static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, -- cgit v1.2.3-70-g09d2 From cd1aac32923a9c8adcc0ae85e33c1ca0c5855838 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Add ordered mode support for delalloc This provides a new ordered mode implementation which gets rid of using buffer heads to enforce the ordering between metadata change with the related data chage. Instead, in the new ordering mode, it keeps track of all of the inodes touched by each transaction on a list, and when that transaction is committed, it flushes all of the dirty pages for those inodes. In addition, the new ordered mode reverses the lock ordering of the page lock and transaction lock, which provides easier support for delayed allocation. 
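A toy model of the new ordering (the types and the printf "flush" are invented for illustration; in the kernel the list is the transaction's inode list and the data is written back through the normal writepage paths): every inode a transaction touches is remembered, and commit writes that inode's data out before the metadata.

    #include <stdio.h>
    #include <stdlib.h>

    struct txn_inode {                      /* stand-in for a tracked inode */
            int ino;
            struct txn_inode *next;
    };

    struct transaction {
            struct txn_inode *inode_list;   /* inodes with dirty data in this txn */
    };

    /* Remember an inode as soon as the transaction allocates blocks for it. */
    static void txn_add_inode(struct transaction *t, int ino)
    {
            struct txn_inode *ji = malloc(sizeof(*ji));

            if (!ji)
                    exit(1);
            ji->ino = ino;
            ji->next = t->inode_list;
            t->inode_list = ji;
    }

    /* Commit: data of every tracked inode first, journal metadata second. */
    static void txn_commit(struct transaction *t)
    {
            while (t->inode_list) {
                    struct txn_inode *ji = t->inode_list;

                    t->inode_list = ji->next;
                    printf("flush dirty data of inode %d\n", ji->ino);
                    free(ji);
            }
            printf("write metadata blocks and commit record\n");
    }

    int main(void)
    {
            struct transaction t = { NULL };

            txn_add_inode(&t, 12);
            txn_add_inode(&t, 42);
            txn_commit(&t);
            return 0;
    }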
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 30 +++++++++++++++++++++++++----- fs/jbd2/commit.c | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7923336ecf9..24518b57733 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2043,11 +2043,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh) || buffer_delay(bh); } -/* FIXME!! only support data=writeback mode */ /* * get called vi ext4_da_writepages after taking page lock * We may end up doing block allocation here in case * mpage_da_map_blocks failed to allocate blocks. + * + * We also get called via journal_submit_inode_data_buffers */ static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) @@ -2066,6 +2067,7 @@ static int ext4_da_writepage(struct page *page, * ext4_da_writepages() but directly (shrink_page_list). * We cannot easily start a transaction here so we just skip * writing the page in case we would have to do so. + * We reach here also via journal_submit_inode_data_buffers */ size = i_size_read(inode); @@ -2081,8 +2083,11 @@ static int ext4_da_writepage(struct page *page, * We can't do block allocation under * page lock without a handle . So redirty * the page and return + * We may reach here when we do a journal commit + * via journal_submit_inode_data_buffers. + * If we don't have mapping block we just ignore + * them */ - BUG_ON(wbc->sync_mode != WB_SYNC_NONE); redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; @@ -2097,7 +2102,6 @@ static int ext4_da_writepage(struct page *page, return ret; } - /* * For now just follow the DIO way to estimate the max credits * needed to write out EXT4_MAX_WRITEBACK_PAGES. @@ -2130,7 +2134,7 @@ static int ext4_da_writepages(struct address_space *mapping, return 0; /* - * Estimate the worse case needed credits to write out + * Estimate the worse case needed credits to write out * EXT4_MAX_BUF_BLOCKS pages */ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; @@ -2152,6 +2156,19 @@ static int ext4_da_writepages(struct address_space *mapping, ret = PTR_ERR(handle); goto out_writepages; } + if (ext4_should_order_data(inode)) { + /* + * With ordered mode we need to add + * the inode to the journal handle + * when we do block allocation. + */ + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out_writepages; + } + + } /* * set the max dirty pages could be write at a time * to fit into the reserved transaction credits @@ -2735,7 +2752,10 @@ static const struct address_space_operations ext4_da_aops = { void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; else if (ext4_should_writeback_data(inode) && test_opt(inode->i_sb, DELALLOC)) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 483183d15ed..f8b3be87322 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -184,6 +186,27 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) return ret; } +/* + * write the filemap data using writepage() address_space_operations. 
+ * We don't do block allocation here even for delalloc. We don't + * use writepages() because with dealyed allocation we may be doing + * block allocation in writepages(). + */ +static int journal_submit_inode_data_buffers(struct address_space *mapping) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = 0, + .range_end = i_size_read(mapping->host), + .for_writepages = 1, + }; + + ret = generic_writepages(mapping, &wbc); + return ret; +} + /* * Submit all the data buffers of inode associated with the transaction to * disk. @@ -192,7 +215,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently * operate on from being released while we write out pages. */ -static int journal_submit_inode_data_buffers(journal_t *journal, +static int journal_submit_data_buffers(journal_t *journal, transaction_t *commit_transaction) { struct jbd2_inode *jinode; @@ -204,8 +227,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal, mapping = jinode->i_vfs_inode->i_mapping; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - err = filemap_fdatawrite_range(mapping, 0, - i_size_read(jinode->i_vfs_inode)); + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + err = journal_submit_inode_data_buffers(mapping); if (!ret) ret = err; spin_lock(&journal->j_list_lock); @@ -228,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, struct jbd2_inode *jinode, *next_i; int err, ret = 0; - /* For locking, see the comment in journal_submit_inode_data_buffers() */ + /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { jinode->i_flags |= JI_COMMIT_RUNNING; @@ -431,7 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = journal_submit_inode_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction); if (err) jbd2_journal_abort(journal, err); -- cgit v1.2.3-70-g09d2 From f0e6c98593eb8a77edb7dd0edb22bb9f9368c567 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Handle page without buffers in ext4_*_writepage() It can happen that buffers are removed from the page before it gets marked dirty and then is passed to writepage(). In writepage() we just initialize the buffers and check whether they are mapped and non delay. If they are mapped and non delay we write the page. Otherwise we mark them dirty. With this change we don't do block allocation at all in ext4_*_write_page. writepage() can get called under many condition and with a locking order of journal_start -> lock_page, we should not try to allocate blocks in writepage() which get called after taking page lock. writepage() can get called via shrink_page_list even with a journal handle which was created for doing inode update. For example when doing ext4_da_write_begin we create a journal handle with credit 1 expecting a i_disksize update for the inode. But ext4_da_write_begin can cause shrink_page_list via _grab_page_cache. 
So having a valid handle via ext4_journal_current_handle is not a guarantee that we can use the handle for block allocation in writepage, since we shouldn't be using credits that had been reserved for other updates. That it could result in we running out of credits when we update inodes. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 169 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 124 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 24518b57733..ce47847bb37 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2003,11 +2003,15 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle_t *handle = NULL; handle = ext4_journal_current_handle(); - BUG_ON(handle == NULL); - BUG_ON(create == 0); - - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + if (!handle) { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + BUG_ON(!ret); + } else { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, bh_result, create, 0, EXT4_DELALLOC_RSVED); + } + if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); @@ -2040,15 +2044,37 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) { - return !buffer_mapped(bh) || buffer_delay(bh); + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation + */ + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); +} + +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* + * we don't want to do block allocation in writepage + * so call get_block_wrap with create = 0 + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; } /* - * get called vi ext4_da_writepages after taking page lock - * We may end up doing block allocation here in case - * mpage_da_map_blocks failed to allocate blocks. - * - * We also get called via journal_submit_inode_data_buffers + * get called vi ext4_da_writepages after taking page lock (have journal handle) + * get called via journal_submit_inode_data_buffers (no journal handle) + * get called via shrink_page_list via pdflush (no journal handle) + * or grab_page_cache when doing write_begin (have journal handle) */ static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) @@ -2056,37 +2082,61 @@ static int ext4_da_writepage(struct page *page, int ret = 0; loff_t size; unsigned long len; - handle_t *handle = NULL; struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - handle = ext4_journal_current_handle(); - if (!handle) { - /* - * This can happen when we aren't called via - * ext4_da_writepages() but directly (shrink_page_list). - * We cannot easily start a transaction here so we just skip - * writing the page in case we would have to do so. 
- * We reach here also via journal_submit_inode_data_buffers - */ - size = i_size_read(inode); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + if (page_has_buffers(page)) { page_bufs = page_buffers(page); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (walk_page_buffers(NULL, page_bufs, 0, - len, NULL, ext4_bh_unmapped_or_delay)) { + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { /* - * We can't do block allocation under - * page lock without a handle . So redirty - * the page and return + * We don't want to do block allocation + * So redirty the page and return * We may reach here when we do a journal commit * via journal_submit_inode_data_buffers. * If we don't have mapping block we just ignore - * them + * them. We can also reach here via shrink_page_list + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * The test for page_has_buffers() is subtle: + * We know the page is dirty but it lost buffers. That means + * that at some moment in time after write_begin()/write_end() + * has been called all buffers have been clean and thus they + * must have been written at least once. So they are all + * mapped and we can happily proceed with mapping them + * and writing the page. + * + * Try to initialize the buffer_heads and check whether + * all are mapped and non delay. We don't want to + * do block allocation here. + */ + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (!ret) { + page_bufs = page_buffers(page); + /* check whether all are mapped and non delay */ + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * We can't do block allocation here + * so just redity the page and unlock + * and return */ redirty_page_for_writepage(wbc, page); unlock_page(page); @@ -2095,9 +2145,11 @@ static int ext4_da_writepage(struct page *page, } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_da_get_block_write, wbc); + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); else - ret = block_write_full_page(page, ext4_da_get_block_write, wbc); + ret = block_write_full_page(page, + ext4_normal_get_block_write, + wbc); return ret; } @@ -2438,12 +2490,14 @@ static int __ext4_normal_writepage(struct page *page, struct inode *inode = page->mapping->host; if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, ext4_get_block, wbc); + return nobh_writepage(page, + ext4_normal_get_block_write, wbc); else - return block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); } - static int ext4_normal_writepage(struct page *page, struct writeback_control *wbc) { @@ -2452,13 +2506,24 @@ static int ext4_normal_writepage(struct page *page, loff_t len; J_ASSERT(PageLocked(page)); - J_ASSERT(page_has_buffers(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. 
If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + } if (!ext4_journal_current_handle()) return __ext4_normal_writepage(page, wbc); @@ -2478,7 +2543,8 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block); + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); if (ret != 0) goto out_unlock; @@ -2525,13 +2591,24 @@ static int ext4_journalled_writepage(struct page *page, loff_t len; J_ASSERT(PageLocked(page)); - J_ASSERT(page_has_buffers(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + } if (ext4_journal_current_handle()) goto no_write; @@ -2549,7 +2626,9 @@ static int ext4_journalled_writepage(struct page *page, * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - return block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); } no_write: redirty_page_for_writepage(wbc, page); -- cgit v1.2.3-70-g09d2 From 632eaeab1feb5d78c1e2bfb1d2dfc0ebb8ac187f Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: fix delalloc i_disksize early update issue Ext4_da_write_end() used walk_page_buffers() with a callback function of ext4_bh_unmapped_or_delay() to check if it extended the file size without allocating any blocks (since in this case i_disksize needs to be updated). However, this is didn't work proprely because the buffer head has not been marked dirty yet --- this is done later in block_commit_write() --- which caused ext4_bh_unmapped_or_delay() to always return false. In addition, walk_page_buffers() checks all of the buffer heads covering the page, and the only buffer_head that should be checked is the one covering the end of the write. Otherwise, given a 1k blocksize filesystem and a 4k page size, the buffer head covering the first 1k stripe of the file could be unmapped (because it was a sparse file), and the second or third buffer_head covering that page could be mapped, and using walk_page_buffers() would fail in this case since it would stop at the first unmapped buffer_head and return true. 
The core problem is that walk_page_buffers() was intended to do work in a callback function, and a non-zero return value indicated a failure, which termined the walk of the buffer heads covering the page. It was not intended to be used with a boolean function, such as ext4_bh_unmapped_or_delay(). Add addtional fix from Aneesh to protect i_disksize update rave with truncate. Signed-off-by: Mingming Cao Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce47847bb37..0fbe678d40b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2298,6 +2298,29 @@ out: return ret; } +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i=0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh))) + return 0; + return 1; +} + static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -2307,6 +2330,10 @@ static int ext4_da_write_end(struct file *file, int ret = 0, ret2; handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; + unsigned long start, end; + + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied -1; /* * generic_write_end() will run mark_inode_dirty() if i_size @@ -2315,18 +2342,23 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) - if (!walk_page_buffers(NULL, page_buffers(page), - 0, len, NULL, ext4_bh_unmapped_or_delay)){ - /* - * Updating i_disksize when extending file without - * needing block allocation - */ - if (ext4_should_order_data(inode)) - ret = ext4_jbd2_file_inode(handle, inode); + if (new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); - EXT4_I(inode)->i_disksize = new_i_size; + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); } + } ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; @@ -3393,6 +3425,11 @@ void ext4_truncate(struct inode *inode) if (ext4_orphan_add(handle, inode)) goto out_stop; + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate @@ -3402,12 +3439,6 @@ void ext4_truncate(struct inode *inode) */ ei->i_disksize = inode->i_size; - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. 
- */ - down_write(&ei->i_data_sem); - if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); -- cgit v1.2.3-70-g09d2 From 3e3398a08d6e516675d5af853d625dc7dd90eab1 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: delayed allocation i_blocks fix for stat Right now i_blocks is not getting updated until the blocks are actually allocaed on disk. This means with delayed allocation, right after files are copied, "ls -sF" shoes the file as taking 0 blocks on disk. "du" also shows the files taking zero space, which is highly confusing to the user. Since delayed allocation already keeps track of per-inode total number of blocks that are subject to delayed allocation, this patch fix this by using that to adjust the value returned by stat(2). When real block allocation is done, the i_blocks will get updated. Since the reserved blocks for delayed allocation will be decreased, this will be keep value returned by stat(2) consistent. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 ++ fs/ext4/file.c | 1 + fs/ext4/inode.c | 26 ++++++++++++++++++++++++++ 3 files changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0962f4e2657..303e41cf7b1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1059,6 +1059,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode (struct inode *, int); extern int ext4_setattr (struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); extern void ext4_delete_inode (struct inode *); extern int ext4_sync_inode (handle_t *, struct inode *); extern void ext4_discard_reservation (struct inode *); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b9510ba66a2..430eb7978db 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -161,6 +161,7 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .truncate = ext4_truncate, .setattr = ext4_setattr, + .getattr = ext4_getattr, #ifdef CONFIG_EXT4DEV_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0fbe678d40b..8ca2763df09 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4231,6 +4231,32 @@ err_out: return error; } +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + return 0; +} /* * How many blocks doth make a writepage()? 
-- cgit v1.2.3-70-g09d2 From dd919b9822c5fd9fd72f95a602440130297c3857 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Enable delalloc by default. Enable delalloc by default to ensure it gets sufficient testing and because it makes the filesystem much more efficient. Add a nodealalloc option to disable delayed allocation, and update ext4_show_options to show delayed allocation off if it is disabled. If the data=journal mount option is used, disable delayed allocation since the delalloc code doesn't support data=journal yet. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Signed-off-by: Mingming Cao --- fs/ext4/super.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 25e2f2488cd..4e104dd0ec2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -756,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nomballoc"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC)) + seq_puts(seq, ",nodelalloc"); + if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); @@ -903,7 +906,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, }; static match_table_t tokens = { @@ -963,6 +966,7 @@ static match_table_t tokens = { {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, {Opt_err, NULL}, }; @@ -1328,6 +1332,9 @@ set_qf_format: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; break; + case Opt_nodelalloc: + clear_opt(sbi->s_mount_opt, DELALLOC); + break; case Opt_mballoc: set_opt(sbi->s_mount_opt, MBALLOC); break; @@ -1984,6 +1991,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) */ set_opt(sbi->s_mount_opt, MBALLOC); + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + set_opt(sbi->s_mount_opt, DELALLOC); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, NULL, 0)) goto failed_mount; @@ -2422,6 +2436,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + ext4_ext_init(sb); ext4_mb_init(sb, needs_recovery); -- cgit v1.2.3-70-g09d2 From c07651b556323e0e763c452587fe29d2b034b314 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Don't allow nonextenst mount option for large filesystem The block mapped inode format can address only blocks within 2**32. This causes a number of issues, the biggest of which is that the block allocator needs to be taught that certain inodes can not utilize block numbers > 2**32. So until this is fixed, it is simplest to fail mounting of file systems with more than 2**32 blocks if the -o noextents option is given. 
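The 2**32 ceiling is simple arithmetic: block-mapped inodes store 32-bit block numbers, so the highest addressable block is 0xffffffff and the size limit scales with the block size. The figures below are computed for illustration and are not taken from the patch.

    #include <stdio.h>

    int main(void)
    {
            unsigned long long max_blocks = 0x100000000ULL; /* 2**32 block numbers */
            int block_sizes[] = { 1024, 2048, 4096 };

            for (int i = 0; i < 3; i++)
                    printf("%4d-byte blocks: block-mapped limit is %llu TiB\n",
                           block_sizes[i],
                           (max_blocks * block_sizes[i]) >> 40);
            return 0;
    }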
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4e104dd0ec2..2bf9cdd7a03 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1004,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb, int qtype, qfmt; char *qname; #endif + ext4_fsblk_t last_block; if (!options) return 1; @@ -1326,6 +1327,20 @@ set_qf_format: set_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_noextents: + /* + * When e2fsprogs support resizing an already existing + * ext3 file system to greater than 2**32 we need to + * add support to block allocator to handle growing + * already existing block mapped inode so that blocks + * allocated for them fall within 2**32 + */ + last_block = ext4_blocks_count(sbi->s_es) - 1; + if (last_block > 0xffffffffULL) { + printk(KERN_ERR "EXT4-fs: Filesystem too " + "large to mount with " + "-o noextents options\n"); + return 0; + } clear_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_i_version: -- cgit v1.2.3-70-g09d2 From e4079a11f5ed966b7d972cc69e8d337a0f095e32 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: do not set extents feature from the kernel We've talked for a while about getting rid of any feature- setting from the kernel; this gets rid of the code which would set the INCOMPAT_EXTENTS flag on the first file write when mounted as ext4[dev]. With this patch, if the extents feature is not already set on disk, then mounting as ext4 will fall back to noextents with a warning, and if -o extents is explicitly requested, the mount will fail, also with warning. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 6 +----- fs/ext4/super.c | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8b0a10acd70..a92eb305344 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -836,14 +836,10 @@ got: goto fail_free_drop; if (test_opt(sb, EXTENTS)) { - /* set extent flag only for diretory, file and normal symlink*/ + /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; ext4_ext_tree_init(handle, inode); - err = ext4_update_incompat_feature(handle, sb, - EXT4_FEATURE_INCOMPAT_EXTENTS); - if (err) - goto fail_free_drop; } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2bf9cdd7a03..1cb371dcd60 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1324,6 +1324,13 @@ set_qf_format: clear_opt(sbi->s_mount_opt, NOBH); break; case Opt_extents: + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_warning(sb, __func__, + "extents feature not enabled " + "on this filesystem, use tune2fs\n"); + return 0; + } set_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_noextents: @@ -1997,12 +2004,18 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) /* * turn on extents feature by default in ext4 filesystem - * User -o noextents to turn it off + * only if feature flag already set by mkfs or tune2fs. 
+ * Use -o noextents to turn it off */ - set_opt(sbi->s_mount_opt, EXTENTS); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + set_opt(sbi->s_mount_opt, EXTENTS); + else + ext4_warning(sb, __func__, + "extents feature not enabled on this filesystem, " + "use tune2fs.\n"); /* - * turn on mballoc feature by default in ext4 filesystem - * User -o nomballoc to turn it off + * turn on mballoc code by default in ext4 filesystem + * Use -o nomballoc to turn it off */ set_opt(sbi->s_mount_opt, MBALLOC); -- cgit v1.2.3-70-g09d2 From e911d0cc877ff027d5bd09fc33148ab76f0fdf0e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 12 Jul 2008 13:47:59 -0700 Subject: cifs: fix inode leak in cifs_get_inode_info_unix Try this: mount a share with unix extensions create a file on it umount the share You'll get the following message in the ring buffer: VFS: Busy inodes after unmount of cifs. Self-destruct in 5 seconds. Have a nice day... ...the problem is that cifs_get_inode_info_unix is creating and hashing a new inode even when it's going to return error anyway. The first lookup when creating a file returns an error so we end up leaking this inode before we do the actual create. This appears to be a regression caused by commit 0e4bbde94fdc33f5b3d793166b21bf768ca3e098. The following patch seems to fix it for me, and fixes a minor formatting nit as well. Signed-off-by: Jeff Layton Acked-by: Steven French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 722be543cee..2e904bd111c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode, rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc) { - if (rc == -EREMOTE && !is_dfs_referral) { - is_dfs_referral = true; - cFYI(DBG2, ("DFS ref")); - /* for DFS, server does not give us real inode data */ - fill_fake_finddataunix(&find_data, sb); - rc = 0; - } - } + if (rc == -EREMOTE && !is_dfs_referral) { + is_dfs_referral = true; + cFYI(DBG2, ("DFS ref")); + /* for DFS, server does not give us real inode data */ + fill_fake_finddataunix(&find_data, sb); + rc = 0; + } else if (rc) + goto cgiiu_exit; + num_of_bytes = le64_to_cpu(find_data.NumOfBytes); end_of_file = le64_to_cpu(find_data.EndOfFile); @@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, *pinode = new_inode(sb); if (*pinode == NULL) { rc = -ENOMEM; - goto cgiiu_exit; + goto cgiiu_exit; } /* Is an i_ino of zero legal? */ /* note ino incremented to unique num in new_inode */ -- cgit v1.2.3-70-g09d2 From 536abdb0802f3fac1b217530741853843d63c281 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 12 Jul 2008 13:48:00 -0700 Subject: cifs: fix wksidarr declaration to be big-endian friendly The current definition of wksidarr works fine on little endian arches (since cpu_to_le32 is a no-op there), but on big-endian arches, it fails to compile with this error: error: braced-group within expression allowed only inside a function The problem is that this static declaration has cpu_to_le32 embedded within it, and that expands into a function macro. We need to use __constant_cpu_to_le32() instead. 
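The underlying issue is plain C and can be reproduced outside the kernel. The two macros below are invented for the illustration (they are not the kernel's byte-swap macros): a GCC ({ ... }) statement expression is fine inside a function but is not a constant expression, so it cannot appear in a static initializer, while a macro that folds to an ordinary constant expression can.

    /* Build with gcc; the ({ ... }) statement expression is a GCC extension. */
    #define SWAP32_EXPR(x)  ({ unsigned v = (x); __builtin_bswap32(v); })
    #define SWAP32_CONST(x) ((((x) & 0xffU) << 24) | (((x) & 0xff00U) << 8) | \
                             (((x) >> 8) & 0xff00U) | (((x) >> 24) & 0xffU))

    static unsigned ok_init = SWAP32_CONST(544);    /* constant expression: compiles */
    /*
     * static unsigned bad_init = SWAP32_EXPR(544);
     * would fail with "braced-group within expression allowed only inside a
     * function", the error quoted above.
     */

    int main(void)
    {
            unsigned in_function = SWAP32_EXPR(544);  /* fine inside a function */

            return (ok_init == 0x20020000u && in_function == ok_init) ? 0 : 1;
    }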
Signed-off-by: Jeff Layton Cc: Steven French Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/cifsacl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 34902cff540..0e9fc2ba90e 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -34,11 +34,11 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} } + {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, + {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} } ; -- cgit v1.2.3-70-g09d2 From 006ebb40d3d65338bd74abb03b945f8d60e362bd Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 19 May 2008 08:32:49 -0400 Subject: Security: split proc ptrace checking into read vs. attach Enable security modules to distinguish reading of process state via proc from full ptrace access by renaming ptrace_may_attach to ptrace_may_access and adding a mode argument indicating whether only read access or full attach access is requested. This allows security modules to permit access to reading process state without granting full ptrace access. The base DAC/capability checking remains unchanged. Read access to /proc/pid/mem continues to apply a full ptrace attach check since check_mem_permission() already requires the current task to already be ptracing the target. The other ptrace checks within proc for elements like environ, maps, and fds are changed to pass the read mode instead of attach. In the SELinux case, we model such reading of process state as a reading of a proc file labeled with the target process' label. This enables SELinux policy to permit such reading of process state without permitting control or manipulation of the target process, as there are a number of cases where programs probe for such information via proc but do not need to be able to control the target (e.g. procps, lsof, PolicyKit, ConsoleKit). At present we have to choose between allowing full ptrace in policy (more permissive than required/desired) or breaking functionality (or in some cases just silencing the denials via dontaudit rules but this can hide genuine attacks). This version of the patch incorporates comments from Casey Schaufler (change/replace existing ptrace_may_attach interface, pass access mode), and Chris Wright (provide greater consistency in the checking). Note that like their predecessors __ptrace_may_attach and ptrace_may_attach, the __ptrace_may_access and ptrace_may_access interfaces use different return value conventions from each other (0 or -errno vs. 1 or 0). 
I retained this difference to avoid any changes to the caller logic but made the difference clearer by changing the latter interface to return a bool rather than an int and by adding a comment about it to ptrace.h for any future callers. Signed-off-by: Stephen Smalley Acked-by: Chris Wright Signed-off-by: James Morris --- fs/proc/base.c | 9 +++++---- fs/proc/task_mmu.c | 6 +++--- fs/proc/task_nommu.c | 2 +- include/linux/ptrace.h | 8 ++++++-- include/linux/security.h | 16 +++++++++++----- kernel/ptrace.c | 15 ++++++++------- security/commoncap.c | 3 ++- security/dummy.c | 3 ++- security/security.c | 5 +++-- security/selinux/hooks.c | 13 +++++++++++-- security/smack/smack_lsm.c | 5 +++-- 11 files changed, 55 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b455371e7f..58c3e6a8e15 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task) */ if (task->parent == current && (task->ptrace & PT_PTRACED) && task_is_stopped_or_traced(task) && - ptrace_may_attach(task)) + ptrace_may_access(task, PTRACE_MODE_ATTACH)) return 0; /* @@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task) task_lock(task); if (task->mm != mm) goto out; - if (task->mm != current->mm && __ptrace_may_attach(task) < 0) + if (task->mm != current->mm && + __ptrace_may_access(task, PTRACE_MODE_READ) < 0) goto out; task_unlock(task); return mm; @@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_attach(task); + allowed = ptrace_may_access(task, PTRACE_MODE_READ); put_task_struct(task); } return allowed; @@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!task) goto out_no_task; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; ret = -ENOMEM; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c492449f3b4..164bd9f9ede 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v) dev_t dev = 0; int len; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; if (file) { @@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out; ret = -EACCES; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_task; ret = -EINVAL; @@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return show_numa_map(m, v); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b4f9cc2f18..5d84e7121df 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return nommu_vma_show(m, vml->vma); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index f98501ba557..c6f5f9dd0ce 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -95,8 +95,12 @@ extern void __ptrace_link(struct task_struct 
*child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void ptrace_untrace(struct task_struct *child); -extern int ptrace_may_attach(struct task_struct *task); -extern int __ptrace_may_attach(struct task_struct *task); +#define PTRACE_MODE_READ 1 +#define PTRACE_MODE_ATTACH 2 +/* Returns 0 on success, -errno on denial. */ +extern int __ptrace_may_access(struct task_struct *task, unsigned int mode); +/* Returns true on success, false on denial. */ +extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { diff --git a/include/linux/security.h b/include/linux/security.h index 50737c70e78..62bd80cb7f8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -46,7 +46,8 @@ struct audit_krule; */ extern int cap_capable(struct task_struct *tsk, int cap); extern int cap_settime(struct timespec *ts, struct timezone *tz); -extern int cap_ptrace(struct task_struct *parent, struct task_struct *child); +extern int cap_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1170,6 +1171,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * attributes would be changed by the execve. * @parent contains the task_struct structure for parent process. * @child contains the task_struct structure for child process. + * @mode contains the PTRACE_MODE flags indicating the form of access. * Return 0 if permission is granted. 
* @capget: * Get the @effective, @inheritable, and @permitted capability sets for @@ -1295,7 +1297,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; - int (*ptrace) (struct task_struct *parent, struct task_struct *child); + int (*ptrace) (struct task_struct *parent, struct task_struct *child, + unsigned int mode); int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1573,7 +1576,8 @@ extern struct dentry *securityfs_create_dir(const char *name, struct dentry *par extern void securityfs_remove(struct dentry *dentry); /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child); +int security_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode); int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, @@ -1755,9 +1759,11 @@ static inline int security_init(void) return 0; } -static inline int security_ptrace(struct task_struct *parent, struct task_struct *child) +static inline int security_ptrace(struct task_struct *parent, + struct task_struct *child, + unsigned int mode) { - return cap_ptrace(parent, child); + return cap_ptrace(parent, child, mode); } static inline int security_capget(struct task_struct *target, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6c19e94fd0a..e337390fce0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -121,7 +121,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } -int __ptrace_may_attach(struct task_struct *task) +int __ptrace_may_access(struct task_struct *task, unsigned int mode) { /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -148,16 +148,16 @@ int __ptrace_may_attach(struct task_struct *task) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task); + return security_ptrace(current, task, mode); } -int ptrace_may_attach(struct task_struct *task) +bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; task_lock(task); - err = __ptrace_may_attach(task); + err = __ptrace_may_access(task, mode); task_unlock(task); - return !err; + return (!err ? true : false); } int ptrace_attach(struct task_struct *task) @@ -195,7 +195,7 @@ repeat: /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = __ptrace_may_attach(task); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); if (retval) goto bad; @@ -494,7 +494,8 @@ int ptrace_traceme(void) */ task_lock(current); if (!(current->ptrace & PT_PTRACED)) { - ret = security_ptrace(current->parent, current); + ret = security_ptrace(current->parent, current, + PTRACE_MODE_ATTACH); /* * Set the ptrace bit in the process ptrace flags. */ diff --git a/security/commoncap.c b/security/commoncap.c index 33d34330841..0b6537a3672 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -63,7 +63,8 @@ int cap_settime(struct timespec *ts, struct timezone *tz) return 0; } -int cap_ptrace (struct task_struct *parent, struct task_struct *child) +int cap_ptrace (struct task_struct *parent, struct task_struct *child, + unsigned int mode) { /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. 
*/ if (!cap_issubset(child->cap_permitted, parent->cap_permitted) && diff --git a/security/dummy.c b/security/dummy.c index b8916883b77..1db712d99dc 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -30,7 +30,8 @@ #include #include -static int dummy_ptrace (struct task_struct *parent, struct task_struct *child) +static int dummy_ptrace (struct task_struct *parent, struct task_struct *child, + unsigned int mode) { return 0; } diff --git a/security/security.c b/security/security.c index 59838a99b80..c4507ce2a5a 100644 --- a/security/security.c +++ b/security/security.c @@ -161,9 +161,10 @@ int mod_reg_security(const char *name, struct security_operations *ops) /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child) +int security_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode) { - return security_ops->ptrace(parent, child); + return security_ops->ptrace(parent, child, mode); } int security_capget(struct task_struct *target, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index eca70f42e67..4be156334b2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1686,14 +1686,23 @@ static inline u32 file_to_av(struct file *file) /* Hook functions begin here. */ -static int selinux_ptrace(struct task_struct *parent, struct task_struct *child) +static int selinux_ptrace(struct task_struct *parent, + struct task_struct *child, + unsigned int mode) { int rc; - rc = secondary_ops->ptrace(parent, child); + rc = secondary_ops->ptrace(parent, child, mode); if (rc) return rc; + if (mode == PTRACE_MODE_READ) { + struct task_security_struct *tsec = parent->security; + struct task_security_struct *csec = child->security; + return avc_has_perm(tsec->sid, csec->sid, + SECCLASS_FILE, FILE__READ, NULL); + } + return task_has_perm(parent, child, PROCESS__PTRACE); } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4a09293efa0..3c7150b3493 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -95,11 +95,12 @@ struct inode_smack *new_inode_smack(char *smack) * * Do the capability checks, and require read and write. */ -static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp) +static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp, + unsigned int mode) { int rc; - rc = cap_ptrace(ptp, ctp); + rc = cap_ptrace(ptp, ctp, mode); if (rc != 0) return rc; -- cgit v1.2.3-70-g09d2 From 2069f457848f846cb31149c9aa29b330a6b66d1b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 4 Jul 2008 09:47:13 +1000 Subject: LSM/SELinux: show LSM mount options in /proc/mounts This patch causes SELinux mount options to show up in /proc/mounts. As with other code in the area seq_put errors are ignored. Other LSM's will not have their mount options displayed until they fill in their own security_sb_show_options() function. 
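For illustration only (not part of this patch): a minimal sb_show_options hook for another LSM would follow the same shape as the SELinux implementation added below. The "foo" names and the per-superblock state are invented for the example; a non-zero return simply propagates back through show_sb_opts().

static int foo_sb_show_options(struct seq_file *m, struct super_block *sb)
{
	/* hypothetical per-superblock security blob set up at mount time */
	struct foo_sb_security *sbsec = sb->s_security;

	/* each option is preceded by a comma, matching show_sb_opts() output */
	if (sbsec && sbsec->labeled)
		seq_puts(m, ",foo_labeled");

	return 0;
}

Such a hook would then be registered through the new .sb_show_options member of struct security_operations introduced here.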
Signed-off-by: Eric Paris Signed-off-by: Miklos Szeredi Signed-off-by: James Morris --- fs/namespace.c | 14 +++++++++--- include/linux/security.h | 9 ++++++++ security/dummy.c | 6 ++++++ security/security.c | 5 +++++ security/selinux/hooks.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 85 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 4fc302c2a0e..4f6f7635b59 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -750,7 +750,7 @@ struct proc_fs_info { const char *str; }; -static void show_sb_opts(struct seq_file *m, struct super_block *sb) +static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, @@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb) if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + return security_sb_show_options(m, sb); } static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) @@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - show_sb_opts(m, mnt->mnt_sb); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); +out: return err; } @@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v) seq_putc(m, ' '); mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); - show_sb_opts(m, sb); + err = show_sb_opts(m, sb); + if (err) + goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt); seq_putc(m, '\n'); +out: return err; } diff --git a/include/linux/security.h b/include/linux/security.h index 62bd80cb7f8..c8ad8ec684b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -80,6 +80,7 @@ struct xfrm_selector; struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; +struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); @@ -1331,6 +1332,7 @@ struct security_operations { void (*sb_free_security) (struct super_block *sb); int (*sb_copy_data) (char *orig, char *copy); int (*sb_kern_mount) (struct super_block *sb, void *data); + int (*sb_show_options) (struct seq_file *m, struct super_block *sb); int (*sb_statfs) (struct dentry *dentry); int (*sb_mount) (char *dev_name, struct path *path, char *type, unsigned long flags, void *data); @@ -1610,6 +1612,7 @@ int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); int security_sb_copy_data(char *orig, char *copy); int security_sb_kern_mount(struct super_block *sb, void *data); +int security_sb_show_options(struct seq_file *m, struct super_block *sb); int security_sb_statfs(struct dentry *dentry); int security_sb_mount(char *dev_name, struct path *path, char *type, unsigned long flags, void *data); @@ -1887,6 +1890,12 @@ static inline int security_sb_kern_mount(struct super_block *sb, void *data) return 0; } +static inline int security_sb_show_options(struct seq_file *m, + struct super_block *sb) +{ + return 0; +} + static inline int security_sb_statfs(struct dentry *dentry) { return 0; diff --git a/security/dummy.c b/security/dummy.c index 1db712d99dc..c155f08e9dd 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ 
-194,6 +194,11 @@ static int dummy_sb_kern_mount (struct super_block *sb, void *data) return 0; } +static int dummy_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + return 0; +} + static int dummy_sb_statfs (struct dentry *dentry) { return 0; @@ -1088,6 +1093,7 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, sb_free_security); set_to_dummy_if_null(ops, sb_copy_data); set_to_dummy_if_null(ops, sb_kern_mount); + set_to_dummy_if_null(ops, sb_show_options); set_to_dummy_if_null(ops, sb_statfs); set_to_dummy_if_null(ops, sb_mount); set_to_dummy_if_null(ops, sb_check_sb); diff --git a/security/security.c b/security/security.c index 2c0a5876b93..de74fdccde2 100644 --- a/security/security.c +++ b/security/security.c @@ -292,6 +292,11 @@ int security_sb_kern_mount(struct super_block *sb, void *data) return security_ops->sb_kern_mount(sb, data); } +int security_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + return security_ops->sb_show_options(m, sb); +} + int security_sb_statfs(struct dentry *dentry) { return security_ops->sb_statfs(dentry); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 85f74f66576..33dee83fdd2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -9,7 +9,8 @@ * James Morris * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. - * Copyright (C) 2003 Red Hat, Inc., James Morris + * Copyright (C) 2003-2008 Red Hat, Inc., James Morris + * Eric Paris * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. @@ -970,6 +971,57 @@ out_err: return rc; } +void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts) +{ + int i; + char *prefix; + + for (i = 0; i < opts->num_mnt_opts; i++) { + char *has_comma = strchr(opts->mnt_opts[i], ','); + + switch (opts->mnt_opts_flags[i]) { + case CONTEXT_MNT: + prefix = CONTEXT_STR; + break; + case FSCONTEXT_MNT: + prefix = FSCONTEXT_STR; + break; + case ROOTCONTEXT_MNT: + prefix = ROOTCONTEXT_STR; + break; + case DEFCONTEXT_MNT: + prefix = DEFCONTEXT_STR; + break; + default: + BUG(); + }; + /* we need a comma before each option */ + seq_putc(m, ','); + seq_puts(m, prefix); + if (has_comma) + seq_putc(m, '\"'); + seq_puts(m, opts->mnt_opts[i]); + if (has_comma) + seq_putc(m, '\"'); + } +} + +static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + struct security_mnt_opts opts; + int rc; + + rc = selinux_get_mnt_opts(sb, &opts); + if (rc) + return rc; + + selinux_write_opts(m, &opts); + + security_free_mnt_opts(&opts); + + return rc; +} + static inline u16 inode_mode_to_security_class(umode_t mode) { switch (mode & S_IFMT) { @@ -5365,6 +5417,7 @@ static struct security_operations selinux_ops = { .sb_free_security = selinux_sb_free_security, .sb_copy_data = selinux_sb_copy_data, .sb_kern_mount = selinux_sb_kern_mount, + .sb_show_options = selinux_sb_show_options, .sb_statfs = selinux_sb_statfs, .sb_mount = selinux_mount, .sb_umount = selinux_umount, -- cgit v1.2.3-70-g09d2 From ae8547b0a9e5d718ce272ddc48f91703a0f52a0b Mon Sep 17 00:00:00 2001 From: Hans Reiser Date: Wed, 7 May 2008 15:48:57 +0300 Subject: VFS: move inode_lock into sync_sb_inodes This patch makes 'sync_sb_inodes()' lock 'inode_lock', rather than expect that the caller will do this. This change was previously done by Hans Reiser and sat in the -mm tree. 
Signed-off-by: Artem Bityutskiy --- fs/fs-writeback.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae45f77765c..16519fe1399 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -424,8 +424,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so * that it can be located for waiting on in __writeback_single_inode(). * - * Called under inode_lock. - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. For other superblocks, @@ -446,6 +444,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&sb->s_io)) queue_io(sb, wbc->older_than_this); @@ -524,6 +523,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) if (!list_empty(&sb->s_more_io)) wbc->more_io = 1; } + spin_unlock(&inode_lock); return; /* Leave any unwritten inodes on s_io */ } @@ -565,11 +565,8 @@ restart: * be unmounted by the time it is released. */ if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) { - spin_lock(&inode_lock); + if (sb->s_root) sync_sb_inodes(sb, wbc); - spin_unlock(&inode_lock); - } up_read(&sb->s_umount); } spin_lock(&sb_lock); @@ -607,9 +604,7 @@ void sync_inodes_sb(struct super_block *sb, int wait) (inodes_stat.nr_inodes - inodes_stat.nr_unused) + nr_dirty + nr_unstable; wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ - spin_lock(&inode_lock); sync_sb_inodes(sb, &wbc); - spin_unlock(&inode_lock); } /* -- cgit v1.2.3-70-g09d2 From 4ee6afd34409d296782a5b667d7991b1050e910a Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 7 May 2008 21:01:30 +0300 Subject: VFS: export sync_sb_inodes This patch exports the 'sync_sb_inodes()' which is needed for UBIFS because it has to force write-back from time to time. Namely, the UBIFS budgeting subsystem forces write-back when its pessimistic callculations show that there is no free space on the media. Signed-off-by: Artem Bityutskiy --- fs/fs-writeback.c | 11 +++++++++-- include/linux/fs.h | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 16519fe1399..25adfc3c693 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -439,8 +439,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * on the writer throttling path, and we get decent balancing between many * throttled threads: we don't want them all piling up on inode_sync_wait. */ -static void -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) +void generic_sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ @@ -526,6 +526,13 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) spin_unlock(&inode_lock); return; /* Leave any unwritten inodes on s_io */ } +EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); + +static void sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc) +{ + generic_sync_sb_inodes(sb, wbc); +} /* * Start writeback of dirty pagecache data against all unlocked inodes. 
diff --git a/include/linux/fs.h b/include/linux/fs.h index d8e2762ed14..f9d2aab47ed 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1729,6 +1729,8 @@ static inline void invalidate_remote_inode(struct inode *inode) extern int invalidate_inode_pages2(struct address_space *mapping); extern int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); +extern void generic_sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc); extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); -- cgit v1.2.3-70-g09d2 From 40be492fe4fab829951681860c2bb26fa1d5fe4a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 14 Jul 2008 20:13:50 +0200 Subject: [Bluetooth] Export details about authentication requirements With the Simple Pairing support, the authentication requirements are an explicit setting during the bonding process. Track and enforce the requirements and allow higher layers like L2CAP and RFCOMM to increase them if needed. This patch introduces a new IOCTL that allows to query the current authentication requirements. It is also possible to detect Simple Pairing support in the kernel this way. Signed-off-by: Marcel Holtmann --- fs/compat_ioctl.c | 1 + include/net/bluetooth/hci.h | 18 ++++++++++++++---- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_conn.c | 38 ++++++++++++++++++++++++++++++++++---- net/bluetooth/hci_sock.c | 18 +++++------------- net/bluetooth/l2cap.c | 14 ++++++++++---- net/bluetooth/rfcomm/core.c | 3 ++- 7 files changed, 68 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 97dba0d9234..d63d430d3b2 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2399,6 +2399,7 @@ COMPATIBLE_IOCTL(HCIGETDEVLIST) COMPATIBLE_IOCTL(HCIGETDEVINFO) COMPATIBLE_IOCTL(HCIGETCONNLIST) COMPATIBLE_IOCTL(HCIGETCONNINFO) +COMPATIBLE_IOCTL(HCIGETAUTHINFO) COMPATIBLE_IOCTL(HCISETRAW) COMPATIBLE_IOCTL(HCISETSCAN) COMPATIBLE_IOCTL(HCISETAUTH) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 55576e84882..3cc29491931 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -72,8 +72,6 @@ enum { HCI_INQUIRY, HCI_RAW, - - HCI_SECMGR }; /* HCI ioctl defines */ @@ -86,6 +84,7 @@ enum { #define HCIGETDEVINFO _IOR('H', 211, int) #define HCIGETCONNLIST _IOR('H', 212, int) #define HCIGETCONNINFO _IOR('H', 213, int) +#define HCIGETAUTHINFO _IOR('H', 215, int) #define HCISETRAW _IOW('H', 220, int) #define HCISETSCAN _IOW('H', 221, int) @@ -97,8 +96,6 @@ enum { #define HCISETACLMTU _IOW('H', 227, int) #define HCISETSCOMTU _IOW('H', 228, int) -#define HCISETSECMGR _IOW('H', 230, int) - #define HCIINQUIRY _IOR('H', 240, int) /* HCI timeouts */ @@ -203,6 +200,14 @@ enum { #define HCI_LM_RELIABLE 0x0010 #define HCI_LM_SECURE 0x0020 +/* Authentication types */ +#define HCI_AT_NO_BONDING 0x00 +#define HCI_AT_NO_BONDING_MITM 0x01 +#define HCI_AT_DEDICATED_BONDING 0x02 +#define HCI_AT_DEDICATED_BONDING_MITM 0x03 +#define HCI_AT_GENERAL_BONDING 0x04 +#define HCI_AT_GENERAL_BONDING_MITM 0x05 + /* ----- HCI Commands ---- */ #define HCI_OP_INQUIRY 0x0401 struct hci_cp_inquiry { @@ -1001,6 +1006,11 @@ struct hci_conn_info_req { struct hci_conn_info conn_info[0]; }; +struct hci_auth_info_req { + bdaddr_t bdaddr; + __u8 type; +}; + struct hci_inquiry_req { __u16 dev_id; __u16 flags; diff --git a/include/net/bluetooth/hci_core.h 
b/include/net/bluetooth/hci_core.h index 28fbd0caa53..cbf75109468 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -168,6 +168,7 @@ struct hci_conn { __u16 pkt_type; __u16 link_policy; __u32 link_mode; + __u8 auth_type; __u8 power_save; unsigned long pend; @@ -422,6 +423,7 @@ int hci_get_dev_list(void __user *arg); int hci_get_dev_info(void __user *arg); int hci_get_conn_list(void __user *arg); int hci_get_conn_info(struct hci_dev *hdev, void __user *arg); +int hci_get_auth_info(struct hci_dev *hdev, void __user *arg); int hci_inquiry(void __user *arg); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 6f22533e765..0d4b8aeb8e0 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -379,13 +379,21 @@ int hci_conn_auth(struct hci_conn *conn) { BT_DBG("conn %p", conn); + if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) { + if (!(conn->auth_type & 0x01)) { + conn->auth_type = HCI_AT_GENERAL_BONDING_MITM; + conn->link_mode &= ~HCI_LM_AUTH; + } + } + if (conn->link_mode & HCI_LM_AUTH) return 1; if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_auth_requested cp; cp.handle = cpu_to_le16(conn->handle); - hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); + hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, + sizeof(cp), &cp); } return 0; } @@ -397,7 +405,7 @@ int hci_conn_encrypt(struct hci_conn *conn) BT_DBG("conn %p", conn); if (conn->link_mode & HCI_LM_ENCRYPT) - return 1; + return hci_conn_auth(conn); if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) return 0; @@ -406,7 +414,8 @@ int hci_conn_encrypt(struct hci_conn *conn) struct hci_cp_set_conn_encrypt cp; cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 1; - hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), &cp); + hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, + sizeof(cp), &cp); } return 0; } @@ -420,7 +429,8 @@ int hci_conn_change_link_key(struct hci_conn *conn) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_change_conn_link_key cp; cp.handle = cpu_to_le16(conn->handle); - hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY, sizeof(cp), &cp); + hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY, + sizeof(cp), &cp); } return 0; } @@ -624,3 +634,23 @@ int hci_get_conn_info(struct hci_dev *hdev, void __user *arg) return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0; } + +int hci_get_auth_info(struct hci_dev *hdev, void __user *arg) +{ + struct hci_auth_info_req req; + struct hci_conn *conn; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + hci_dev_lock_bh(hdev); + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr); + if (conn) + req.type = conn->auth_type; + hci_dev_unlock_bh(hdev); + + if (!conn) + return -ENOENT; + + return copy_to_user(arg, &req, sizeof(req)) ? 
-EFAULT : 0; +} diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 747fabd735d..d62579b6795 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -193,19 +193,11 @@ static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsign return 0; - case HCISETSECMGR: - if (!capable(CAP_NET_ADMIN)) - return -EACCES; - - if (arg) - set_bit(HCI_SECMGR, &hdev->flags); - else - clear_bit(HCI_SECMGR, &hdev->flags); - - return 0; - case HCIGETCONNINFO: - return hci_get_conn_info(hdev, (void __user *)arg); + return hci_get_conn_info(hdev, (void __user *) arg); + + case HCIGETAUTHINFO: + return hci_get_auth_info(hdev, (void __user *) arg); default: if (hdev->ioctl) @@ -217,7 +209,7 @@ static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsign static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; - void __user *argp = (void __user *)arg; + void __user *argp = (void __user *) arg; int err; BT_DBG("cmd %x arg %lx", cmd, arg); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 252264062f5..30ad59b717d 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -2150,7 +2150,7 @@ static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason) static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) { struct l2cap_chan_list *l; - struct l2cap_conn *conn = conn = hcon->l2cap_data; + struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_conn_rsp rsp; struct sock *sk; int result; @@ -2165,11 +2165,17 @@ static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) read_lock(&l->lock); for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { + struct l2cap_pinfo *pi = l2cap_pi(sk); + bh_lock_sock(sk); - if (sk->sk_state != BT_CONNECT2 || - (l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) || - (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) { + if (sk->sk_state != BT_CONNECT2) { + bh_unlock_sock(sk); + continue; + } + + if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) && + !(hcon->link_mode & HCI_LM_ENCRYPT)) { bh_unlock_sock(sk); continue; } diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index e7a6a03cea3..e56bcfc35a4 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1969,7 +1969,8 @@ static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status) list_for_each_safe(p, n, &s->dlcs) { d = list_entry(p, struct rfcomm_dlc, list); - if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) + if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) && + !(conn->link_mode & HCI_LM_ENCRYPT) && !status) continue; if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) -- cgit v1.2.3-70-g09d2 From 254ae43ab8d7877c980fca3636624e0777a70fa4 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Wed, 28 May 2008 14:45:10 +0900 Subject: dlm: check for null in device_write If `device_write' method is called via "dlm-control", file->private_data is NULL. (See ctl_device_open() in user.c. ) Through proc->flags is read. Signed-off-by: Masatake YAMATO Signed-off-by: David Teigland --- fs/dlm/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ebbcf38fd33..1aa76b32d05 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -538,7 +538,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, /* do we really need this? can a write happen after a close? 
*/ if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && - test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) + (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) return -EINVAL; sigfillset(&allsigs); -- cgit v1.2.3-70-g09d2 From 329fc4c37212588091b64bdf09afaeb18642aae2 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 20 May 2008 12:18:10 -0500 Subject: dlm: fix basts for granted CW waiting PR/CW The fix in commit 3650925893469ccb03dbcc6a440c5d363350f591 was addressing the case of a granted PR lock with waiting PR and CW locks. It's a special case that requires forcing a CW bast. However, that forced CW bast was incorrectly applying to a second condition where the granted lock was CW. So, the holder of a CW lock could receive an extraneous CW bast instead of a PR bast. This fix narrows the original special case to what was intended. Signed-off-by: David Teigland --- fs/dlm/lock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 2d3d1027ce2..7ba9586a094 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1782,7 +1782,8 @@ static void grant_pending_locks(struct dlm_rsb *r) list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { - if (cw && high == DLM_LOCK_PR) + if (cw && high == DLM_LOCK_PR && + lkb->lkb_grmode == DLM_LOCK_PR) queue_bast(r, lkb, DLM_LOCK_CW); else queue_bast(r, lkb, high); -- cgit v1.2.3-70-g09d2 From 311f6fc77c51926dbdfbeab0a5d88d70f01fa3f4 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Fri, 27 Jun 2008 08:35:03 -0500 Subject: dlm: release socket on error It seems that `sock' allocated by sock_create_kern in tcp_connect_to_sock() of dlm/fs/lowcomms.c is not released if dlm_nodeid_to_addr an error. Acked-by: Christine Caulfield Signed-off-by: Masatake YAMATO Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 637018c891e..3962262f991 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -891,8 +891,10 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - if (dlm_nodeid_to_addr(con->nodeid, &saddr)) + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { + sock_release(sock); goto out_err; + } sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; -- cgit v1.2.3-70-g09d2 From 18c60c0a3b16fc7d6a55497a228602ad8509f838 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 30 Jun 2008 19:59:14 +0300 Subject: dlm: fix uninitialized variable for search_rsb_list callers gcc 4.3.0 correctly emits the following warning. search_rsb_list does not *r_ret if no dlm_rsb is found and _search_rsb may pass the uninitialized value upstream on the error path when both calls to search_rsb_list return non-zero error. The fix sets *r_ret to NULL on search_rsb_list's not-found path. 
Signed-off-by: Benny Halevy Signed-off-by: David Teigland --- fs/dlm/lock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 7ba9586a094..724ddac9153 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -363,6 +363,7 @@ static int search_rsb_list(struct list_head *head, char *name, int len, if (len == r->res_length && !memcmp(name, r->res_name, len)) goto found; } + *r_ret = NULL; return -EBADR; found: -- cgit v1.2.3-70-g09d2 From ce7231e92dac381f6e4f9cfdfdf9e0ea055223ad Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 13 May 2008 13:45:14 -0700 Subject: [PATCH 1/2] ocfs2: Add CONFIG_OCFS2_FS_STATS config option This patch adds config option CONFIG_OCFS2_FS_STATS to allow building the fs with instrumentation enabled. An upcoming patch will provide support to instrument cluster locking, which is a crucial overhead in a cluster file system. This config option allows users to avoid the cpu and memory overhead that is involved in gathering such statistics. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2694648cbd1..9c07a4f49e6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -470,6 +470,14 @@ config OCFS2_FS_USERSPACE_CLUSTER It is safe to say Y, as the clustering method is run-time selectable. +config OCFS2_FS_STATS + bool "OCFS2 statistics" + depends on OCFS2_FS + default y + help + This option allows some fs statistics to be captured. Enabling + this option may increase the memory consumption. + config OCFS2_DEBUG_MASKLOG bool "OCFS2 logging support" depends on OCFS2_FS -- cgit v1.2.3-70-g09d2 From 8ddb7b004dfa1832a750e199df8bff4b75b73565 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 13 May 2008 13:45:15 -0700 Subject: [PATCH 2/2] ocfs2: Instrument fs cluster locks This patch adds code to track the number of times the fs takes various cluster locks as well as the times associated with it. The information is made available to users via debugfs. This patch was originally written by Jan Kara . 
Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ocfs2/ocfs2.h | 12 ++++++ 2 files changed, 133 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 80e20d9f278..80537b769e4 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -31,6 +31,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_DLM_GLUE #include @@ -59,6 +60,9 @@ struct ocfs2_mask_waiter { struct completion mw_complete; unsigned long mw_mask; unsigned long mw_goal; +#ifdef CONFIG_OCFS2_FS_STATS + unsigned long long mw_lock_start; +#endif }; static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); @@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) spin_unlock(&ocfs2_dlm_tracking_lock); } +#ifdef CONFIG_OCFS2_FS_STATS +static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ + res->l_lock_num_prmode = 0; + res->l_lock_num_prmode_failed = 0; + res->l_lock_total_prmode = 0; + res->l_lock_max_prmode = 0; + res->l_lock_num_exmode = 0; + res->l_lock_num_exmode_failed = 0; + res->l_lock_total_exmode = 0; + res->l_lock_max_exmode = 0; + res->l_lock_refresh = 0; +} + +static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, + struct ocfs2_mask_waiter *mw, int ret) +{ + unsigned long long *num, *sum; + unsigned int *max, *failed; + struct timespec ts = current_kernel_time(); + unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start; + + if (level == LKM_PRMODE) { + num = &res->l_lock_num_prmode; + sum = &res->l_lock_total_prmode; + max = &res->l_lock_max_prmode; + failed = &res->l_lock_num_prmode_failed; + } else if (level == LKM_EXMODE) { + num = &res->l_lock_num_exmode; + sum = &res->l_lock_total_exmode; + max = &res->l_lock_max_exmode; + failed = &res->l_lock_num_exmode_failed; + } else + return; + + (*num)++; + (*sum) += time; + if (time > *max) + *max = time; + if (ret) + (*failed)++; +} + +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ + lockres->l_lock_refresh++; +} + +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ + struct timespec ts = current_kernel_time(); + mw->mw_lock_start = timespec_to_ns(&ts); +} +#else +static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) +{ +} +static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, + int level, struct ocfs2_mask_waiter *mw, int ret) +{ +} +static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) +{ +} +static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) +{ +} +#endif + static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, struct ocfs2_lock_res *res, enum ocfs2_lock_type type, @@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, res->l_flags = OCFS2_LOCK_INITIALIZED; ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); + + ocfs2_init_lock_stats(res); } void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) @@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) { INIT_LIST_HEAD(&mw->mw_item); init_completion(&mw->mw_complete); + ocfs2_init_start_time(mw); } static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) @@ -1254,6 +1330,7 @@ out: goto again; mlog_errno(ret); } + ocfs2_update_lock_stats(lockres, level, &mw, ret); mlog_exit(ret); return ret; @@ -1983,6 +2060,7 @@ static int ocfs2_inode_lock_update(struct inode 
*inode, le32_to_cpu(fe->i_flags)); ocfs2_refresh_inode(inode, fe); + ocfs2_track_lock_refresh(lockres); } status = 0; @@ -2267,6 +2345,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); + ocfs2_track_lock_refresh(lockres); } bail: mlog_exit(status); @@ -2461,7 +2540,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) } /* So that debugfs.ocfs2 can determine which format is being used */ -#define OCFS2_DLM_DEBUG_STR_VERSION 1 +#define OCFS2_DLM_DEBUG_STR_VERSION 2 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) { int i; @@ -2502,6 +2581,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) for(i = 0; i < DLM_LVB_LEN; i++) seq_printf(m, "0x%x\t", lvb[i]); +#ifdef CONFIG_OCFS2_FS_STATS +# define lock_num_prmode(_l) (_l)->l_lock_num_prmode +# define lock_num_exmode(_l) (_l)->l_lock_num_exmode +# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed +# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed +# define lock_total_prmode(_l) (_l)->l_lock_total_prmode +# define lock_total_exmode(_l) (_l)->l_lock_total_exmode +# define lock_max_prmode(_l) (_l)->l_lock_max_prmode +# define lock_max_exmode(_l) (_l)->l_lock_max_exmode +# define lock_refresh(_l) (_l)->l_lock_refresh +#else +# define lock_num_prmode(_l) (0) +# define lock_num_exmode(_l) (0) +# define lock_num_prmode_failed(_l) (0) +# define lock_num_exmode_failed(_l) (0) +# define lock_total_prmode(_l) (0) +# define lock_total_exmode(_l) (0) +# define lock_max_prmode(_l) (0) +# define lock_max_exmode(_l) (0) +# define lock_refresh(_l) (0) +#endif + /* The following seq_print was added in version 2 of this output */ + seq_printf(m, "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%llu\t" + "%llu\t" + "%u\t" + "%u\t" + "%u\t", + lock_num_prmode(lockres), + lock_num_exmode(lockres), + lock_num_prmode_failed(lockres), + lock_num_exmode_failed(lockres), + lock_total_prmode(lockres), + lock_total_exmode(lockres), + lock_max_prmode(lockres), + lock_max_exmode(lockres), + lock_refresh(lockres)); + /* End the line */ seq_printf(m, "\n"); return 0; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 31692379c17..1cb814be8ef 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -132,6 +132,18 @@ struct ocfs2_lock_res { wait_queue_head_t l_event; struct list_head l_debug_list; + +#ifdef CONFIG_OCFS2_FS_STATS + unsigned long long l_lock_num_prmode; /* PR acquires */ + unsigned long long l_lock_num_exmode; /* EX acquires */ + unsigned int l_lock_num_prmode_failed; /* Failed PR gets */ + unsigned int l_lock_num_exmode_failed; /* Failed EX gets */ + unsigned long long l_lock_total_prmode; /* Tot wait for PR */ + unsigned long long l_lock_total_exmode; /* Tot wait for EX */ + unsigned int l_lock_max_prmode; /* Max wait for PR */ + unsigned int l_lock_max_exmode; /* Max wait for EX */ + unsigned int l_lock_refresh; /* Disk refreshes */ +#endif }; struct ocfs2_dlm_debug { -- cgit v1.2.3-70-g09d2 From dd25e55ea133b14678cfaa9e205b082b24b26dbc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 May 2008 14:41:00 -0700 Subject: ocfs2: fix printk format warnings with OCFS2_FS_STATS=n Fix printk format warnings when OCFS2_FS_STATS=n: linux-next-20080528/fs/ocfs2/dlmglue.c: In function 'ocfs2_dlm_seq_show': linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 3 has type 'int' linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but 
argument 4 has type 'int' linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 7 has type 'int' linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 8 has type 'int' Signed-off-by: Randy Dunlap Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 80537b769e4..eae3d643a5e 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2592,12 +2592,12 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_exmode(_l) (_l)->l_lock_max_exmode # define lock_refresh(_l) (_l)->l_lock_refresh #else -# define lock_num_prmode(_l) (0) -# define lock_num_exmode(_l) (0) +# define lock_num_prmode(_l) (0ULL) +# define lock_num_exmode(_l) (0ULL) # define lock_num_prmode_failed(_l) (0) # define lock_num_exmode_failed(_l) (0) -# define lock_total_prmode(_l) (0) -# define lock_total_exmode(_l) (0) +# define lock_total_prmode(_l) (0ULL) +# define lock_total_exmode(_l) (0ULL) # define lock_max_prmode(_l) (0) # define lock_max_exmode(_l) (0) # define lock_refresh(_l) (0) -- cgit v1.2.3-70-g09d2 From 7600c72b75bab374ad39b2a4799a0728579a8e2f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 9 Jun 2008 16:34:23 -0700 Subject: ocfs2: use simple_read_from_buffer() Signed-off-by: Akinobu Mita Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Mark Fasheh --- fs/ocfs2/stack_user.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index c021280dd46..24e0b19c8b6 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -549,26 +549,17 @@ static ssize_t ocfs2_control_read(struct file *file, size_t count, loff_t *ppos) { - char *proto_string = OCFS2_CONTROL_PROTO; - size_t to_write = 0; - - if (*ppos >= OCFS2_CONTROL_PROTO_LEN) - return 0; - - to_write = OCFS2_CONTROL_PROTO_LEN - *ppos; - if (to_write > count) - to_write = count; - if (copy_to_user(buf, proto_string + *ppos, to_write)) - return -EFAULT; + ssize_t ret; - *ppos += to_write; + ret = simple_read_from_buffer(buf, count, ppos, + OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN); /* Have we read the whole protocol list? */ - if (*ppos >= OCFS2_CONTROL_PROTO_LEN) + if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN) ocfs2_control_set_handshake_state(file, OCFS2_CONTROL_HANDSHAKE_READ); - return to_write; + return ret; } static int ocfs2_control_release(struct inode *inode, struct file *file) -- cgit v1.2.3-70-g09d2 From 56753bd3b9220f6f2477eb1cf97f40c24e0a4c91 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 9 Jun 2008 11:24:41 -0700 Subject: ocfs2: Silence an error message in ocfs2_file_aio_read() This patch silences an EINVAL error message in ocfs2_file_aio_read() that is always due to a user error. 
Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 57e0d30cde9..e8514e8b6ce 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2202,7 +2202,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); if (ret == -EINVAL) - mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); + mlog(0, "generic_file_aio_read returned -EINVAL\n"); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); -- cgit v1.2.3-70-g09d2 From 01af482037d32c215aab208a0b110ffe6fd782c0 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 10 Jun 2008 14:24:48 +0800 Subject: ocfs2: Handle error during journal load This patch ensures the mount fails if the fs is unable to load the journal. Signed-off-by: Wengang Wang Acked-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index df63ba20ae9..ccecfe5094f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1703,7 +1703,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) local = ocfs2_mount_local(osb); /* will play back anything left in the journal. */ - ocfs2_journal_load(osb->journal, local); + status = ocfs2_journal_load(osb->journal, local); + if (status < 0) { + mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); + goto finally; + } if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ -- cgit v1.2.3-70-g09d2 From 461c6a30eca6f25add1dadb9fd8a1d8e89a6e627 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 19 May 2008 16:23:37 -0700 Subject: ocfs2/net: Silence build warnings on sparc64 suseconds_t is type long on most arches except sparc64 where it is type int. This patch silences the following warnings that are generated when building on it. 
netdebug.c: In function 'nst_seq_show': netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 13 has type 'suseconds_t' netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 15 has type 'suseconds_t' netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 17 has type 'suseconds_t' netdebug.c: In function 'sc_seq_show': netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 19 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 21 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 23 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 25 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 27 has type 'suseconds_t' netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 29 has type 'suseconds_t' Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/netdebug.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7bf3c0ea7bd..d8bfa0eb41b 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -146,8 +146,10 @@ static int nst_seq_show(struct seq_file *seq, void *v) nst->st_task->comm, nst->st_node, nst->st_sc, nst->st_id, nst->st_msg_type, nst->st_msg_key, - nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, - nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, + nst->st_sock_time.tv_sec, + (unsigned long)nst->st_sock_time.tv_usec, + nst->st_send_time.tv_sec, + (unsigned long)nst->st_send_time.tv_usec, nst->st_status_time.tv_sec, nst->st_status_time.tv_usec); } @@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec +#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec static int sc_seq_show(struct seq_file *seq, void *v) { -- cgit v1.2.3-70-g09d2 From e407e39783a7206d20b3e9961aedf272de966e31 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 12 Jun 2008 22:35:39 -0700 Subject: ocfs2: Fix CONFIG_OCFS2_DEBUG_FS #ifdefs A couple places use OCFS2_DEBUG_FS where they really mean CONFIG_OCFS2_DEBUG_FS. Reported-by: Robert P. J. 
Day Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 2 +- fs/ocfs2/localalloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9698338adc3..a8c19cb3cfd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -329,7 +329,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); -#ifdef OCFS2_DEBUG_FS +#ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else status = journal_extend(handle, nblocks); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index be774bdc8b3..28e492e4ec8 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -498,7 +498,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; -#ifdef OCFS2_DEBUG_FS +#ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " -- cgit v1.2.3-70-g09d2 From fe9f387740ac7cb3b7c2fffa76807e997e6c6292 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 12 Jun 2008 22:39:18 -0700 Subject: ocfs2: Don't snprintf() without a format. Some system files are per-slot. Their names include the slot number. ocfs2_sprintf_system_inode_name() uses the system inode definitions to fill in the slot number with snprintf(). For global system files, there is no node number, and the name was printed as a format with no arguments. -Wformat-nonliteral and -Wformat-security don't like this. Instead, use a static "%s" format and the name as the argument. Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 52c42666515..3f194517762 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -901,7 +901,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, * list has a copy per slot. */ if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) - chars = snprintf(buf, len, + chars = snprintf(buf, len, "%s", ocfs2_system_inodes[type].si_name); else chars = snprintf(buf, len, -- cgit v1.2.3-70-g09d2 From 6f61076406251626be39651d114fac412b1e0c39 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:00:58 +0200 Subject: configfs: Introduce configfs_dirent_lock This patch introduces configfs_dirent_lock spinlock to protect configfs_dirent traversals against linkage mutations (add/del/move). This will allow configfs_detach_prep() to avoid locking i_mutexes. Locking rules for configfs_dirent linkage mutations are the same plus the requirement of taking configfs_dirent_lock. For configfs_dirent walking, one can either take appropriate i_mutex as before, or take configfs_dirent_lock. The spinlock could actually be a mutex, but the critical sections are either O(1) or should not be too long (default groups walking in last patch). ChangeLog: - Clarify the comment on configfs_dirent_lock usage - Move sd->s_element init before linking the new dirent - In lseek(), do not release configfs_dirent_lock before the dirent is relinked. 
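A sketch of the resulting locking rules (illustration only, not part of the patch; surrounding declarations assumed): mutators of configfs_dirent linkage take the proper i_mutex first and configfs_dirent_lock second, while a walker that cannot take the i_mutex may rely on the spinlock alone and must not sleep under it.

	/* mutation of linkage: both locks, i_mutex outermost */
	mutex_lock(&parent->d_inode->i_mutex);
	spin_lock(&configfs_dirent_lock);
	list_add(&sd->s_sibling, &parent_sd->s_children);
	spin_unlock(&configfs_dirent_lock);
	mutex_unlock(&parent->d_inode->i_mutex);

	/* traversal without i_mutex: the spinlock alone suffices */
	spin_lock(&configfs_dirent_lock);
	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
		/* inspect sd; no sleeping allocations or I/O here */
	}
	spin_unlock(&configfs_dirent_lock);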
Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/configfs_internal.h | 3 +++ fs/configfs/dir.c | 28 +++++++++++++++++++++++++++- fs/configfs/inode.c | 2 ++ fs/configfs/symlink.c | 2 ++ 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index cca98609aa7..5a33b58e66d 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -26,6 +26,7 @@ #include #include +#include struct configfs_dirent { atomic_t s_count; @@ -49,6 +50,8 @@ struct configfs_dirent { #define CONFIGFS_USET_DROPPING 0x0100 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) +extern spinlock_t configfs_dirent_lock; + extern struct vfsmount * configfs_mount; extern struct kmem_cache *configfs_dir_cachep; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index a48dc7dd876..2619f485bc3 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -35,6 +35,14 @@ #include "configfs_internal.h" DECLARE_RWSEM(configfs_rename_sem); +/* + * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Mutators of configfs_dirent linkage must *both* have the proper inode locked + * and configfs_dirent_lock locked, in that order. + * This allows one to safely traverse configfs_dirent trees without having to + * lock inodes. + */ +DEFINE_SPINLOCK(configfs_dirent_lock); static void configfs_d_iput(struct dentry * dentry, struct inode * inode) @@ -79,8 +87,10 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare atomic_set(&sd->s_count, 1); INIT_LIST_HEAD(&sd->s_links); INIT_LIST_HEAD(&sd->s_children); - list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; + spin_lock(&configfs_dirent_lock); + list_add(&sd->s_sibling, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); return sd; } @@ -173,7 +183,9 @@ static int create_dir(struct config_item * k, struct dentry * p, } else { struct configfs_dirent *sd = d->d_fsdata; if (sd) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } } @@ -224,7 +236,9 @@ int configfs_create_link(struct configfs_symlink *sl, else { struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } } @@ -238,7 +252,9 @@ static void remove_dir(struct dentry * d) struct configfs_dirent * sd; sd = d->d_fsdata; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_put(sd); if (d->d_inode) simple_rmdir(parent->d_inode,d); @@ -410,7 +426,9 @@ static void detach_attrs(struct config_item * item) list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) continue; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dentry); configfs_put(sd); } @@ -1268,7 +1286,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct configfs_dirent * cursor = file->private_data; mutex_lock(&dentry->d_inode->i_mutex); + spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); + spin_unlock(&configfs_dirent_lock); mutex_unlock(&dentry->d_inode->i_mutex); release_configfs_dirent(cursor); @@ -1308,7 +1328,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* 
fallthrough */ default: if (filp->f_pos == 2) { + spin_lock(&configfs_dirent_lock); list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1331,7 +1353,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; + spin_lock(&configfs_dirent_lock); list_move(q, p); + spin_unlock(&configfs_dirent_lock); p = q; filp->f_pos++; } @@ -1362,6 +1386,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) struct list_head *p; loff_t n = file->f_pos - 2; + spin_lock(&configfs_dirent_lock); list_del(&cursor->s_sibling); p = sd->s_children.next; while (n && p != &sd->s_children) { @@ -1373,6 +1398,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) p = p->next; } list_add_tail(&cursor->s_sibling, p); + spin_unlock(&configfs_dirent_lock); } } mutex_unlock(&dentry->d_inode->i_mutex); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index b9a1d810346..4803ccc9448 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) if (!sd->s_element) continue; if (!strcmp(configfs_get_name(sd), name)) { + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dir); configfs_put(sd); break; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 2a731ef5f30..676c84c416d 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -169,7 +169,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; + spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dentry->d_parent); dput(dentry); configfs_put(sd); -- cgit v1.2.3-70-g09d2 From 5301a77da2da1e4c22573e0e8d394a653b8ad9f9 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:00:59 +0200 Subject: configfs: Protect configfs_dirent s_links list mutations Symlinks to a config_item are listed under its configfs_dirent s_links, but the list mutations are not protected by any common lock. This patch uses the configfs_dirent_lock spinlock to add the necessary protection. Note: we should also protect the list_empty() test in configfs_detach_prep() but 1/ the lock should not be released immediately because nothing would prevent the list from being filled after a successful list_empty() test, making the problem tricky, 2/ this will be solved by the rmdir() vs rename() deadlock bugfix. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/dir.c | 5 +++-- fs/configfs/symlink.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 2619f485bc3..a08e5c2f25e 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -37,10 +37,11 @@ DECLARE_RWSEM(configfs_rename_sem); /* * Protects mutations of configfs_dirent linkage together with proper i_mutex + * Also protects mutations of symlinks linkage to target configfs_dirent * Mutators of configfs_dirent linkage must *both* have the proper inode locked * and configfs_dirent_lock locked, in that order. - * This allows one to safely traverse configfs_dirent trees without having to - * lock inodes. 
+ * This allows one to safely traverse configfs_dirent trees and symlinks without + * having to lock inodes. */ DEFINE_SPINLOCK(configfs_dirent_lock); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 676c84c416d..faeb4417a10 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -77,12 +77,15 @@ static int create_link(struct config_item *parent_item, sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { sl->sl_target = config_item_get(item); - /* FIXME: needs a lock, I'd bet */ + spin_lock(&configfs_dirent_lock); list_add(&sl->sl_list, &target_sd->s_links); + spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, dentry); if (ret) { + spin_lock(&configfs_dirent_lock); list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); config_item_put(item); kfree(sl); } @@ -186,8 +189,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) type->ct_item_ops->drop_link(parent_item, sl->sl_target); - /* FIXME: Needs lock */ + spin_lock(&configfs_dirent_lock); list_del_init(&sl->sl_list); + spin_unlock(&configfs_dirent_lock); /* Put reference from create_link() */ config_item_put(sl->sl_target); -- cgit v1.2.3-70-g09d2 From 107ed40bd070df5e4a0a012042c45c40963dc574 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:01:00 +0200 Subject: configfs: Make configfs_new_dirent() return error code instead of NULL This patch makes configfs_new_dirent return negative error code instead of NULL, which will be useful in the next patch to differentiate ENOMEM from ENOENT. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index a08e5c2f25e..918a332babf 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "configfs_internal.h" @@ -83,7 +84,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) - return NULL; + return ERR_PTR(-ENOMEM); atomic_set(&sd->s_count, 1); INIT_LIST_HEAD(&sd->s_links); @@ -129,8 +130,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd, struct configfs_dirent * sd; sd = configfs_new_dirent(parent_sd, element); - if (!sd) - return -ENOMEM; + if (IS_ERR(sd)) + return PTR_ERR(sd); sd->s_mode = mode; sd->s_type = type; @@ -1277,7 +1278,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) file->private_data = configfs_new_dirent(parent_sd, NULL); mutex_unlock(&dentry->d_inode->i_mutex); - return file->private_data ? 0 : -ENOMEM; + return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0; } -- cgit v1.2.3-70-g09d2 From b3e76af87441fc36eef3516d73ab2314e7b2d911 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:01:01 +0200 Subject: configfs: Fix deadlock with racing rmdir() and rename() This patch fixes the deadlock between racing sys_rename() and configfs_rmdir(). The idea is to avoid locking i_mutexes of default groups in configfs_detach_prep(), and rely instead on the new configfs_dirent_lock to protect against configfs_dirent's linkage mutations. To ensure that an mkdir() racing with rmdir() will not create new items in a to-be-removed default group, we make configfs_new_dirent() check for the CONFIGFS_USET_DROPPING flag right before linking the new dirent, and return error if the flag is set. 
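In sketch form (condensed from the configfs_new_dirent() hunk further down; illustrative only), the check is:

	spin_lock(&configfs_dirent_lock);
	if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
		/* parent default group is being removed: refuse new children */
		spin_unlock(&configfs_dirent_lock);
		kmem_cache_free(configfs_dir_cachep, sd);
		return ERR_PTR(-ENOENT);
	}
	list_add(&sd->s_sibling, &parent_sd->s_children);
	spin_unlock(&configfs_dirent_lock);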
This makes racing mkdir()/symlink()/dir_open() fail in places where errors could already happen, resp. in (attach_item()|attach_group())/create_link()/new_dirent(). configfs_depend() remains safe since it locks all the path from configfs root, and is thus mutually exclusive with rmdir(). An advantage of this is that now detach_groups() unconditionnaly takes the default groups i_mutex, which makes it more consistent with populate_groups(). Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/dir.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 918a332babf..d5b5985716b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -43,6 +43,10 @@ DECLARE_RWSEM(configfs_rename_sem); * and configfs_dirent_lock locked, in that order. * This allows one to safely traverse configfs_dirent trees and symlinks without * having to lock inodes. + * + * Protects setting of CONFIGFS_USET_DROPPING: checking the flag + * unlocked is not reliable unless in detach_groups() called from + * rmdir()/unregister() and from configfs_attach_group() */ DEFINE_SPINLOCK(configfs_dirent_lock); @@ -91,6 +95,11 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare INIT_LIST_HEAD(&sd->s_children); sd->s_element = element; spin_lock(&configfs_dirent_lock); + if (parent_sd->s_type & CONFIGFS_USET_DROPPING) { + spin_unlock(&configfs_dirent_lock); + kmem_cache_free(configfs_dir_cachep, sd); + return ERR_PTR(-ENOENT); + } list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); @@ -349,11 +358,11 @@ static struct dentry * configfs_lookup(struct inode *dir, /* * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are - * attributes and are removed by rmdir(). We recurse, taking i_mutex - * on all children that are candidates for default detach. If the - * result is clean, then configfs_detach_group() will handle dropping - * i_mutex. If there is an error, the caller will clean up the i_mutex - * holders via configfs_detach_rollback(). + * attributes and are removed by rmdir(). We recurse, setting + * CONFIGFS_USET_DROPPING on all children that are candidates for + * default detach. + * If there is an error, the caller will reset the flags via + * configfs_detach_rollback(). */ static int configfs_detach_prep(struct dentry *dentry) { @@ -370,8 +379,7 @@ static int configfs_detach_prep(struct dentry *dentry) if (sd->s_type & CONFIGFS_NOT_PINNED) continue; if (sd->s_type & CONFIGFS_USET_DEFAULT) { - mutex_lock(&sd->s_dentry->d_inode->i_mutex); - /* Mark that we've taken i_mutex */ + /* Mark that we're trying to drop the group */ sd->s_type |= CONFIGFS_USET_DROPPING; /* @@ -392,7 +400,7 @@ out: } /* - * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is + * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was * set. 
*/ static void configfs_detach_rollback(struct dentry *dentry) @@ -403,11 +411,7 @@ static void configfs_detach_rollback(struct dentry *dentry) list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_type & CONFIGFS_USET_DEFAULT) { configfs_detach_rollback(sd->s_dentry); - - if (sd->s_type & CONFIGFS_USET_DROPPING) { - sd->s_type &= ~CONFIGFS_USET_DROPPING; - mutex_unlock(&sd->s_dentry->d_inode->i_mutex); - } + sd->s_type &= ~CONFIGFS_USET_DROPPING; } } } @@ -486,16 +490,12 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; + mutex_lock(&child->d_inode->i_mutex); + configfs_detach_group(sd->s_element); child->d_inode->i_flags |= S_DEAD; - /* - * From rmdir/unregister, a configfs_detach_prep() pass - * has taken our i_mutex for us. Drop it. - * From mkdir/register cleanup, there is no sem held. - */ - if (sd->s_type & CONFIGFS_USET_DROPPING) - mutex_unlock(&child->d_inode->i_mutex); + mutex_unlock(&child->d_inode->i_mutex); d_delete(child); dput(child); @@ -1181,12 +1181,15 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) return -EINVAL; } + spin_lock(&configfs_dirent_lock); ret = configfs_detach_prep(dentry); if (ret) { configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); config_item_put(parent_item); return ret; } + spin_unlock(&configfs_dirent_lock); /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); @@ -1476,9 +1479,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry)) { printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); } + spin_unlock(&configfs_dirent_lock); configfs_detach_group(&group->cg_item); dentry->d_inode->i_flags |= S_DEAD; mutex_unlock(&dentry->d_inode->i_mutex); -- cgit v1.2.3-70-g09d2 From 6d8344baee99402de58b5fa5dfea197242955c15 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Mon, 16 Jun 2008 19:01:02 +0200 Subject: configfs: Fix failing mkdir() making racing rmdir() fail When fixing the rename() vs rmdir() deadlock, we stopped locking default groups' inodes in configfs_detach_prep(), letting racing mkdir() in default groups proceed concurrently. This enables races like below happen, which leads to a failing mkdir() making rmdir() fail, despite the group to remove having no user-created directory under it in the end. process A: process B: /* PWD=A/B */ mkdir("C") make_item("C") attach_group("C") rmdir("A") detach_prep("A") detach_prep("B") error because of "C" return -ENOTEMPTY attach_group("C/D") error (eg -ENOMEM) return -ENOMEM This patch prevents such scenarii by making rmdir() wait as long as detach_prep() fails because a racing mkdir() is in the middle of attach_group(). To achieve this, mkdir() sets a flag CONFIGFS_USET_IN_MKDIR in parent's configfs_dirent before calling attach_group(), and clears the flag once attach_group() is done. detach_prep() fails with -EAGAIN whenever the flag is hit and returns the guilty inode's mutex so that rmdir() can wait on it. 
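Condensed into a sketch (illustrative only and simplified; the real hunks below also hold configfs_dirent_lock around configfs_detach_prep() and roll back the flags on failure), the two sides of the handshake are:

	/* mkdir() side: mark the window in which attach is in progress */
	spin_lock(&configfs_dirent_lock);
	sd->s_type |= CONFIGFS_USET_IN_MKDIR;
	spin_unlock(&configfs_dirent_lock);

	ret = configfs_attach_group(parent_item, item, dentry);

	spin_lock(&configfs_dirent_lock);
	sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
	spin_unlock(&configfs_dirent_lock);

	/* rmdir() side: retry until no racing mkdir() is in flight */
	do {
		ret = configfs_detach_prep(dentry, &wait_mutex);
		if (ret == -EAGAIN) {
			/* wait for the racing mkdir() to terminate */
			mutex_lock(wait_mutex);
			mutex_unlock(wait_mutex);
		}
	} while (ret == -EAGAIN);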
Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/configfs_internal.h | 1 + fs/configfs/dir.c | 53 +++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 5a33b58e66d..da015c12e3e 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -48,6 +48,7 @@ struct configfs_dirent { #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) extern spinlock_t configfs_dirent_lock; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index d5b5985716b..614e382a604 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -364,7 +364,7 @@ static struct dentry * configfs_lookup(struct inode *dir, * If there is an error, the caller will reset the flags via * configfs_detach_rollback(). */ -static int configfs_detach_prep(struct dentry *dentry) +static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex) { struct configfs_dirent *parent_sd = dentry->d_fsdata; struct configfs_dirent *sd; @@ -379,6 +379,12 @@ static int configfs_detach_prep(struct dentry *dentry) if (sd->s_type & CONFIGFS_NOT_PINNED) continue; if (sd->s_type & CONFIGFS_USET_DEFAULT) { + /* Abort if racing with mkdir() */ + if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { + if (wait_mutex) + *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + return -EAGAIN; + } /* Mark that we're trying to drop the group */ sd->s_type |= CONFIGFS_USET_DROPPING; @@ -386,7 +392,7 @@ static int configfs_detach_prep(struct dentry *dentry) * Yup, recursive. If there's a problem, blame * deep nesting of default_groups */ - ret = configfs_detach_prep(sd->s_dentry); + ret = configfs_detach_prep(sd->s_dentry, wait_mutex); if (!ret) continue; } else @@ -1113,11 +1119,26 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) */ module_got = 1; + /* + * Make racing rmdir() fail if it did not tag parent with + * CONFIGFS_USET_DROPPING + * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will + * fail and let rmdir() terminate correctly + */ + spin_lock(&configfs_dirent_lock); + /* This will make configfs_detach_prep() fail */ + sd->s_type |= CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + if (group) ret = configfs_attach_group(parent_item, item, dentry); else ret = configfs_attach_item(parent_item, item, dentry); + spin_lock(&configfs_dirent_lock); + sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; + spin_unlock(&configfs_dirent_lock); + out_unlink: if (ret) { /* Tear down everything we built up */ @@ -1182,13 +1203,25 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) } spin_lock(&configfs_dirent_lock); - ret = configfs_detach_prep(dentry); - if (ret) { - configfs_detach_rollback(dentry); - spin_unlock(&configfs_dirent_lock); - config_item_put(parent_item); - return ret; - } + do { + struct mutex *wait_mutex; + + ret = configfs_detach_prep(dentry, &wait_mutex); + if (ret) { + configfs_detach_rollback(dentry); + spin_unlock(&configfs_dirent_lock); + if (ret != -EAGAIN) { + config_item_put(parent_item); + return ret; + } + + /* Wait until the racing operation terminates */ + mutex_lock(wait_mutex); + mutex_unlock(wait_mutex); + + spin_lock(&configfs_dirent_lock); + } + } while (ret == -EAGAIN); spin_unlock(&configfs_dirent_lock); /* Get a working ref for the duration of 
this function */ @@ -1480,7 +1513,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) I_MUTEX_PARENT); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); spin_lock(&configfs_dirent_lock); - if (configfs_detach_prep(dentry)) { + if (configfs_detach_prep(dentry, NULL)) { printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); } spin_unlock(&configfs_dirent_lock); -- cgit v1.2.3-70-g09d2 From 11c3b79218390a139f2d474ee1e983a672d5839a Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 12 Jun 2008 14:00:18 -0700 Subject: configfs: Allow ->make_item() and ->make_group() to return detailed errors. The configfs operations ->make_item() and ->make_group() currently return a new item/group. A return of NULL signifies an error. Because of this, -ENOMEM is the only return code bubbled up the stack. Multiple folks have requested the ability to return specific error codes when these operations fail. This patch adds that ability by changing the ->make_item/group() ops to return an int. Also updated are the in-kernel users of configfs. Signed-off-by: Joel Becker --- Documentation/filesystems/configfs/configfs.txt | 10 +++-- .../filesystems/configfs/configfs_example.c | 14 ++++--- drivers/net/netconsole.c | 10 +++-- fs/configfs/dir.c | 13 +++---- fs/dlm/config.c | 45 ++++++++++++++-------- fs/ocfs2/cluster/heartbeat.c | 17 ++++---- fs/ocfs2/cluster/nodemanager.c | 45 ++++++++++++++-------- include/linux/configfs.h | 4 +- 8 files changed, 94 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index 44c97e6accb..15838d706ea 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -233,10 +233,12 @@ accomplished via the group operations specified on the group's config_item_type. struct configfs_group_operations { - struct config_item *(*make_item)(struct config_group *group, - const char *name); - struct config_group *(*make_group)(struct config_group *group, - const char *name); + int (*make_item)(struct config_group *group, + const char *name, + struct config_item **new_item); + int (*make_group)(struct config_group *group, + const char *name, + struct config_group **new_group); int (*commit_item)(struct config_item *item); void (*disconnect_notify)(struct config_group *group, struct config_item *item); diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index 25151fd5c2c..0b422acd470 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -273,13 +273,13 @@ static inline struct simple_children *to_simple_children(struct config_item *ite return item ? 
container_of(to_config_group(item), struct simple_children, group) : NULL; } -static struct config_item *simple_children_make_item(struct config_group *group, const char *name) +static int simple_children_make_item(struct config_group *group, const char *name, struct config_item **new_item) { struct simple_child *simple_child; simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); if (!simple_child) - return NULL; + return -ENOMEM; config_item_init_type_name(&simple_child->item, name, @@ -287,7 +287,8 @@ static struct config_item *simple_children_make_item(struct config_group *group, simple_child->storeme = 0; - return &simple_child->item; + *new_item = &simple_child->item; + return 0; } static struct configfs_attribute simple_children_attr_description = { @@ -359,20 +360,21 @@ static struct configfs_subsystem simple_children_subsys = { * children of its own. */ -static struct config_group *group_children_make_group(struct config_group *group, const char *name) +static int group_children_make_group(struct config_group *group, const char *name, struct config_group **new_group) { struct simple_children *simple_children; simple_children = kzalloc(sizeof(struct simple_children), GFP_KERNEL); if (!simple_children) - return NULL; + return -ENOMEM; config_group_init_type_name(&simple_children->group, name, &simple_children_type); - return &simple_children->group; + *new_group = &simple_children->group; + return 0; } static struct configfs_attribute group_children_attr_description = { diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 665341e4305..387a1339501 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -585,8 +585,9 @@ static struct config_item_type netconsole_target_type = { * Group operations and type for netconsole_subsys. 
*/ -static struct config_item *make_netconsole_target(struct config_group *group, - const char *name) +static int make_netconsole_target(struct config_group *group, + const char *name, + struct config_item **new_item) { unsigned long flags; struct netconsole_target *nt; @@ -598,7 +599,7 @@ static struct config_item *make_netconsole_target(struct config_group *group, nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) { printk(KERN_ERR "netconsole: failed to allocate memory\n"); - return NULL; + return -ENOMEM; } nt->np.name = "netconsole"; @@ -615,7 +616,8 @@ static struct config_item *make_netconsole_target(struct config_group *group, list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); - return &nt->item; + *new_item = &nt->item; + return 0; } static void drop_netconsole_target(struct config_group *group, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 614e382a604..0e64312a084 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1073,25 +1073,24 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) group = NULL; item = NULL; if (type->ct_group_ops->make_group) { - group = type->ct_group_ops->make_group(to_config_group(parent_item), name); - if (group) { + ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group); + if (!ret) { link_group(to_config_group(parent_item), group); item = &group->cg_item; } } else { - item = type->ct_group_ops->make_item(to_config_group(parent_item), name); - if (item) + ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item); + if (!ret) link_obj(parent_item, item); } mutex_unlock(&subsys->su_mutex); kfree(name); - if (!item) { + if (ret) { /* - * If item == NULL, then link_obj() was never called. + * If ret != 0, then link_obj() was never called. * There are no extra references to clean up. */ - ret = -ENOMEM; goto out_put; } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index eac23bd288b..492d8caaaf2 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -41,16 +41,20 @@ struct comm; struct nodes; struct node; -static struct config_group *make_cluster(struct config_group *, const char *); +static int make_cluster(struct config_group *, const char *, + struct config_group **); static void drop_cluster(struct config_group *, struct config_item *); static void release_cluster(struct config_item *); -static struct config_group *make_space(struct config_group *, const char *); +static int make_space(struct config_group *, const char *, + struct config_group **); static void drop_space(struct config_group *, struct config_item *); static void release_space(struct config_item *); -static struct config_item *make_comm(struct config_group *, const char *); +static int make_comm(struct config_group *, const char *, + struct config_item **); static void drop_comm(struct config_group *, struct config_item *); static void release_comm(struct config_item *); -static struct config_item *make_node(struct config_group *, const char *); +static int make_node(struct config_group *, const char *, + struct config_item **); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); @@ -392,8 +396,8 @@ static struct node *to_node(struct config_item *i) return i ? 
container_of(i, struct node, item) : NULL; } -static struct config_group *make_cluster(struct config_group *g, - const char *name) +static int make_cluster(struct config_group *g, const char *name, + struct config_group **new_g) { struct cluster *cl = NULL; struct spaces *sps = NULL; @@ -431,14 +435,15 @@ static struct config_group *make_cluster(struct config_group *g, space_list = &sps->ss_group; comm_list = &cms->cs_group; - return &cl->group; + *new_g = &cl->group; + return 0; fail: kfree(cl); kfree(gps); kfree(sps); kfree(cms); - return NULL; + return -ENOMEM; } static void drop_cluster(struct config_group *g, struct config_item *i) @@ -466,7 +471,8 @@ static void release_cluster(struct config_item *i) kfree(cl); } -static struct config_group *make_space(struct config_group *g, const char *name) +static int make_space(struct config_group *g, const char *name, + struct config_group **new_g) { struct space *sp = NULL; struct nodes *nds = NULL; @@ -489,13 +495,14 @@ static struct config_group *make_space(struct config_group *g, const char *name) INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; - return &sp->group; + *new_g = &sp->group; + return 0; fail: kfree(sp); kfree(gps); kfree(nds); - return NULL; + return -ENOMEM; } static void drop_space(struct config_group *g, struct config_item *i) @@ -522,19 +529,21 @@ static void release_space(struct config_item *i) kfree(sp); } -static struct config_item *make_comm(struct config_group *g, const char *name) +static int make_comm(struct config_group *g, const char *name, + struct config_item **new_i) { struct comm *cm; cm = kzalloc(sizeof(struct comm), GFP_KERNEL); if (!cm) - return NULL; + return -ENOMEM; config_item_init_type_name(&cm->item, name, &comm_type); cm->nodeid = -1; cm->local = 0; cm->addr_count = 0; - return &cm->item; + *new_i = &cm->item; + return 0; } static void drop_comm(struct config_group *g, struct config_item *i) @@ -554,14 +563,15 @@ static void release_comm(struct config_item *i) kfree(cm); } -static struct config_item *make_node(struct config_group *g, const char *name) +static int make_node(struct config_group *g, const char *name, + struct config_item **new_i) { struct space *sp = to_space(g->cg_item.ci_parent); struct node *nd; nd = kzalloc(sizeof(struct node), GFP_KERNEL); if (!nd) - return NULL; + return -ENOMEM; config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = -1; @@ -573,7 +583,8 @@ static struct config_item *make_node(struct config_group *g, const char *name) sp->members_count++; mutex_unlock(&sp->members_lock); - return &nd->item; + *new_i = &nd->item; + return 0; } static void drop_node(struct config_group *g, struct config_item *i) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f02ccb34604..443d108211a 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1489,25 +1489,28 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, - const char *name) +static int o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name, + struct config_item **new_item) { struct o2hb_region *reg = NULL; - struct config_item *ret = NULL; + int ret = 0; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); - if (reg == NULL) - goto out; /* ENOMEM */ + if (reg == NULL) { + ret = -ENOMEM; + goto out; + } config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - ret = 
®->hr_item; + *new_item = ®->hr_item; spin_lock(&o2hb_live_lock); list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); out: - if (ret == NULL) + if (ret) kfree(reg); return ret; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cfdb08b484e..b364b7052e4 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -644,27 +644,32 @@ out: return ret; } -static struct config_item *o2nm_node_group_make_item(struct config_group *group, - const char *name) +static int o2nm_node_group_make_item(struct config_group *group, + const char *name, + struct config_item **new_item) { struct o2nm_node *node = NULL; - struct config_item *ret = NULL; + int ret = 0; - if (strlen(name) > O2NM_MAX_NAME_LEN) - goto out; /* ENAMETOOLONG */ + if (strlen(name) > O2NM_MAX_NAME_LEN) { + ret = -ENAMETOOLONG; + goto out; + } node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); - if (node == NULL) - goto out; /* ENOMEM */ + if (node == NULL) { + ret = -ENOMEM; + goto out; + } strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); spin_lock_init(&node->nd_lock); - ret = &node->nd_item; + *new_item = &node->nd_item; out: - if (ret == NULL) + if (ret) kfree(node); return ret; @@ -751,25 +756,31 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro } #endif -static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, - const char *name) +static int o2nm_cluster_group_make_group(struct config_group *group, + const char *name, + struct config_group **new_group) { struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; - struct config_group *o2hb_group = NULL, *ret = NULL; + struct config_group *o2hb_group = NULL; void *defs = NULL; + int ret = 0; /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ - if (o2nm_single_cluster) - goto out; /* ENOSPC */ + if (o2nm_single_cluster) { + ret = -ENOSPC; + goto out; + } cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) { + ret = -ENOMEM; goto out; + } config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); @@ -786,11 +797,11 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; - ret = &cluster->cl_group; + *new_group = &cluster->cl_group; o2nm_single_cluster = cluster; out: - if (ret == NULL) { + if (ret) { kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 3ae65b1bf90..0488f937634 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -165,8 +165,8 @@ struct configfs_item_operations { }; struct configfs_group_operations { - struct config_item *(*make_item)(struct config_group *group, const char *name); - struct config_group *(*make_group)(struct config_group *group, const char *name); + int (*make_item)(struct config_group *group, const char *name, struct config_item **new_item); + int (*make_group)(struct config_group *group, const char 
*name, struct config_group **new_group); int (*commit_item)(struct config_item *item); void (*disconnect_notify)(struct config_group *group, struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); -- cgit v1.2.3-70-g09d2 From e75206517504461778c283b942440ef312e437d5 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Thu, 12 Jun 2008 17:26:47 +0200 Subject: configfs: call drop_link() to cleanup after create_link() failure When allow_link() succeeds but create_link() fails, the subsystem is not informed of the failure. This patch fixes this by calling drop_link() on create_link() failures. Signed-off-by: Louis Rilling Signed-off-by: Joel Becker --- fs/configfs/symlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index faeb4417a10..0004d18c40a 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -140,8 +140,12 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna goto out_put; ret = type->ct_item_ops->allow_link(parent_item, target_item); - if (!ret) + if (!ret) { ret = create_link(parent_item, target_item, dentry); + if (ret && type->ct_item_ops->drop_link) + type->ct_item_ops->drop_link(parent_item, + target_item); + } config_item_put(target_item); path_put(&nd.path); -- cgit v1.2.3-70-g09d2 From 1e51764a3c2ac05a23a22b2a95ddee4d9bffb16d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 14 Jul 2008 19:08:37 +0300 Subject: UBIFS: add new flash file system This is a new flash file system. See http://www.linux-mtd.infradead.org/doc/ubifs.html Signed-off-by: Artem Bityutskiy Signed-off-by: Adrian Hunter --- fs/ubifs/budget.c | 731 ++++++++++++ fs/ubifs/commit.c | 677 +++++++++++ fs/ubifs/compress.c | 253 +++++ fs/ubifs/debug.c | 2289 +++++++++++++++++++++++++++++++++++++ fs/ubifs/debug.h | 403 +++++++ fs/ubifs/dir.c | 1240 ++++++++++++++++++++ fs/ubifs/file.c | 1275 +++++++++++++++++++++ fs/ubifs/find.c | 975 ++++++++++++++++ fs/ubifs/gc.c | 773 +++++++++++++ fs/ubifs/io.c | 914 +++++++++++++++ fs/ubifs/ioctl.c | 204 ++++ fs/ubifs/journal.c | 1387 +++++++++++++++++++++++ fs/ubifs/key.h | 533 +++++++++ fs/ubifs/log.c | 805 +++++++++++++ fs/ubifs/lprops.c | 1357 ++++++++++++++++++++++ fs/ubifs/lpt.c | 2243 ++++++++++++++++++++++++++++++++++++ fs/ubifs/lpt_commit.c | 1648 +++++++++++++++++++++++++++ fs/ubifs/master.c | 387 +++++++ fs/ubifs/misc.h | 342 ++++++ fs/ubifs/orphan.c | 958 ++++++++++++++++ fs/ubifs/recovery.c | 1519 +++++++++++++++++++++++++ fs/ubifs/replay.c | 1075 ++++++++++++++++++ fs/ubifs/sb.c | 629 +++++++++++ fs/ubifs/scan.c | 362 ++++++ fs/ubifs/shrinker.c | 322 ++++++ fs/ubifs/super.c | 1951 ++++++++++++++++++++++++++++++++ fs/ubifs/tnc.c | 2956 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/ubifs/tnc_commit.c | 1103 ++++++++++++++++++ fs/ubifs/tnc_misc.c | 494 ++++++++ fs/ubifs/ubifs-media.h | 745 ++++++++++++ fs/ubifs/ubifs.h | 1649 +++++++++++++++++++++++++++ fs/ubifs/xattr.c | 581 ++++++++++ 32 files changed, 32780 insertions(+) create mode 100644 fs/ubifs/budget.c create mode 100644 fs/ubifs/commit.c create mode 100644 fs/ubifs/compress.c create mode 100644 fs/ubifs/debug.c create mode 100644 fs/ubifs/debug.h create mode 100644 fs/ubifs/dir.c create mode 100644 fs/ubifs/file.c create mode 100644 fs/ubifs/find.c create mode 100644 fs/ubifs/gc.c create mode 100644 fs/ubifs/io.c create mode 100644 fs/ubifs/ioctl.c create mode 100644 fs/ubifs/journal.c create mode 100644 fs/ubifs/key.h 
create mode 100644 fs/ubifs/log.c create mode 100644 fs/ubifs/lprops.c create mode 100644 fs/ubifs/lpt.c create mode 100644 fs/ubifs/lpt_commit.c create mode 100644 fs/ubifs/master.c create mode 100644 fs/ubifs/misc.h create mode 100644 fs/ubifs/orphan.c create mode 100644 fs/ubifs/recovery.c create mode 100644 fs/ubifs/replay.c create mode 100644 fs/ubifs/sb.c create mode 100644 fs/ubifs/scan.c create mode 100644 fs/ubifs/shrinker.c create mode 100644 fs/ubifs/super.c create mode 100644 fs/ubifs/tnc.c create mode 100644 fs/ubifs/tnc_commit.c create mode 100644 fs/ubifs/tnc_misc.c create mode 100644 fs/ubifs/ubifs-media.h create mode 100644 fs/ubifs/ubifs.h create mode 100644 fs/ubifs/xattr.c (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c new file mode 100644 index 00000000000..d81fb9ed2b8 --- /dev/null +++ b/fs/ubifs/budget.c @@ -0,0 +1,731 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the budgeting sub-system which is responsible for UBIFS + * space management. + * + * Factors such as compression, wasted space at the ends of LEBs, space in other + * journal heads, the effect of updates on the index, and so on, make it + * impossible to accurately predict the amount of space needed. Consequently + * approximations are used. + */ + +#include "ubifs.h" +#include +#include + +/* + * When pessimistic budget calculations say that there is no enough space, + * UBIFS starts writing back dirty inodes and pages, doing garbage collection, + * or committing. The below constants define maximum number of times UBIFS + * repeats the operations. + */ +#define MAX_SHRINK_RETRIES 8 +#define MAX_GC_RETRIES 4 +#define MAX_CMT_RETRIES 2 +#define MAX_NOSPC_RETRIES 1 + +/* + * The below constant defines amount of dirty pages which should be written + * back at when trying to shrink the liability. + */ +#define NR_TO_WRITE 16 + +/** + * struct retries_info - information about re-tries while making free space. + * @prev_liability: previous liability + * @shrink_cnt: how many times the liability was shrinked + * @shrink_retries: count of liability shrink re-tries (increased when + * liability does not shrink) + * @try_gc: GC should be tried first + * @gc_retries: how many times GC was run + * @cmt_retries: how many times commit has been done + * @nospc_retries: how many times GC returned %-ENOSPC + * + * Since we consider budgeting to be the fast-path, and this structure has to + * be allocated on stack and zeroed out, we make it smaller using bit-fields. 
+ */ +struct retries_info { + long long prev_liability; + unsigned int shrink_cnt; + unsigned int shrink_retries:5; + unsigned int try_gc:1; + unsigned int gc_retries:4; + unsigned int cmt_retries:3; + unsigned int nospc_retries:1; +}; + +/** + * shrink_liability - write-back some dirty pages/inodes. + * @c: UBIFS file-system description object + * @nr_to_write: how many dirty pages to write-back + * + * This function shrinks UBIFS liability by means of writing back some amount + * of dirty inodes and their pages. Returns the amount of pages which were + * written back. The returned value does not include dirty inodes which were + * synchronized. + * + * Note, this function synchronizes even VFS inodes which are locked + * (@i_mutex) by the caller of the budgeting function, because write-back does + * not touch @i_mutex. + */ +static int shrink_liability(struct ubifs_info *c, int nr_to_write) +{ + int nr_written; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .range_end = LLONG_MAX, + .nr_to_write = nr_to_write, + }; + + generic_sync_sb_inodes(c->vfs_sb, &wbc); + nr_written = nr_to_write - wbc.nr_to_write; + + if (!nr_written) { + /* + * Re-try again but wait on pages/inodes which are being + * written-back concurrently (e.g., by pdflush). + */ + memset(&wbc, 0, sizeof(struct writeback_control)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.range_end = LLONG_MAX; + wbc.nr_to_write = nr_to_write; + generic_sync_sb_inodes(c->vfs_sb, &wbc); + nr_written = nr_to_write - wbc.nr_to_write; + } + + dbg_budg("%d pages were written back", nr_written); + return nr_written; +} + + +/** + * run_gc - run garbage collector. + * @c: UBIFS file-system description object + * + * This function runs garbage collector to make some more free space. Returns + * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a + * negative error code in case of failure. + */ +static int run_gc(struct ubifs_info *c) +{ + int err, lnum; + + /* Make some free space by garbage-collecting dirty space */ + down_read(&c->commit_sem); + lnum = ubifs_garbage_collect(c, 1); + up_read(&c->commit_sem); + if (lnum < 0) + return lnum; + + /* GC freed one LEB, return it to lprops */ + dbg_budg("GC freed LEB %d", lnum); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + return 0; +} + +/** + * make_free_space - make more free space on the file-system. + * @c: UBIFS file-system description object + * @ri: information about previous invocations of this function + * + * This function is called when an operation cannot be budgeted because there + * is supposedly no free space. But in most cases there is some free space: + * o budgeting is pessimistic, so it always budgets more then it is actually + * needed, so shrinking the liability is one way to make free space - the + * cached data will take less space then it was budgeted for; + * o GC may turn some dark space into free space (budgeting treats dark space + * as not available); + * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs. + * + * So this function tries to do the above. Returns %-EAGAIN if some free space + * was presumably made and the caller has to re-try budgeting the operation. + * Returns %-ENOSPC if it couldn't do more free space, and other negative error + * codes on failures. 
+ */ +static int make_free_space(struct ubifs_info *c, struct retries_info *ri) +{ + int err; + + /* + * If we have some dirty pages and inodes (liability), try to write + * them back unless this was tried too many times without effect + * already. + */ + if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) { + long long liability; + + spin_lock(&c->space_lock); + liability = c->budg_idx_growth + c->budg_data_growth + + c->budg_dd_growth; + spin_unlock(&c->space_lock); + + if (ri->prev_liability >= liability) { + /* Liability does not shrink, next time try GC then */ + ri->shrink_retries += 1; + if (ri->gc_retries < MAX_GC_RETRIES) + ri->try_gc = 1; + dbg_budg("liability did not shrink: retries %d of %d", + ri->shrink_retries, MAX_SHRINK_RETRIES); + } + + dbg_budg("force write-back (count %d)", ri->shrink_cnt); + shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); + + ri->prev_liability = liability; + ri->shrink_cnt += 1; + return -EAGAIN; + } + + /* + * Try to run garbage collector unless it was already tried too many + * times. + */ + if (ri->gc_retries < MAX_GC_RETRIES) { + ri->gc_retries += 1; + dbg_budg("run GC, retries %d of %d", + ri->gc_retries, MAX_GC_RETRIES); + + ri->try_gc = 0; + err = run_gc(c); + if (!err) + return -EAGAIN; + + if (err == -EAGAIN) { + dbg_budg("GC asked to commit"); + err = ubifs_run_commit(c); + if (err) + return err; + return -EAGAIN; + } + + if (err != -ENOSPC) + return err; + + /* + * GC could not make any progress. If this is the first time, + * then it makes sense to try to commit, because it might make + * some dirty space. + */ + dbg_budg("GC returned -ENOSPC, retries %d", + ri->nospc_retries); + if (ri->nospc_retries >= MAX_NOSPC_RETRIES) + return err; + ri->nospc_retries += 1; + } + + /* Neither GC nor write-back helped, try to commit */ + if (ri->cmt_retries < MAX_CMT_RETRIES) { + ri->cmt_retries += 1; + dbg_budg("run commit, retries %d of %d", + ri->cmt_retries, MAX_CMT_RETRIES); + err = ubifs_run_commit(c); + if (err) + return err; + return -EAGAIN; + } + return -ENOSPC; +} + +/** + * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. + * @c: UBIFS file-system description object + * + * This function calculates and returns the number of eraseblocks which should + * be kept for index usage. + */ +int ubifs_calc_min_idx_lebs(struct ubifs_info *c) +{ + int ret; + uint64_t idx_size; + + idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; + + /* And make sure we have twice the index size of space reserved */ + idx_size <<= 1; + + /* + * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' + * pair, nor similarly the two variables for the new index size, so we + * have to do this costly 64-bit division on fast-path. + */ + if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) + ret = idx_size + 1; + else + ret = idx_size; + /* + * The index head is not available for the in-the-gaps method, so add an + * extra LEB to compensate. + */ + ret += 1; + /* + * At present the index needs at least 2 LEBs: one for the index head + * and one for in-the-gaps method (which currently does not cater for + * the index head and so excludes it from consideration). + */ + if (ret < 2) + ret = 2; + return ret; +} + +/** + * ubifs_calc_available - calculate available FS space. + * @c: UBIFS file-system description object + * @min_idx_lebs: minimum number of LEBs reserved for the index + * + * This function calculates and returns amount of FS space available for use. 
+ */ +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) +{ + int subtract_lebs; + long long available; + + /* + * Force the amount available to the total size reported if the used + * space is zero. + */ + if (c->lst.total_used <= UBIFS_INO_NODE_SZ && + c->budg_data_growth + c->budg_dd_growth == 0) { + /* Do the same calculation as for c->block_cnt */ + available = c->main_lebs - 2; + available *= c->leb_size - c->dark_wm; + return available; + } + + available = c->main_bytes - c->lst.total_used; + + /* + * Now 'available' contains theoretically available flash space + * assuming there is no index, so we have to subtract the space which + * is reserved for the index. + */ + subtract_lebs = min_idx_lebs; + + /* Take into account that GC reserves one LEB for its own needs */ + subtract_lebs += 1; + + /* + * The GC journal head LEB is not really accessible. And since + * different write types go to different heads, we may count only on + * one head's space. + */ + subtract_lebs += c->jhead_cnt - 1; + + /* We also reserve one LEB for deletions, which bypass budgeting */ + subtract_lebs += 1; + + available -= (long long)subtract_lebs * c->leb_size; + + /* Subtract the dead space which is not available for use */ + available -= c->lst.total_dead; + + /* + * Subtract dark space, which might or might not be usable - it depends + * on the data which we have on the media and which will be written. If + * this is a lot of uncompressed or not-compressible data, the dark + * space cannot be used. + */ + available -= c->lst.total_dark; + + /* + * However, there is more dark space. The index may be bigger than + * @min_idx_lebs. Those extra LEBs are assumed to be available, but + * their dark space is not included in total_dark, so it is subtracted + * here. + */ + if (c->lst.idx_lebs > min_idx_lebs) { + subtract_lebs = c->lst.idx_lebs - min_idx_lebs; + available -= subtract_lebs * c->dark_wm; + } + + /* The calculations are rough and may end up with a negative number */ + return available > 0 ? available : 0; +} + +/** + * can_use_rp - check whether the user is allowed to use reserved pool. + * @c: UBIFS file-system description object + * + * UBIFS has so-called "reserved pool" which is flash space reserved + * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock. + * This function checks whether current user is allowed to use reserved pool. + * Returns %1 current user is allowed to use reserved pool and %0 otherwise. + */ +static int can_use_rp(struct ubifs_info *c) +{ + if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) || + (c->rp_gid != 0 && in_group_p(c->rp_gid))) + return 1; + return 0; +} + +/** + * do_budget_space - reserve flash space for index and data growth. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free eraseblocks for index growth + * and data. + * + * When budgeting index space, UBIFS reserves twice as more LEBs as the index + * would take if it was consolidated and written to the flash. This guarantees + * that the "in-the-gaps" commit method always succeeds and UBIFS will always + * be able to commit dirty index. So this function basically adds amount of + * budgeted index space to the size of the current index, multiplies this by 2, + * and makes sure this does not exceed the amount of free eraseblocks. + * + * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: + * o @c->lst.idx_lebs is the number of LEBs the index currently uses. 
It might + * be large, because UBIFS does not do any index consolidation as long as + * there is free space. IOW, the index may take a lot of LEBs, but the LEBs + * will contain a lot of dirt. + * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be + * consolidated to take up to @c->min_idx_lebs LEBs. + * + * This function returns zero in case of success, and %-ENOSPC in case of + * failure. + */ +static int do_budget_space(struct ubifs_info *c) +{ + long long outstanding, available; + int lebs, rsvd_idx_lebs, min_idx_lebs; + + /* First budget index space */ + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* Now 'min_idx_lebs' contains number of LEBs to reserve */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + + /* + * The number of LEBs that are available to be used by the index is: + * + * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - + * @c->lst.taken_empty_lebs + * + * @empty_lebs are available because they are empty. @freeable_cnt are + * available because they contain only free and dirty space and the + * index allocation always occurs after wbufs are synch'ed. + * @idx_gc_cnt are available because they are index LEBs that have been + * garbage collected (including trivial GC) and are awaiting the commit + * before they can be unmapped - note that the in-the-gaps method will + * grab these if it needs them. @taken_empty_lebs are empty_lebs that + * have already been allocated for some purpose (also includes those + * LEBs on the @idx_gc list). + * + * Note, @taken_empty_lebs may temporarily be higher by one because of + * the way we serialize LEB allocations and budgeting. See a comment in + * 'ubifs_find_free_space()'. + */ + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (unlikely(rsvd_idx_lebs > lebs)) { + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " + "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, + rsvd_idx_lebs); + return -ENOSPC; + } + + available = ubifs_calc_available(c, min_idx_lebs); + outstanding = c->budg_data_growth + c->budg_dd_growth; + + if (unlikely(available < outstanding)) { + dbg_budg("out of data space: available %lld, outstanding %lld", + available, outstanding); + return -ENOSPC; + } + + if (available - outstanding <= c->rp_size && !can_use_rp(c)) + return -ENOSPC; + + c->min_idx_lebs = min_idx_lebs; + return 0; +} + +/** + * calc_idx_growth - calculate approximate index growth from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + * + * For now we assume each new node adds one znode. But this is rather poor + * approximation, though. + */ +static int calc_idx_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int znodes; + + znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) + + req->new_dent; + return znodes * c->max_idx_node_sz; +} + +/** + * calc_data_growth - calculate approximate amount of new data from budgeting + * request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_data_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int data_growth; + + data_growth = req->new_ino ? 
c->inode_budget : 0; + if (req->new_page) + data_growth += c->page_budget; + if (req->new_dent) + data_growth += c->dent_budget; + data_growth += req->new_ino_d; + return data_growth; +} + +/** + * calc_dd_growth - calculate approximate amount of data which makes other data + * dirty from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_dd_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int dd_growth; + + dd_growth = req->dirtied_page ? c->page_budget : 0; + + if (req->dirtied_ino) + dd_growth += c->inode_budget << (req->dirtied_ino - 1); + if (req->mod_dent) + dd_growth += c->dent_budget; + dd_growth += req->dirtied_ino_d; + return dd_growth; +} + +/** + * ubifs_budget_space - ensure there is enough space to complete an operation. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function allocates budget for an operation. It uses pessimistic + * approximation of how much flash space the operation needs. The goal of this + * function is to make sure UBIFS always has flash space to flush all dirty + * pages, dirty inodes, and dirty znodes (liability). This function may force + * commit, garbage-collection or write-back. Returns zero in case of success, + * %-ENOSPC if there is no free space and other negative error codes in case of + * failures. + */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); + int err, idx_growth, data_growth, dd_growth; + struct retries_info ri; + + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + + data_growth = calc_data_growth(c, req); + dd_growth = calc_dd_growth(c, req); + if (!data_growth && !dd_growth) + return 0; + idx_growth = calc_idx_growth(c, req); + memset(&ri, 0, sizeof(struct retries_info)); + +again: + spin_lock(&c->space_lock); + ubifs_assert(c->budg_idx_growth >= 0); + ubifs_assert(c->budg_data_growth >= 0); + ubifs_assert(c->budg_dd_growth >= 0); + + if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { + dbg_budg("no space"); + spin_unlock(&c->space_lock); + return -ENOSPC; + } + + c->budg_idx_growth += idx_growth; + c->budg_data_growth += data_growth; + c->budg_dd_growth += dd_growth; + + err = do_budget_space(c); + if (likely(!err)) { + req->idx_growth = idx_growth; + req->data_growth = data_growth; + req->dd_growth = dd_growth; + spin_unlock(&c->space_lock); + return 0; + } + + /* Restore the old values */ + c->budg_idx_growth -= idx_growth; + c->budg_data_growth -= data_growth; + c->budg_dd_growth -= dd_growth; + spin_unlock(&c->space_lock); + + if (req->fast) { + dbg_budg("no space for fast budgeting"); + return err; + } + + err = make_free_space(c, &ri); + if (err == -EAGAIN) { + dbg_budg("try again"); + cond_resched(); + goto again; + } else if (err == -ENOSPC) { + dbg_budg("FS is full, -ENOSPC"); + c->nospace = 1; + if (can_use_rp(c) || c->rp_size == 0) + c->nospace_rp = 1; + smp_wmb(); + } else + ubifs_err("cannot budget space, error %d", err); + return err; +} + +/** + * ubifs_release_budget - release budgeted free space. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function releases the space budgeted by 'ubifs_budget_space()'. 
Note,
+ * since the index changes (which were budgeted for in @req->idx_growth) will
+ * only be written to the media on commit, this function moves the index budget
+ * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
+ * zeroed by the commit operation.
+ */
+void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
+{
+	ubifs_assert(req->dirtied_ino <= 4);
+	ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+	if (!req->recalculate) {
+		ubifs_assert(req->idx_growth >= 0);
+		ubifs_assert(req->data_growth >= 0);
+		ubifs_assert(req->dd_growth >= 0);
+	}
+
+	if (req->recalculate) {
+		req->data_growth = calc_data_growth(c, req);
+		req->dd_growth = calc_dd_growth(c, req);
+		req->idx_growth = calc_idx_growth(c, req);
+	}
+
+	if (!req->data_growth && !req->dd_growth)
+		return;
+
+	c->nospace = c->nospace_rp = 0;
+	smp_wmb();
+
+	spin_lock(&c->space_lock);
+	c->budg_idx_growth -= req->idx_growth;
+	c->budg_uncommitted_idx += req->idx_growth;
+	c->budg_data_growth -= req->data_growth;
+	c->budg_dd_growth -= req->dd_growth;
+	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+	ubifs_assert(c->budg_idx_growth >= 0);
+	ubifs_assert(c->budg_data_growth >= 0);
+	ubifs_assert(c->min_idx_lebs < c->main_lebs);
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_convert_page_budget - convert budget of a new page.
+ * @c: UBIFS file-system description object
+ *
+ * This function converts budget which was allocated for a new page of data to
+ * the budget of changing an existing page of data. The latter is smaller than
+ * the former, so this function only does simple re-calculation and does not
+ * involve any write-back.
+ */
+void ubifs_convert_page_budget(struct ubifs_info *c)
+{
+	spin_lock(&c->space_lock);
+	/* Release the index growth reservation */
+	c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	/* Release the data growth reservation */
+	c->budg_data_growth -= c->page_budget;
+	/* Increase the dirty data growth reservation instead */
+	c->budg_dd_growth += c->page_budget;
+	/* And re-calculate the indexing space reservation */
+	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_release_dirty_inode_budget - release dirty inode budget.
+ * @c: UBIFS file-system description object
+ * @ui: UBIFS inode to release the budget for
+ *
+ * This function releases budget corresponding to a dirty inode. It is usually
+ * called after the inode has been written to the media and marked as
+ * clean.
+ */
+void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
+				      struct ubifs_inode *ui)
+{
+	struct ubifs_budget_req req = {.dd_growth = c->inode_budget,
+				       .dirtied_ino_d = ui->data_len};
+
+	ubifs_release_budget(c, &req);
+}
+
+/**
+ * ubifs_budg_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the amount of free space on the file-system. 
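A small stand-alone model of the counters involved here, tying together the page-budget conversion performed by 'ubifs_convert_page_budget()' and the outstanding liability that is subtracted when free space is reported. All numbers are invented, and the model skips the per-node overhead trimming that 'ubifs_reported_space()' applies in the real code.

#include <stdio.h>

int main(void)
{
	/* hypothetical state, all in bytes */
	long long budg_idx_growth = 24576, budg_data_growth = 8192,
		  budg_dd_growth = 0, available = 6LL * 1024 * 1024;
	const int page_budget = 4096, max_idx_node_sz = 192,
		  blocks_per_page_shift = 0;
	long long outstanding, free_space;

	/* a page budgeted as "new" turns out to dirty an existing page */
	budg_idx_growth  -= max_idx_node_sz << blocks_per_page_shift;
	budg_data_growth -= page_budget;
	budg_dd_growth   += page_budget;

	/* what would then be reported as free space */
	outstanding = budg_data_growth + budg_dd_growth;
	free_space = available > outstanding ? available - outstanding : 0;

	printf("idx %lld, outstanding %lld, free %lld\n",
	       budg_idx_growth, outstanding, free_space);
	return 0;
}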
+ */ +long long ubifs_budg_get_free_space(struct ubifs_info *c) +{ + int min_idx_lebs, rsvd_idx_lebs; + long long available, outstanding, free; + + /* Do exactly the same calculations as in 'do_budget_space()' */ + spin_lock(&c->space_lock); + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + + if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt + - c->lst.taken_empty_lebs) { + spin_unlock(&c->space_lock); + return 0; + } + + available = ubifs_calc_available(c, min_idx_lebs); + outstanding = c->budg_data_growth + c->budg_dd_growth; + c->min_idx_lebs = min_idx_lebs; + spin_unlock(&c->space_lock); + + if (available > outstanding) + free = ubifs_reported_space(c, available - outstanding); + else + free = 0; + return free; +} diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c new file mode 100644 index 00000000000..3b516316c9b --- /dev/null +++ b/fs/ubifs/commit.c @@ -0,0 +1,677 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions that manage the running of the commit process. + * Each affected module has its own functions to accomplish their part in the + * commit and those functions are called here. + * + * The commit is the process whereby all updates to the index and LEB properties + * are written out together and the journal becomes empty. This keeps the + * file system consistent - at all times the state can be recreated by reading + * the index and LEB properties and then replaying the journal. + * + * The commit is split into two parts named "commit start" and "commit end". + * During commit start, the commit process has exclusive access to the journal + * by holding the commit semaphore down for writing. As few I/O operations as + * possible are performed during commit start, instead the nodes that are to be + * written are merely identified. During commit end, the commit semaphore is no + * longer held and the journal is again in operation, allowing users to continue + * to use the file system while the bulk of the commit I/O is performed. The + * purpose of this two-step approach is to prevent the commit from causing any + * latency blips. Note that in any case, the commit does not prevent lookups + * (as permitted by the TNC mutex), or access to VFS data structures e.g. page + * cache. + */ + +#include +#include +#include "ubifs.h" + +/** + * do_commit - commit the journal. + * @c: UBIFS file-system description object + * + * This function implements UBIFS commit. It has to be called with commit lock + * locked. Returns zero in case of success and a negative error code in case of + * failure. 
+ */ +static int do_commit(struct ubifs_info *c) +{ + int err, new_ltail_lnum, old_ltail_lnum, i; + struct ubifs_zbranch zroot; + struct ubifs_lp_stats lst; + + dbg_cmt("start"); + if (c->ro_media) { + err = -EROFS; + goto out_up; + } + + /* Sync all write buffers (necessary for recovery) */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + goto out_up; + } + + err = ubifs_gc_start_commit(c); + if (err) + goto out_up; + err = dbg_check_lprops(c); + if (err) + goto out_up; + err = ubifs_log_start_commit(c, &new_ltail_lnum); + if (err) + goto out_up; + err = ubifs_tnc_start_commit(c, &zroot); + if (err) + goto out_up; + err = ubifs_lpt_start_commit(c); + if (err) + goto out_up; + err = ubifs_orphan_start_commit(c); + if (err) + goto out_up; + + ubifs_get_lp_stats(c, &lst); + + up_write(&c->commit_sem); + + err = ubifs_tnc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_end_commit(c); + if (err) + goto out; + err = ubifs_orphan_end_commit(c); + if (err) + goto out; + old_ltail_lnum = c->ltail_lnum; + err = ubifs_log_end_commit(c, new_ltail_lnum); + if (err) + goto out; + err = dbg_check_old_index(c, &zroot); + if (err) + goto out; + + mutex_lock(&c->mst_mutex); + c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no); + c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); + c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); + c->mst_node->root_offs = cpu_to_le32(zroot.offs); + c->mst_node->root_len = cpu_to_le32(zroot.len); + c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); + c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); + c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); + c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); + c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); + c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); + c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs); + c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum); + c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs); + c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum); + c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs); + c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum); + c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs); + c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs); + c->mst_node->total_free = cpu_to_le64(lst.total_free); + c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty); + c->mst_node->total_used = cpu_to_le64(lst.total_used); + c->mst_node->total_dead = cpu_to_le64(lst.total_dead); + c->mst_node->total_dark = cpu_to_le64(lst.total_dark); + if (c->no_orphs) + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); + err = ubifs_write_master(c); + mutex_unlock(&c->mst_mutex); + if (err) + goto out; + + err = ubifs_log_post_commit(c, old_ltail_lnum); + if (err) + goto out; + err = ubifs_gc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_post_commit(c); + if (err) + goto out; + + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_RESTING; + wake_up(&c->cmt_wq); + dbg_cmt("commit end"); + spin_unlock(&c->cs_lock); + + return 0; + +out_up: + up_write(&c->commit_sem); +out: + ubifs_err("commit failed, error %d", err); + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_BROKEN; + wake_up(&c->cmt_wq); + spin_unlock(&c->cs_lock); + ubifs_ro_mode(c, err); + return err; +} + +/** + * run_bg_commit - run background commit if it is needed. + * @c: UBIFS file-system description object + * + * This function runs background commit if it is needed. 
Returns zero in case + * of success and a negative error code in case of failure. + */ +static int run_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + /* + * Run background commit only if background commit was requested or if + * commit is required. + */ + if (c->cmt_state != COMMIT_BACKGROUND && + c->cmt_state != COMMIT_REQUIRED) + goto out; + spin_unlock(&c->cs_lock); + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_REQUIRED) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + else if (c->cmt_state == COMMIT_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_BACKGROUND; + else + goto out_cmt_unlock; + spin_unlock(&c->cs_lock); + + return do_commit(c); + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return 0; +} + +/** + * ubifs_bg_thread - UBIFS background thread function. + * @info: points to the file-system description object + * + * This function implements various file-system background activities: + * o when a write-buffer timer expires it synchronizes the appropriate + * write-buffer; + * o when the journal is about to be full, it starts in-advance commit. + * + * Note, other stuff like background garbage collection may be added here in + * future. + */ +int ubifs_bg_thread(void *info) +{ + int err; + struct ubifs_info *c = info; + + ubifs_msg("background thread \"%s\" started, PID %d", + c->bgt_name, current->pid); + set_freezable(); + + while (1) { + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + /* Check if there is something to do */ + if (!c->need_bgt) { + /* + * Nothing prevents us from going sleep now and + * be never woken up and block the task which + * could wait in 'kthread_stop()' forever. + */ + if (kthread_should_stop()) + break; + schedule(); + continue; + } else + __set_current_state(TASK_RUNNING); + + c->need_bgt = 0; + err = ubifs_bg_wbufs_sync(c); + if (err) + ubifs_ro_mode(c, err); + + run_bg_commit(c); + cond_resched(); + } + + dbg_msg("background thread \"%s\" stops", c->bgt_name); + return 0; +} + +/** + * ubifs_commit_required - set commit state to "required". + * @c: UBIFS file-system description object + * + * This function is called if a commit is required but cannot be done from the + * calling function, so it is just flagged instead. + */ +void ubifs_commit_required(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + switch (c->cmt_state) { + case COMMIT_RESTING: + case COMMIT_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_REQUIRED)); + c->cmt_state = COMMIT_REQUIRED; + break; + case COMMIT_RUNNING_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_RUNNING_REQUIRED)); + c->cmt_state = COMMIT_RUNNING_REQUIRED; + break; + case COMMIT_REQUIRED: + case COMMIT_RUNNING_REQUIRED: + case COMMIT_BROKEN: + break; + } + spin_unlock(&c->cs_lock); +} + +/** + * ubifs_request_bg_commit - notify the background thread to do a commit. + * @c: UBIFS file-system description object + * + * This function is called if the journal is full enough to make a commit + * worthwhile, so background thread is kicked to start it. 
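The state transitions applied by 'ubifs_commit_required()' above can be written down as a small pure function; the sketch below does exactly that, reusing the same state names but otherwise having no connection to the kernel code.

#include <stdio.h>

enum cmt_state {
	COMMIT_RESTING, COMMIT_BACKGROUND, COMMIT_REQUIRED,
	COMMIT_RUNNING_BACKGROUND, COMMIT_RUNNING_REQUIRED, COMMIT_BROKEN,
};

static enum cmt_state commit_required(enum cmt_state s)
{
	switch (s) {
	case COMMIT_RESTING:
	case COMMIT_BACKGROUND:
		return COMMIT_REQUIRED;		/* schedule a commit */
	case COMMIT_RUNNING_BACKGROUND:
		return COMMIT_RUNNING_REQUIRED;	/* upgrade the running one */
	default:
		return s;			/* already required or broken */
	}
}

int main(void)
{
	printf("%d %d\n", commit_required(COMMIT_RESTING),
	       commit_required(COMMIT_RUNNING_BACKGROUND));
	return 0;
}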
+ */ +void ubifs_request_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_RESTING) { + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_BACKGROUND)); + c->cmt_state = COMMIT_BACKGROUND; + spin_unlock(&c->cs_lock); + ubifs_wake_up_bgt(c); + } else + spin_unlock(&c->cs_lock); +} + +/** + * wait_for_commit - wait for commit. + * @c: UBIFS file-system description object + * + * This function sleeps until the commit operation is no longer running. + */ +static int wait_for_commit(struct ubifs_info *c) +{ + dbg_cmt("pid %d goes sleep", current->pid); + + /* + * The following sleeps if the condition is false, and will be woken + * when the commit ends. It is possible, although very unlikely, that we + * will wake up and see the subsequent commit running, rather than the + * one we were waiting for, and go back to sleep. However, we will be + * woken again, so there is no danger of sleeping forever. + */ + wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND && + c->cmt_state != COMMIT_RUNNING_REQUIRED); + dbg_cmt("commit finished, pid %d woke up", current->pid); + return 0; +} + +/** + * ubifs_run_commit - run or wait for commit. + * @c: UBIFS file-system description object + * + * This function runs commit and returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_run_commit(struct ubifs_info *c) +{ + int err = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + /* + * We set the commit state to 'running required' to indicate + * that we want it to complete as quickly as possible. + */ + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + spin_unlock(&c->cs_lock); + + /* Ok, the commit is indeed needed */ + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + /* + * Since we unlocked 'c->cs_lock', the state may have changed, so + * re-check it. + */ + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out_cmt_unlock; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + up_write(&c->commit_sem); + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + c->cmt_state = COMMIT_RUNNING_REQUIRED; + spin_unlock(&c->cs_lock); + + err = do_commit(c); + return err; + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return err; +} + +/** + * ubifs_gc_should_commit - determine if it is time for GC to run commit. + * @c: UBIFS file-system description object + * + * This function is called by garbage collection to determine if commit should + * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal + * is full enough to start commit, this function returns true. It is not + * absolutely necessary to commit yet, but it feels like this should be better + * then to keep doing GC. This function returns %1 if GC has to initiate commit + * and %0 if not. 
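The choice 'ubifs_run_commit()' above makes before taking the commit semaphore, reduced to a pure function for illustration. The real function re-checks the state again after acquiring the semaphore, which a pure function cannot show; the enum repeats the states from the previous sketch so this fragment stands on its own.

#include <stdio.h>

enum cmt_state {
	COMMIT_RESTING, COMMIT_BACKGROUND, COMMIT_REQUIRED,
	COMMIT_RUNNING_BACKGROUND, COMMIT_RUNNING_REQUIRED, COMMIT_BROKEN,
};

enum run_action { RUN_ERROR, RUN_WAIT, RUN_COMMIT };

static enum run_action run_commit_decision(enum cmt_state s)
{
	if (s == COMMIT_BROKEN)
		return RUN_ERROR;	/* -EINVAL in the real function */
	if (s == COMMIT_RUNNING_BACKGROUND || s == COMMIT_RUNNING_REQUIRED)
		return RUN_WAIT;	/* wait_for_commit() */
	return RUN_COMMIT;		/* take commit_sem and run do_commit() */
}

int main(void)
{
	printf("%d %d %d\n", run_commit_decision(COMMIT_BROKEN),
	       run_commit_decision(COMMIT_RUNNING_BACKGROUND),
	       run_commit_decision(COMMIT_RESTING));
	return 0;
}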
+ */ +int ubifs_gc_should_commit(struct ubifs_info *c) +{ + int ret = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BACKGROUND) { + dbg_cmt("commit required now"); + c->cmt_state = COMMIT_REQUIRED; + } else + dbg_cmt("commit not requested"); + if (c->cmt_state == COMMIT_REQUIRED) + ret = 1; + spin_unlock(&c->cs_lock); + return ret; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * struct idx_node - hold index nodes during index tree traversal. + * @list: list + * @iip: index in parent (slot number of this indexing node in the parent + * indexing node) + * @upper_key: all keys in this indexing node have to be less or equivalent to + * this key + * @idx: index node (8-byte aligned because all node structures must be 8-byte + * aligned) + */ +struct idx_node { + struct list_head list; + int iip; + union ubifs_key upper_key; + struct ubifs_idx_node idx __attribute__((aligned(8))); +}; + +/** + * dbg_old_index_check_init - get information for the next old index check. + * @c: UBIFS file-system description object + * @zroot: root of the index + * + * This function records information about the index that will be needed for the + * next old index check i.e. 'dbg_check_old_index()'. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + struct ubifs_idx_node *idx; + int lnum, offs, len, err = 0; + + c->old_zroot = *zroot; + + lnum = c->old_zroot.lnum; + offs = c->old_zroot.offs; + len = c->old_zroot.len; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out; + + c->old_zroot_level = le16_to_cpu(idx->level); + c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); +out: + kfree(idx); + return err; +} + +/** + * dbg_check_old_index - check the old copy of the index. + * @c: UBIFS file-system description object + * @zroot: root of the new index + * + * In order to be able to recover from an unclean unmount, a complete copy of + * the index must exist on flash. This is the "old" index. The commit process + * must write the "new" index to flash without overwriting or destroying any + * part of the old index. This function is run at commit end in order to check + * that the old index does indeed exist completely intact. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; + int first = 1, iip; + union ubifs_key lower_key, upper_key, l_key, u_key; + unsigned long long uninitialized_var(last_sqnum); + struct ubifs_idx_node *idx; + struct list_head list; + struct idx_node *i; + size_t sz; + + if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) + goto out; + + INIT_LIST_HEAD(&list); + + sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) - + UBIFS_IDX_NODE_SZ; + + /* Start at the old zroot */ + lnum = c->old_zroot.lnum; + offs = c->old_zroot.offs; + len = c->old_zroot.len; + iip = 0; + + /* + * Traverse the index tree preorder depth-first i.e. do a node and then + * its subtrees from left to right. 
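The same walking order on a toy in-memory tree: visit a node, descend to its leftmost child, and when a subtree is exhausted climb back up until a right-hand sibling exists. The explicit path stack stands in for the linked list of 'struct idx_node' kept by the code below; the tree and its values are made up.

#include <stdio.h>

struct toy_node {
	int val;
	int child_cnt;
	struct toy_node *child[4];
};

static void preorder(struct toy_node *root)
{
	/* explicit path stack: node plus index-in-parent, like 'iip' below */
	struct { struct toy_node *node; int iip; } path[16];
	int depth = 0, iip;
	struct toy_node *node = root;

	while (1) {
		printf("%d ", node->val);		/* visit */
		if (node->child_cnt > 0) {		/* go down left */
			path[depth].node = node;
			path[depth].iip = 0;
			depth++;
			node = node->child[0];
			continue;
		}
		/* leaf: go up until we can go right */
		while (depth > 0) {
			iip = ++path[depth - 1].iip;
			if (iip < path[depth - 1].node->child_cnt) {
				node = path[depth - 1].node->child[iip];
				break;
			}
			depth--;
		}
		if (depth == 0)
			return;
	}
}

int main(void)
{
	struct toy_node l1 = { 2, 0 }, l2 = { 3, 0 }, l3 = { 5, 0 };
	struct toy_node mid = { 4, 1, { &l3 } };
	struct toy_node root = { 1, 3, { &l1, &l2, &mid } };

	preorder(&root);	/* prints: 1 2 3 4 5 */
	printf("\n");
	return 0;
}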
+ */ + while (1) { + struct ubifs_branch *br; + + /* Get the next index node */ + i = kmalloc(sz, GFP_NOFS); + if (!i) { + err = -ENOMEM; + goto out_free; + } + i->iip = iip; + /* Keep the index nodes on our path in a linked list */ + list_add_tail(&i->list, &list); + /* Read the index node */ + idx = &i->idx; + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out_free; + /* Validate index node */ + child_cnt = le16_to_cpu(idx->child_cnt); + if (child_cnt < 1 || child_cnt > c->fanout) { + err = 1; + goto out_dump; + } + if (first) { + first = 0; + /* Check root level and sqnum */ + if (le16_to_cpu(idx->level) != c->old_zroot_level) { + err = 2; + goto out_dump; + } + if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { + err = 3; + goto out_dump; + } + /* Set last values as though root had a parent */ + last_level = le16_to_cpu(idx->level) + 1; + last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1; + key_read(c, ubifs_idx_key(c, idx), &lower_key); + highest_ino_key(c, &upper_key, INUM_WATERMARK); + } + key_copy(c, &upper_key, &i->upper_key); + if (le16_to_cpu(idx->level) != last_level - 1) { + err = 3; + goto out_dump; + } + /* + * The index is always written bottom up hence a child's sqnum + * is always less than the parents. + */ + if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) { + err = 4; + goto out_dump; + } + /* Check key range */ + key_read(c, ubifs_idx_key(c, idx), &l_key); + br = ubifs_idx_branch(c, idx, child_cnt - 1); + key_read(c, &br->key, &u_key); + if (keys_cmp(c, &lower_key, &l_key) > 0) { + err = 5; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) < 0) { + err = 6; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) == 0) + if (!is_hash_key(c, &u_key)) { + err = 7; + goto out_dump; + } + /* Go to next index node */ + if (le16_to_cpu(idx->level) == 0) { + /* At the bottom, so go up until can go right */ + while (1) { + /* Drop the bottom of the list */ + list_del(&i->list); + kfree(i); + /* No more list means we are done */ + if (list_empty(&list)) + goto out; + /* Look at the new bottom */ + i = list_entry(list.prev, struct idx_node, + list); + idx = &i->idx; + /* Can we go right */ + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + iip = iip + 1; + break; + } else + /* Nope, so go up again */ + iip = i->iip; + } + } else + /* Go down left */ + iip = 0; + /* + * We have the parent in 'idx' and now we set up for reading the + * child pointed to by slot 'iip'. 
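+ * The 'lower_key' we record is the branch key of that child, and
+ * 'upper_key' is either the key of the next branch in this parent or, for
+ * the last child, the parent's own upper bound; the range checks at the
+ * top of the loop verify that every key found in the child falls between
+ * the two.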
+ */ + last_level = le16_to_cpu(idx->level); + last_sqnum = le64_to_cpu(idx->ch.sqnum); + br = ubifs_idx_branch(c, idx, iip); + lnum = le32_to_cpu(br->lnum); + offs = le32_to_cpu(br->offs); + len = le32_to_cpu(br->len); + key_read(c, &br->key, &lower_key); + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + br = ubifs_idx_branch(c, idx, iip + 1); + key_read(c, &br->key, &upper_key); + } else + key_copy(c, &i->upper_key, &upper_key); + } +out: + err = dbg_old_index_check_init(c, zroot); + if (err) + goto out_free; + + return 0; + +out_dump: + dbg_err("dumping index node (iip=%d)", i->iip); + dbg_dump_node(c, idx); + list_del(&i->list); + kfree(i); + if (!list_empty(&list)) { + i = list_entry(list.prev, struct idx_node, list); + dbg_err("dumping parent index node"); + dbg_dump_node(c, &i->idx); + } +out_free: + while (!list_empty(&list)) { + i = list_entry(list.next, struct idx_node, list); + list_del(&i->list); + kfree(i); + } + ubifs_err("failed, error %d", err); + if (err > 0) + err = -EINVAL; + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c new file mode 100644 index 00000000000..5bb51dac3c1 --- /dev/null +++ b/fs/ubifs/compress.c @@ -0,0 +1,253 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + * Zoltan Sogor + */ + +/* + * This file provides a single place to access to compression and + * decompression. + */ + +#include +#include "ubifs.h" + +/* Fake description object for the "none" compressor */ +static struct ubifs_compressor none_compr = { + .compr_type = UBIFS_COMPR_NONE, + .name = "no compression", + .capi_name = "", +}; + +#ifdef CONFIG_UBIFS_FS_LZO +static DEFINE_MUTEX(lzo_mutex); + +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .comp_mutex = &lzo_mutex, + .name = "LZO", + .capi_name = "lzo", +}; +#else +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .name = "LZO", +}; +#endif + +#ifdef CONFIG_UBIFS_FS_ZLIB +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); + +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .comp_mutex = &deflate_mutex, + .decomp_mutex = &inflate_mutex, + .name = "zlib", + .capi_name = "deflate", +}; +#else +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .name = "zlib", +}; +#endif + +/* All UBIFS compressors */ +struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/** + * ubifs_compress - compress data. 
+ * @in_buf: data to compress + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit + * + * This function compresses input buffer @in_buf of length @in_len and stores + * the result in the output buffer @out_buf and the resulting length in + * @out_len. If the input buffer does not compress, it is just copied to the + * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if + * compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. + * + * This functions returns %0 on success or a negative error code on failure. + */ +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type) +{ + int err; + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + if (compr->comp_mutex) + mutex_lock(compr->comp_mutex); + err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, + out_len); + if (compr->comp_mutex) + mutex_unlock(compr->comp_mutex); + if (unlikely(err)) { + ubifs_warn("cannot compress %d bytes, compressor %s, " + "error %d, leave data uncompressed", + in_len, compr->name, err); + goto no_compr; + } + + /* + * Presently, we just require that compression results in less data, + * rather than any defined minimum compression ratio or amount. + */ + if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) + goto no_compr; + + return; + +no_compr: + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + +/** + * ubifs_decompress - decompress data. + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_buf: output buffer where decompressed data should + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into buffer @out_buf. + * The length of the uncompressed data is returned in @out_len. This functions + * returns %0 on success or a negative error code on failure. + */ +int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, + int *out_len, int compr_type) +{ + int err; + struct ubifs_compressor *compr; + + if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { + ubifs_err("invalid compression type %d", compr_type); + return -EINVAL; + } + + compr = ubifs_compressors[compr_type]; + + if (unlikely(!compr->capi_name)) { + ubifs_err("%s compression is not compiled in", compr->name); + return -EINVAL; + } + + if (compr_type == UBIFS_COMPR_NONE) { + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + return 0; + } + + if (compr->decomp_mutex) + mutex_lock(compr->decomp_mutex); + err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, + out_len); + if (compr->decomp_mutex) + mutex_unlock(compr->decomp_mutex); + if (err) + ubifs_err("cannot decompress %d bytes, compressor %s, " + "error %d", in_len, compr->name, err); + + return err; +} + +/** + * compr_init - initialize a compressor. 
+ * @compr: compressor description object + * + * This function initializes the requested compressor and returns zero in case + * of success or a negative error code in case of failure. + */ +static int __init compr_init(struct ubifs_compressor *compr) +{ + if (compr->capi_name) { + compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + if (IS_ERR(compr->cc)) { + ubifs_err("cannot initialize compressor %s, error %ld", + compr->name, PTR_ERR(compr->cc)); + return PTR_ERR(compr->cc); + } + } + + ubifs_compressors[compr->compr_type] = compr; + return 0; +} + +/** + * compr_exit - de-initialize a compressor. + * @compr: compressor description object + */ +static void compr_exit(struct ubifs_compressor *compr) +{ + if (compr->capi_name) + crypto_free_comp(compr->cc); + return; +} + +/** + * ubifs_compressors_init - initialize UBIFS compressors. + * + * This function initializes the compressor which were compiled in. Returns + * zero in case of success and a negative error code in case of failure. + */ +int __init ubifs_compressors_init(void) +{ + int err; + + err = compr_init(&lzo_compr); + if (err) + return err; + + err = compr_init(&zlib_compr); + if (err) + goto out_lzo; + + ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; + return 0; + +out_lzo: + compr_exit(&lzo_compr); + return err; +} + +/** + * ubifs_compressors_exit - de-initialize UBIFS compressors. + */ +void __exit ubifs_compressors_exit(void) +{ + compr_exit(&lzo_compr); + compr_exit(&zlib_compr); +} diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c new file mode 100644 index 00000000000..4e3aaeba4ec --- /dev/null +++ b/fs/ubifs/debug.c @@ -0,0 +1,2289 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements most of the debugging stuff which is compiled in only + * when it is enabled. But some debugging check functions are implemented in + * corresponding subsystem, just because they are closely related and utilize + * various local functions of those subsystems. 
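The "is compression worth keeping" rule used by 'ubifs_compress()' above, shown stand-alone: the compressed length only wins if it is still smaller after both lengths are rounded up to 8-byte alignment (and, in the real code, only if the input is at least UBIFS_MIN_COMPR_LEN bytes to begin with). The sample lengths are invented.

#include <stdio.h>

#define ALIGN8(x) (((x) + 7) & ~7)

static int worth_compressing(int in_len, int out_len)
{
	return ALIGN8(out_len) < ALIGN8(in_len);
}

int main(void)
{
	/* 4096 -> 4090 saves nothing once aligned, so data would stay raw */
	printf("%d %d\n", worth_compressing(4096, 4090),
	       worth_compressing(4096, 1500));
	return 0;
}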
+ */ + +#define UBIFS_DBG_PRESERVE_UBI + +#include "ubifs.h" +#include +#include + +#ifdef CONFIG_UBIFS_FS_DEBUG + +DEFINE_SPINLOCK(dbg_lock); + +static char dbg_key_buf0[128]; +static char dbg_key_buf1[128]; + +unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; +unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; +unsigned int ubifs_tst_flags; + +module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); + +MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); +MODULE_PARM_DESC(debug_chks, "Debug check flags"); +MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); + +static const char *get_key_fmt(int fmt) +{ + switch (fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return "simple"; + default: + return "unknown/invalid format"; + } +} + +static const char *get_key_hash(int hash) +{ + switch (hash) { + case UBIFS_KEY_HASH_R5: + return "R5"; + case UBIFS_KEY_HASH_TEST: + return "test"; + default: + return "unknown/invalid name hash"; + } +} + +static const char *get_key_type(int type) +{ + switch (type) { + case UBIFS_INO_KEY: + return "inode"; + case UBIFS_DENT_KEY: + return "direntry"; + case UBIFS_XENT_KEY: + return "xentry"; + case UBIFS_DATA_KEY: + return "data"; + case UBIFS_TRUN_KEY: + return "truncate"; + default: + return "unknown/invalid key"; + } +} + +static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, + char *buffer) +{ + char *p = buffer; + int type = key_type(c, key); + + if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { + switch (type) { + case UBIFS_INO_KEY: + sprintf(p, "(%lu, %s)", key_inum(c, key), + get_key_type(type)); + break; + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key), + get_key_type(type), key_hash(c, key)); + break; + case UBIFS_DATA_KEY: + sprintf(p, "(%lu, %s, %u)", key_inum(c, key), + get_key_type(type), key_block(c, key)); + break; + case UBIFS_TRUN_KEY: + sprintf(p, "(%lu, %s)", + key_inum(c, key), get_key_type(type)); + break; + default: + sprintf(p, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); + } + } else + sprintf(p, "bad key format %d", c->key_fmt); +} + +const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf0); + return dbg_key_buf0; +} + +const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf1); + return dbg_key_buf1; +} + +const char *dbg_ntype(int type) +{ + switch (type) { + case UBIFS_PAD_NODE: + return "padding node"; + case UBIFS_SB_NODE: + return "superblock node"; + case UBIFS_MST_NODE: + return "master node"; + case UBIFS_REF_NODE: + return "reference node"; + case UBIFS_INO_NODE: + return "inode node"; + case UBIFS_DENT_NODE: + return "direntry node"; + case UBIFS_XENT_NODE: + return "xentry node"; + case UBIFS_DATA_NODE: + return "data node"; + case UBIFS_TRUN_NODE: + return "truncate node"; + case UBIFS_IDX_NODE: + return "indexing node"; + case UBIFS_CS_NODE: + return "commit start node"; + case UBIFS_ORPH_NODE: + return "orphan node"; + default: + return "unknown node"; + } +} + +static const char *dbg_gtype(int type) +{ + switch (type) { + case UBIFS_NO_NODE_GROUP: + return "no node group"; + case UBIFS_IN_NODE_GROUP: + return "in node group"; + case UBIFS_LAST_OF_NODE_GROUP: + return "last of node 
group"; + default: + return "unknown"; + } +} + +const char *dbg_cstate(int cmt_state) +{ + switch (cmt_state) { + case COMMIT_RESTING: + return "commit resting"; + case COMMIT_BACKGROUND: + return "background commit requested"; + case COMMIT_REQUIRED: + return "commit required"; + case COMMIT_RUNNING_BACKGROUND: + return "BACKGROUND commit running"; + case COMMIT_RUNNING_REQUIRED: + return "commit running and required"; + case COMMIT_BROKEN: + return "broken commit"; + default: + return "unknown commit state"; + } +} + +static void dump_ch(const struct ubifs_ch *ch) +{ + printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); + printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); + printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, + dbg_ntype(ch->node_type)); + printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, + dbg_gtype(ch->group_type)); + printk(KERN_DEBUG "\tsqnum %llu\n", + (unsigned long long)le64_to_cpu(ch->sqnum)); + printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); +} + +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +{ + const struct ubifs_inode *ui = ubifs_inode(inode); + + printk(KERN_DEBUG "inode %lu\n", inode->i_ino); + printk(KERN_DEBUG "size %llu\n", + (unsigned long long)i_size_read(inode)); + printk(KERN_DEBUG "nlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "atime %u.%u\n", + (unsigned int)inode->i_atime.tv_sec, + (unsigned int)inode->i_atime.tv_nsec); + printk(KERN_DEBUG "mtime %u.%u\n", + (unsigned int)inode->i_mtime.tv_sec, + (unsigned int)inode->i_mtime.tv_nsec); + printk(KERN_DEBUG "ctime %u.%u\n", + (unsigned int)inode->i_ctime.tv_sec, + (unsigned int)inode->i_ctime.tv_nsec); + printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "dirty %u\n", ui->dirty); + printk(KERN_DEBUG "xattr %u\n", ui->xattr); + printk(KERN_DEBUG "flags %d\n", ui->flags); + printk(KERN_DEBUG "compr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "data_len %d\n", ui->data_len); +} + +void dbg_dump_node(const struct ubifs_info *c, const void *node) +{ + int i, n; + union ubifs_key key; + const struct ubifs_ch *ch = node; + + if (dbg_failure_mode) + return; + + /* If the magic is incorrect, just hexdump the first bytes */ + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { + printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node, UBIFS_CH_SZ, 1); + return; + } + + spin_lock(&dbg_lock); + dump_ch(node); + + switch (ch->node_type) { + case UBIFS_PAD_NODE: + { + const struct ubifs_pad_node *pad = node; + + printk(KERN_DEBUG "\tpad_len %u\n", + le32_to_cpu(pad->pad_len)); + break; + } + case UBIFS_SB_NODE: + { + const struct ubifs_sb_node *sup = node; + unsigned int sup_flags = le32_to_cpu(sup->flags); + + printk(KERN_DEBUG "\tkey_hash %d (%s)\n", + (int)sup->key_hash, get_key_hash(sup->key_hash)); + printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", + (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); + printk(KERN_DEBUG "\tflags %#x\n", sup_flags); + printk(KERN_DEBUG "\t big_lpt %u\n", + !!(sup_flags & UBIFS_FLG_BIGLPT)); + printk(KERN_DEBUG "\tmin_io_size %u\n", + le32_to_cpu(sup->min_io_size)); + printk(KERN_DEBUG "\tleb_size %u\n", + 
le32_to_cpu(sup->leb_size)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(sup->leb_cnt)); + printk(KERN_DEBUG "\tmax_leb_cnt %u\n", + le32_to_cpu(sup->max_leb_cnt)); + printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", + (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); + printk(KERN_DEBUG "\tlog_lebs %u\n", + le32_to_cpu(sup->log_lebs)); + printk(KERN_DEBUG "\tlpt_lebs %u\n", + le32_to_cpu(sup->lpt_lebs)); + printk(KERN_DEBUG "\torph_lebs %u\n", + le32_to_cpu(sup->orph_lebs)); + printk(KERN_DEBUG "\tjhead_cnt %u\n", + le32_to_cpu(sup->jhead_cnt)); + printk(KERN_DEBUG "\tfanout %u\n", + le32_to_cpu(sup->fanout)); + printk(KERN_DEBUG "\tlsave_cnt %u\n", + le32_to_cpu(sup->lsave_cnt)); + printk(KERN_DEBUG "\tdefault_compr %u\n", + (int)le16_to_cpu(sup->default_compr)); + printk(KERN_DEBUG "\trp_size %llu\n", + (unsigned long long)le64_to_cpu(sup->rp_size)); + printk(KERN_DEBUG "\trp_uid %u\n", + le32_to_cpu(sup->rp_uid)); + printk(KERN_DEBUG "\trp_gid %u\n", + le32_to_cpu(sup->rp_gid)); + printk(KERN_DEBUG "\tfmt_version %u\n", + le32_to_cpu(sup->fmt_version)); + printk(KERN_DEBUG "\ttime_gran %u\n", + le32_to_cpu(sup->time_gran)); + printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" + "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", + sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3], + sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7], + sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11], + sup->uuid[12], sup->uuid[13], sup->uuid[14], + sup->uuid[15]); + break; + } + case UBIFS_MST_NODE: + { + const struct ubifs_mst_node *mst = node; + + printk(KERN_DEBUG "\thighest_inum %llu\n", + (unsigned long long)le64_to_cpu(mst->highest_inum)); + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long)le64_to_cpu(mst->cmt_no)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(mst->flags)); + printk(KERN_DEBUG "\tlog_lnum %u\n", + le32_to_cpu(mst->log_lnum)); + printk(KERN_DEBUG "\troot_lnum %u\n", + le32_to_cpu(mst->root_lnum)); + printk(KERN_DEBUG "\troot_offs %u\n", + le32_to_cpu(mst->root_offs)); + printk(KERN_DEBUG "\troot_len %u\n", + le32_to_cpu(mst->root_len)); + printk(KERN_DEBUG "\tgc_lnum %u\n", + le32_to_cpu(mst->gc_lnum)); + printk(KERN_DEBUG "\tihead_lnum %u\n", + le32_to_cpu(mst->ihead_lnum)); + printk(KERN_DEBUG "\tihead_offs %u\n", + le32_to_cpu(mst->ihead_offs)); + printk(KERN_DEBUG "\tindex_size %u\n", + le32_to_cpu(mst->index_size)); + printk(KERN_DEBUG "\tlpt_lnum %u\n", + le32_to_cpu(mst->lpt_lnum)); + printk(KERN_DEBUG "\tlpt_offs %u\n", + le32_to_cpu(mst->lpt_offs)); + printk(KERN_DEBUG "\tnhead_lnum %u\n", + le32_to_cpu(mst->nhead_lnum)); + printk(KERN_DEBUG "\tnhead_offs %u\n", + le32_to_cpu(mst->nhead_offs)); + printk(KERN_DEBUG "\tltab_lnum %u\n", + le32_to_cpu(mst->ltab_lnum)); + printk(KERN_DEBUG "\tltab_offs %u\n", + le32_to_cpu(mst->ltab_offs)); + printk(KERN_DEBUG "\tlsave_lnum %u\n", + le32_to_cpu(mst->lsave_lnum)); + printk(KERN_DEBUG "\tlsave_offs %u\n", + le32_to_cpu(mst->lsave_offs)); + printk(KERN_DEBUG "\tlscan_lnum %u\n", + le32_to_cpu(mst->lscan_lnum)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(mst->leb_cnt)); + printk(KERN_DEBUG "\tempty_lebs %u\n", + le32_to_cpu(mst->empty_lebs)); + printk(KERN_DEBUG "\tidx_lebs %u\n", + le32_to_cpu(mst->idx_lebs)); + printk(KERN_DEBUG "\ttotal_free %llu\n", + (unsigned long long)le64_to_cpu(mst->total_free)); + printk(KERN_DEBUG "\ttotal_dirty %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dirty)); + printk(KERN_DEBUG "\ttotal_used %llu\n", + (unsigned long 
long)le64_to_cpu(mst->total_used)); + printk(KERN_DEBUG "\ttotal_dead %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dead)); + printk(KERN_DEBUG "\ttotal_dark %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dark)); + break; + } + case UBIFS_REF_NODE: + { + const struct ubifs_ref_node *ref = node; + + printk(KERN_DEBUG "\tlnum %u\n", + le32_to_cpu(ref->lnum)); + printk(KERN_DEBUG "\toffs %u\n", + le32_to_cpu(ref->offs)); + printk(KERN_DEBUG "\tjhead %u\n", + le32_to_cpu(ref->jhead)); + break; + } + case UBIFS_INO_NODE: + { + const struct ubifs_ino_node *ino = node; + + key_read(c, &ino->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", + (unsigned long long)le64_to_cpu(ino->creat_sqnum)); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned long long)le64_to_cpu(ino->size)); + printk(KERN_DEBUG "\tnlink %u\n", + le32_to_cpu(ino->nlink)); + printk(KERN_DEBUG "\tatime %lld.%u\n", + (long long)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + printk(KERN_DEBUG "\tmtime %lld.%u\n", + (long long)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); + printk(KERN_DEBUG "\tctime %lld.%u\n", + (long long)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); + printk(KERN_DEBUG "\tuid %u\n", + le32_to_cpu(ino->uid)); + printk(KERN_DEBUG "\tgid %u\n", + le32_to_cpu(ino->gid)); + printk(KERN_DEBUG "\tmode %u\n", + le32_to_cpu(ino->mode)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(ino->flags)); + printk(KERN_DEBUG "\txattr_cnt %u\n", + le32_to_cpu(ino->xattr_cnt)); + printk(KERN_DEBUG "\txattr_size %u\n", + le32_to_cpu(ino->xattr_size)); + printk(KERN_DEBUG "\txattr_names %u\n", + le32_to_cpu(ino->xattr_names)); + printk(KERN_DEBUG "\tcompr_type %#x\n", + (int)le16_to_cpu(ino->compr_type)); + printk(KERN_DEBUG "\tdata len %u\n", + le32_to_cpu(ino->data_len)); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + const struct ubifs_dent_node *dent = node; + int nlen = le16_to_cpu(dent->nlen); + + key_read(c, &dent->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tinum %llu\n", + (unsigned long long)le64_to_cpu(dent->inum)); + printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); + printk(KERN_DEBUG "\tnlen %d\n", nlen); + printk(KERN_DEBUG "\tname "); + + if (nlen > UBIFS_MAX_NLEN) + printk(KERN_DEBUG "(bad name length, not printing, " + "bad or corrupted node)"); + else { + for (i = 0; i < nlen && dent->name[i]; i++) + printk("%c", dent->name[i]); + } + printk("\n"); + + break; + } + case UBIFS_DATA_NODE: + { + const struct ubifs_data_node *dn = node; + int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; + + key_read(c, &dn->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tsize %u\n", + le32_to_cpu(dn->size)); + printk(KERN_DEBUG "\tcompr_typ %d\n", + (int)le16_to_cpu(dn->compr_type)); + printk(KERN_DEBUG "\tdata size %d\n", + dlen); + printk(KERN_DEBUG "\tdata:\n"); + print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, + (void *)&dn->data, dlen, 0); + break; + } + case UBIFS_TRUN_NODE: + { + const struct ubifs_trun_node *trun = node; + + printk(KERN_DEBUG "\tinum %u\n", + le32_to_cpu(trun->inum)); + printk(KERN_DEBUG "\told_size %llu\n", + (unsigned long long)le64_to_cpu(trun->old_size)); + printk(KERN_DEBUG "\tnew_size %llu\n", + (unsigned long long)le64_to_cpu(trun->new_size)); + break; + } + case UBIFS_IDX_NODE: + { + const struct ubifs_idx_node *idx = node; + + n = le16_to_cpu(idx->child_cnt); + 
printk(KERN_DEBUG "\tchild_cnt %d\n", n); + printk(KERN_DEBUG "\tlevel %d\n", + (int)le16_to_cpu(idx->level)); + printk(KERN_DEBUG "\tBranches:\n"); + + for (i = 0; i < n && i < c->fanout - 1; i++) { + const struct ubifs_branch *br; + + br = ubifs_idx_branch(c, idx, i); + key_read(c, &br->key, &key); + printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", + i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), + le32_to_cpu(br->len), DBGKEY(&key)); + } + break; + } + case UBIFS_CS_NODE: + break; + case UBIFS_ORPH_NODE: + { + const struct ubifs_orph_node *orph = node; + + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long) + le64_to_cpu(orph->cmt_no) & LLONG_MAX); + printk(KERN_DEBUG "\tlast node flag %llu\n", + (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); + n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); + for (i = 0; i < n; i++) + printk(KERN_DEBUG "\t ino %llu\n", + le64_to_cpu(orph->inos[i])); + break; + } + default: + printk(KERN_DEBUG "node type %d was not recognized\n", + (int)ch->node_type); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_budget_req(const struct ubifs_budget_req *req) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", + req->new_ino, req->dirtied_ino); + printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", + req->new_ino_d, req->dirtied_ino_d); + printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", + req->new_page, req->dirtied_page); + printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", + req->new_dent, req->mod_dent); + printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); + printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", + req->data_growth, req->dd_growth); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n", + lst->empty_lebs, lst->idx_lebs); + printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " + "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, + lst->total_dirty); + printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " + "total_dead %lld\n", lst->total_used, lst->total_dark, + lst->total_dead); + spin_unlock(&dbg_lock); +} + +void dbg_dump_budg(struct ubifs_info *c) +{ + int i; + struct rb_node *rb; + struct ubifs_bud *bud; + struct ubifs_gced_idx_leb *idx_gc; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " + "budg_dd_growth %lld, budg_idx_growth %lld\n", + c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); + printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " + "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, + c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, + c->freeable_cnt); + printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " + "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, + c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); + printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " + "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), + atomic_long_read(&c->dirty_zn_cnt), + atomic_long_read(&c->clean_zn_cnt)); + printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", + c->gc_lnum, c->ihead_lnum); + for (i = 0; i < c->jhead_cnt; i++) + printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", + c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); + for (rb = 
rb_first(&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); + } + list_for_each_entry(bud, &c->old_buds, list) + printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); + list_for_each_entry(idx_gc, &c->idx_gc, list) + printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", + idx_gc->lnum, idx_gc->unmap); + printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +{ + printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), " + "flags %#x\n", lp->lnum, lp->free, lp->dirty, + c->leb_size - lp->free - lp->dirty, lp->flags); +} + +void dbg_dump_lprops(struct ubifs_info *c) +{ + int lnum, err; + struct ubifs_lprops lp; + struct ubifs_lp_stats lst; + + printk(KERN_DEBUG "Dumping LEB properties\n"); + ubifs_get_lp_stats(c, &lst); + dbg_dump_lstats(&lst); + + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + ubifs_err("cannot read lprops for LEB %d", lnum); + + dbg_dump_lprop(c, &lp); + } +} + +void dbg_dump_leb(const struct ubifs_info *c, int lnum) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + + if (dbg_failure_mode) + return; + + printk(KERN_DEBUG "Dumping LEB %d\n", lnum); + + sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + if (IS_ERR(sleb)) { + ubifs_err("scan error %d", (int)PTR_ERR(sleb)); + return; + } + + printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, + sleb->nodes_cnt, sleb->endpt); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } + + ubifs_scan_destroy(sleb); + return; +} + +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) +{ + int n; + const struct ubifs_zbranch *zbr; + + spin_lock(&dbg_lock); + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + + printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" + " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, + zbr->len, znode->parent, znode->iip, znode->level, + znode->child_cnt, znode->flags); + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + spin_unlock(&dbg_lock); + return; + } + + printk(KERN_DEBUG "zbranches:\n"); + for (n = 0; n < znode->child_cnt; n++) { + zbr = &znode->zbranch[n]; + if (znode->level > 0) + printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + else + printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +{ + int i; + + printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", + cat, heap->cnt); + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + + printk(KERN_DEBUG "\t%d. 
LEB %d hpos %d free %d dirty %d " + "flags %d\n", i, lprops->lnum, lprops->hpos, + lprops->free, lprops->dirty, lprops->flags); + } +} + +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + printk(KERN_DEBUG "Dumping pnode:\n"); + printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", + (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); + printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", + pnode->flags, iip, pnode->level, pnode->num); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp = &pnode->lprops[i]; + + printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", + i, lp->free, lp->dirty, lp->flags, lp->lnum); + } +} + +void dbg_dump_tnc(struct ubifs_info *c) +{ + struct ubifs_znode *znode; + int level; + + printk(KERN_DEBUG "\n"); + printk(KERN_DEBUG "Dumping the TNC tree\n"); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + } + dbg_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + } + + printk(KERN_DEBUG "\n"); +} + +static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, + void *priv) +{ + dbg_dump_znode(c, znode); + return 0; +} + +/** + * dbg_dump_index - dump the on-flash index. + * @c: UBIFS file-system description object + * + * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * which dumps only in-memory znodes and does not read znodes which from flash. + */ +void dbg_dump_index(struct ubifs_info *c) +{ + dbg_walk_index(c, NULL, dump_znode, NULL); +} + +/** + * dbg_check_synced_i_size - check synchronized inode size. + * @inode: inode to check + * + * If inode is clean, synchronized inode size has to be equivalent to current + * inode size. This function has to be called only for locked inodes (@i_mutex + * has to be locked). Returns %0 if synchronized inode size if correct, and + * %-EINVAL if not. + */ +int dbg_check_synced_i_size(struct inode *inode) +{ + int err = 0; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + + mutex_lock(&ui->ui_mutex); + spin_lock(&ui->ui_lock); + if (ui->ui_size != ui->synced_i_size && !ui->dirty) { + ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode " + "is clean", ui->ui_size, ui->synced_i_size); + ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, + inode->i_mode, i_size_read(inode)); + dbg_dump_stack(); + err = -EINVAL; + } + spin_unlock(&ui->ui_lock); + mutex_unlock(&ui->ui_mutex); + return err; +} + +/* + * dbg_check_dir - check directory inode size and link count. + * @c: UBIFS file-system description object + * @dir: the directory to calculate size for + * @size: the result is returned here + * + * This function makes sure that directory size and link count are correct. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, it is good idea to make sure the @dir->i_mutex is locked before + * calling this function. 
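The accounting that 'dbg_check_dir_size()' below re-derives, done by hand for a made-up directory. The per-entry size here is only a stand-in for CALC_DENT_SIZE(); what mirrors the real check is the shape of the calculation: a base inode size, one entry-sized increment per name, and a link count of two plus the number of subdirectories.

#include <stdio.h>
#include <string.h>

struct toy_entry { const char *name; int is_dir; };

int main(void)
{
	const struct toy_entry entries[] = {
		{ "file.txt", 0 }, { "subdir", 1 }, { "notes", 0 },
	};
	const int ino_node_sz = 160, dent_node_sz = 56;	/* invented sizes */
	long long size = ino_node_sz;
	unsigned int nlink = 2;	/* its own "." plus the entry in the parent */
	size_t i;

	for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
		size += dent_node_sz + strlen(entries[i].name) + 1;
		if (entries[i].is_dir)
			nlink += 1;	/* each subdirectory's ".." */
	}
	printf("expected size %lld, nlink %u\n", size, nlink);
	return 0;
}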
+ */ +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) +{ + unsigned int nlink = 2; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + struct qstr nm = { .name = NULL }; + loff_t size = UBIFS_INO_NODE_SZ; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + if (!S_ISDIR(dir->i_mode)) + return 0; + + lowest_dent_key(c, &key, dir->i_ino); + while (1) { + int err; + + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + break; + return err; + } + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + size += CALC_DENT_SIZE(nm.len); + if (dent->type == UBIFS_ITYPE_DIR) + nlink += 1; + kfree(pdent); + pdent = dent; + key_read(c, &dent->key, &key); + } + kfree(pdent); + + if (i_size_read(dir) != size) { + ubifs_err("directory inode %lu has size %llu, " + "but calculated size is %llu", dir->i_ino, + (unsigned long long)i_size_read(dir), + (unsigned long long)size); + dump_stack(); + return -EINVAL; + } + if (dir->i_nlink != nlink) { + ubifs_err("directory inode %lu has nlink %u, but calculated " + "nlink is %u", dir->i_ino, dir->i_nlink, nlink); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * dbg_check_key_order - make sure that colliding keys are properly ordered. + * @c: UBIFS file-system description object + * @zbr1: first zbranch + * @zbr2: following zbranch + * + * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of + * names of the direntries/xentries which are referred by the keys. This + * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes + * sure the name of direntry/xentry referred by @zbr1 is less than + * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not, + * and a negative error code in case of failure. 
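The ordering rule enforced below for entries whose keys collide, as a stand-alone predicate: names compare in binary order, and a shorter name sorts before a longer one that starts with it. The names and lengths in the example are arbitrary.

#include <stdio.h>
#include <string.h>

static int name_le(const char *n1, int len1, const char *n2, int len2)
{
	int cmp = memcmp(n1, n2, len1 < len2 ? len1 : len2);

	return cmp < 0 || (cmp == 0 && len1 < len2);
}

int main(void)
{
	printf("%d\n", name_le("foo", 3, "foobar", 6));	/* 1: correct order */
	printf("%d\n", name_le("foobar", 6, "foo", 3));	/* 0: bad order */
	return 0;
}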
+ */ +static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, + struct ubifs_zbranch *zbr2) +{ + int err, nlen1, nlen2, cmp; + struct ubifs_dent_node *dent1, *dent2; + union ubifs_key key; + + ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); + dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent1) + return -ENOMEM; + dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent2) { + err = -ENOMEM; + goto out_free; + } + + err = ubifs_tnc_read_node(c, zbr1, dent1); + if (err) + goto out_free; + err = ubifs_validate_entry(c, dent1); + if (err) + goto out_free; + + err = ubifs_tnc_read_node(c, zbr2, dent2); + if (err) + goto out_free; + err = ubifs_validate_entry(c, dent2); + if (err) + goto out_free; + + /* Make sure node keys are the same as in zbranch */ + err = 1; + key_read(c, &dent1->key, &key); + if (keys_cmp(c, &zbr1->key, &key)) { + dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, DBGKEY(&key)); + dbg_err("but it should have key %s according to tnc", + DBGKEY(&zbr1->key)); + dbg_dump_node(c, dent1); + goto out_free; + } + + key_read(c, &dent2->key, &key); + if (keys_cmp(c, &zbr2->key, &key)) { + dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, DBGKEY(&key)); + dbg_err("but it should have key %s according to tnc", + DBGKEY(&zbr2->key)); + dbg_dump_node(c, dent2); + goto out_free; + } + + nlen1 = le16_to_cpu(dent1->nlen); + nlen2 = le16_to_cpu(dent2->nlen); + + cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2)); + if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) { + err = 0; + goto out_free; + } + if (cmp == 0 && nlen1 == nlen2) + dbg_err("2 xent/dent nodes with the same name"); + else + dbg_err("bad order of colliding key %s", + DBGKEY(&key)); + + dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); + dbg_dump_node(c, dent1); + dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); + dbg_dump_node(c, dent2); + +out_free: + kfree(dent2); + kfree(dent1); + return err; +} + +/** + * dbg_check_znode - check if znode is all right. + * @c: UBIFS file-system description object + * @zbr: zbranch which points to this znode + * + * This function makes sure that znode referred to by @zbr is all right. + * Returns zero if it is, and %-EINVAL if it is not. + */ +static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zp = znode->parent; + int n, err, cmp; + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + err = 1; + goto out; + } + if (znode->level < 0) { + err = 2; + goto out; + } + if (znode->iip < 0 || znode->iip >= c->fanout) { + err = 3; + goto out; + } + + if (zbr->len == 0) + /* Only dirty zbranch may have no on-flash nodes */ + if (!ubifs_zn_dirty(znode)) { + err = 4; + goto out; + } + + if (ubifs_zn_dirty(znode)) { + /* + * If znode is dirty, its parent has to be dirty as well. The + * order of the operation is important, so we have to have + * memory barriers. + */ + smp_mb(); + if (zp && !ubifs_zn_dirty(zp)) { + /* + * The dirty flag is atomic and is cleared outside the + * TNC mutex, so znode's dirty flag may now have + * been cleared. The child is always cleared before the + * parent, so we just need to check again. 
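 * If the znode is still dirty here while its parent has been seen clean, the ordering really is broken and this is reported as an error.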
+ */ + smp_mb(); + if (ubifs_zn_dirty(znode)) { + err = 5; + goto out; + } + } + } + + if (zp) { + const union ubifs_key *min, *max; + + if (znode->level != zp->level - 1) { + err = 6; + goto out; + } + + /* Make sure the 'parent' pointer in our znode is correct */ + err = ubifs_search_zbranch(c, zp, &zbr->key, &n); + if (!err) { + /* This zbranch does not exist in the parent */ + err = 7; + goto out; + } + + if (znode->iip >= zp->child_cnt) { + err = 8; + goto out; + } + + if (znode->iip != n) { + /* This may happen only in case of collisions */ + if (keys_cmp(c, &zp->zbranch[n].key, + &zp->zbranch[znode->iip].key)) { + err = 9; + goto out; + } + n = znode->iip; + } + + /* + * Make sure that the first key in our znode is greater than or + * equal to the key in the pointing zbranch. + */ + min = &zbr->key; + cmp = keys_cmp(c, min, &znode->zbranch[0].key); + if (cmp == 1) { + err = 10; + goto out; + } + + if (n + 1 < zp->child_cnt) { + max = &zp->zbranch[n + 1].key; + + /* + * Make sure the last key in our znode is less than or + * equal to the key in the zbranch which goes after + * our pointing zbranch. + */ + cmp = keys_cmp(c, max, + &znode->zbranch[znode->child_cnt - 1].key); + if (cmp == -1) { + err = 11; + goto out; + } + } + } else { + /* This may only be root znode */ + if (zbr != &c->zroot) { + err = 12; + goto out; + } + } + + /* + * Make sure that each next key is greater than or equal to the + * previous one. + */ + for (n = 1; n < znode->child_cnt; n++) { + cmp = keys_cmp(c, &znode->zbranch[n - 1].key, + &znode->zbranch[n].key); + if (cmp > 0) { + err = 13; + goto out; + } + if (cmp == 0) { + /* This can only be keys with colliding hash */ + if (!is_hash_key(c, &znode->zbranch[n].key)) { + err = 14; + goto out; + } + + if (znode->level != 0 || c->replaying) + continue; + + /* + * Colliding keys should follow binary order of + * corresponding xentry/dentry names. + */ + err = dbg_check_key_order(c, &znode->zbranch[n - 1], + &znode->zbranch[n]); + if (err < 0) + return err; + if (err) { + err = 15; + goto out; + } + } + } + + for (n = 0; n < znode->child_cnt; n++) { + if (!znode->zbranch[n].znode && + (znode->zbranch[n].lnum == 0 || + znode->zbranch[n].len == 0)) { + err = 16; + goto out; + } + + if (znode->zbranch[n].lnum != 0 && + znode->zbranch[n].len == 0) { + err = 17; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].len != 0) { + err = 18; + goto out; + } + + if (znode->zbranch[n].lnum == 0 && + znode->zbranch[n].offs != 0) { + err = 19; + goto out; + } + + if (znode->level != 0 && znode->zbranch[n].znode) + if (znode->zbranch[n].znode->parent != znode) { + err = 20; + goto out; + } + } + + return 0; + +out: + ubifs_err("failed, error %d", err); + ubifs_msg("dump of the znode"); + dbg_dump_znode(c, znode); + if (zp) { + ubifs_msg("dump of the parent znode"); + dbg_dump_znode(c, zp); + } + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_tnc - check TNC tree. + * @c: UBIFS file-system description object + * @extra: do extra checks that are possible at the start of commit + * + * This function traverses the whole TNC tree and checks every znode. Returns zero + * if everything is all right and %-EINVAL if something is wrong with TNC. 
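 * (When @extra is set, the counts of clean and dirty znodes seen during the traversal are also compared against @c->clean_zn_cnt and @c->dirty_zn_cnt.)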
+ */ +int dbg_check_tnc(struct ubifs_info *c, int extra) +{ + struct ubifs_znode *znode; + long clean_cnt = 0, dirty_cnt = 0; + int err, last; + + if (!(ubifs_chk_flags & UBIFS_CHK_TNC)) + return 0; + + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + if (!c->zroot.znode) + return 0; + + znode = ubifs_tnc_postorder_first(c->zroot.znode); + while (1) { + struct ubifs_znode *prev; + struct ubifs_zbranch *zbr; + + if (!znode->parent) + zbr = &c->zroot; + else + zbr = &znode->parent->zbranch[znode->iip]; + + err = dbg_check_znode(c, zbr); + if (err) + return err; + + if (extra) { + if (ubifs_zn_dirty(znode)) + dirty_cnt += 1; + else + clean_cnt += 1; + } + + prev = znode; + znode = ubifs_tnc_postorder_next(znode); + if (!znode) + break; + + /* + * If the last key of this znode is equivalent to the first key + * of the next znode (collision), then check order of the keys. + */ + last = prev->child_cnt - 1; + if (prev->level == 0 && znode->level == 0 && !c->replaying && + !keys_cmp(c, &prev->zbranch[last].key, + &znode->zbranch[0].key)) { + err = dbg_check_key_order(c, &prev->zbranch[last], + &znode->zbranch[0]); + if (err < 0) + return err; + if (err) { + ubifs_msg("first znode"); + dbg_dump_znode(c, prev); + ubifs_msg("second znode"); + dbg_dump_znode(c, znode); + return -EINVAL; + } + } + } + + if (extra) { + if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) { + ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->clean_zn_cnt), + clean_cnt); + return -EINVAL; + } + if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) { + ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld", + atomic_long_read(&c->dirty_zn_cnt), + dirty_cnt); + return -EINVAL; + } + } + + return 0; +} + +/** + * dbg_walk_index - walk the on-flash index. + * @c: UBIFS file-system description object + * @leaf_cb: called for each leaf node + * @znode_cb: called for each indexing node + * @priv: private data which is passed to the callbacks + * + * This function walks the UBIFS index and calls @leaf_cb for each leaf + * node and @znode_cb for each indexing node. Returns zero in case of success + * and a negative error code in case of failure. + * + * It would be better if this function removed every znode it pulled into + * the TNC, so that the behavior more closely matched the non-debugging + * behavior. + */ +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv) +{ + int err; + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *child; + + mutex_lock(&c->tnc_mutex); + /* If the root indexing node is not in TNC - pull it */ + if (!c->zroot.znode) { + c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(c->zroot.znode)) { + err = PTR_ERR(c->zroot.znode); + c->zroot.znode = NULL; + goto out_unlock; + } + } + + /* + * We are going to traverse the indexing tree in the postorder manner. + * Go down and find the leftmost indexing node where we are going to + * start from. 
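 * I.e., keep following zbranch[0] at each level, loading child znodes from the flash media when they are not in memory.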
+ */ + znode = c->zroot.znode; + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + + znode = child; + } + + /* Iterate over all indexing nodes */ + while (1) { + int idx; + + cond_resched(); + + if (znode_cb) { + err = znode_cb(c, znode, priv); + if (err) { + ubifs_err("znode checking function returned " + "error %d", err); + dbg_dump_znode(c, znode); + goto out_dump; + } + } + if (leaf_cb && znode->level == 0) { + for (idx = 0; idx < znode->child_cnt; idx++) { + zbr = &znode->zbranch[idx]; + err = leaf_cb(c, zbr, priv); + if (err) { + ubifs_err("leaf checking function " + "returned error %d, for leaf " + "at LEB %d:%d", + err, zbr->lnum, zbr->offs); + goto out_dump; + } + } + } + + if (!znode->parent) + break; + + idx = znode->iip + 1; + znode = znode->parent; + if (idx < znode->child_cnt) { + /* Switch to the next index in the parent */ + zbr = &znode->zbranch[idx]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, idx); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } else + /* + * This is the last child, switch to the parent and + * continue. + */ + continue; + + /* Go to the lowest leftmost znode in the new sub-tree */ + while (znode->level > 0) { + zbr = &znode->zbranch[0]; + child = zbr->znode; + if (!child) { + child = ubifs_load_znode(c, zbr, znode, 0); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto out_unlock; + } + zbr->znode = child; + } + znode = child; + } + } + + mutex_unlock(&c->tnc_mutex); + return 0; + +out_dump: + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_znode(c, znode); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * add_size - add znode size to partially calculated index size. + * @c: UBIFS file-system description object + * @znode: znode to add size for + * @priv: partially calculated index size + * + * This is a helper function for 'dbg_check_idx_size()' which is called for + * every indexing node and adds its size to the 'long long' variable pointed to + * by @priv. + */ +static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) +{ + long long *idx_size = priv; + int add; + + add = ubifs_idx_node_sz(c, znode->child_cnt); + add = ALIGN(add, 8); + *idx_size += add; + return 0; +} + +/** + * dbg_check_idx_size - check index size. + * @c: UBIFS file-system description object + * @idx_size: size to check + * + * This function walks the UBIFS index, calculates its size and checks that the + * size is equivalent to @idx_size. Returns zero in case of success and a + * negative error code in case of failure. + */ +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) +{ + int err; + long long calc = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) + return 0; + + err = dbg_walk_index(c, NULL, add_size, &calc); + if (err) { + ubifs_err("error %d while walking the index", err); + return err; + } + + if (calc != idx_size) { + ubifs_err("index size check failed: calculated size is %lld, " + "should be %lld", calc, idx_size); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * struct fsck_inode - information about an inode used when checking the file-system. 
+ * @rb: link in the RB-tree of inodes + * @inum: inode number + * @mode: inode type, permissions, etc + * @nlink: inode link count + * @xattr_cnt: count of extended attributes + * @references: how many directory/xattr entries refer this inode (calculated + * while walking the index) + * @calc_cnt: for directory inode count of child directories + * @size: inode size (read from on-flash inode) + * @xattr_sz: summary size of all extended attributes (read from on-flash + * inode) + * @calc_sz: for directories calculated directory size + * @calc_xcnt: count of extended attributes + * @calc_xsz: calculated summary size of all extended attributes + * @xattr_nms: sum of lengths of all extended attribute names belonging to this + * inode (read from on-flash inode) + * @calc_xnms: calculated sum of lengths of all extended attribute names + */ +struct fsck_inode { + struct rb_node rb; + ino_t inum; + umode_t mode; + unsigned int nlink; + unsigned int xattr_cnt; + int references; + int calc_cnt; + long long size; + unsigned int xattr_sz; + long long calc_sz; + long long calc_xcnt; + long long calc_xsz; + unsigned int xattr_nms; + long long calc_xnms; +}; + +/** + * struct fsck_data - private FS checking information. + * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects) + */ +struct fsck_data { + struct rb_root inodes; +}; + +/** + * add_inode - add inode information to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @ino: raw UBIFS inode to add + * + * This is a helper function for 'check_leaf()' which adds information about + * inode @ino to the RB-tree of inodes. Returns inode information pointer in + * case of success and a negative error code in case of failure. + */ +static struct fsck_inode *add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, + struct ubifs_ino_node *ino) +{ + struct rb_node **p, *parent = NULL; + struct fsck_inode *fscki; + ino_t inum = key_inum_flash(c, &ino->key); + + p = &fsckd->inodes.rb_node; + while (*p) { + parent = *p; + fscki = rb_entry(parent, struct fsck_inode, rb); + if (inum < fscki->inum) + p = &(*p)->rb_left; + else if (inum > fscki->inum) + p = &(*p)->rb_right; + else + return fscki; + } + + if (inum > c->highest_inum) { + ubifs_err("too high inode number, max. is %lu", + c->highest_inum); + return ERR_PTR(-EINVAL); + } + + fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS); + if (!fscki) + return ERR_PTR(-ENOMEM); + + fscki->inum = inum; + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + if (S_ISDIR(fscki->mode)) { + fscki->calc_sz = UBIFS_INO_NODE_SZ; + fscki->calc_cnt = 2; + } + rb_link_node(&fscki->rb, parent, p); + rb_insert_color(&fscki->rb, &fsckd->inodes); + return fscki; +} + +/** + * search_inode - search inode in the RB-tree of inodes. + * @fsckd: FS checking information + * @inum: inode number to search + * + * This is a helper function for 'check_leaf()' which searches inode @inum in + * the RB-tree of inodes and returns an inode information pointer or %NULL if + * the inode was not found. 
+ */ +static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum) +{ + struct rb_node *p; + struct fsck_inode *fscki; + + p = fsckd->inodes.rb_node; + while (p) { + fscki = rb_entry(p, struct fsck_inode, rb); + if (inum < fscki->inum) + p = p->rb_left; + else if (inum > fscki->inum) + p = p->rb_right; + else + return fscki; + } + return NULL; +} + +/** + * read_add_inode - read inode node and add it to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @inum: inode number to read + * + * This is a helper function for 'check_leaf()' which finds inode node @inum in + * the index, reads it, and adds it to the RB-tree of inodes. Returns inode + * information pointer in case of success and a negative error code in case of + * failure. + */ +static struct fsck_inode *read_add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, ino_t inum) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + + fscki = search_inode(fsckd, inum); + if (fscki) + return fscki; + + ino_key_init(c, &key, inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", inum); + return ERR_PTR(-ENOENT); + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", err, inum); + return ERR_PTR(err); + } + + zbr = &znode->zbranch[n]; + if (zbr->len < UBIFS_INO_NODE_SZ) { + ubifs_err("bad node %lu node length %d", inum, zbr->len); + return ERR_PTR(-EINVAL); + } + + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return ERR_PTR(-ENOMEM); + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return ERR_PTR(err); + } + + fscki = add_inode(c, fsckd, ino); + kfree(ino); + if (IS_ERR(fscki)) { + ubifs_err("error %ld while adding inode %lu node", + PTR_ERR(fscki), inum); + return fscki; + } + + return fscki; +} + +/** + * check_leaf - check leaf node. + * @c: UBIFS file-system description object + * @zbr: zbranch of the leaf node to check + * @priv: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which is called for + * every single leaf node while walking the indexing tree. It checks that the + * leaf node referred from the indexing tree exists, has correct CRC, and does + * some other basic validation. This function is also responsible for building + * an RB-tree of inodes - it adds all inodes into the RB-tree. It also + * calculates reference count, size, etc for each inode in order to later + * compare them to the information stored inside the inodes and detect possible + * inconsistencies. Returns zero in case of success and a negative error code + * in case of failure. 
+ */ +static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + ino_t inum; + void *node; + struct ubifs_ch *ch; + int err, type = key_type(c, &zbr->key); + struct fsck_inode *fscki; + + if (zbr->len < UBIFS_CH_SZ) { + ubifs_err("bad leaf length %d (LEB %d:%d)", + zbr->len, zbr->lnum, zbr->offs); + return -EINVAL; + } + + node = kmalloc(zbr->len, GFP_NOFS); + if (!node) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) { + ubifs_err("cannot read leaf node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + goto out_free; + } + + /* If this is an inode node, add it to RB-tree of inodes */ + if (type == UBIFS_INO_KEY) { + fscki = add_inode(c, priv, node); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while adding inode node", err); + goto out_dump; + } + goto out; + } + + if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && + type != UBIFS_DATA_KEY) { + ubifs_err("unexpected node type %d at LEB %d:%d", + type, zbr->lnum, zbr->offs); + err = -EINVAL; + goto out_free; + } + + ch = node; + if (le64_to_cpu(ch->sqnum) > c->max_sqnum) { + ubifs_err("too high sequence number, max. is %llu", + c->max_sqnum); + err = -EINVAL; + goto out_dump; + } + + if (type == UBIFS_DATA_KEY) { + long long blk_offs; + struct ubifs_data_node *dn = node; + + /* + * Search the inode node this data node belongs to and insert + * it into the RB-tree of inodes. + */ + inum = key_inum_flash(c, &dn->key); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing data node and " + "trying to find inode node %lu", err, inum); + goto out_dump; + } + + /* Make sure the data node is within inode size */ + blk_offs = key_block_flash(c, &dn->key); + blk_offs <<= UBIFS_BLOCK_SHIFT; + blk_offs += le32_to_cpu(dn->size); + if (blk_offs > fscki->size) { + ubifs_err("data node at LEB %d:%d is not within inode " + "size %lld", zbr->lnum, zbr->offs, + fscki->size); + err = -EINVAL; + goto out_dump; + } + } else { + int nlen; + struct ubifs_dent_node *dent = node; + struct fsck_inode *fscki1; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + /* + * Search the inode node this entry refers to and the parent + * inode node and insert them into the RB-tree of inodes. + */ + inum = le64_to_cpu(dent->inum); + fscki = read_add_inode(c, priv, inum); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while processing entry node and " + "trying to find inode node %lu", err, inum); + goto out_dump; + } + + /* Count how many direntries or xentries refer to this inode */ + fscki->references += 1; + + inum = key_inum_flash(c, &dent->key); + fscki1 = read_add_inode(c, priv, inum); + if (IS_ERR(fscki1)) { + err = PTR_ERR(fscki1); + ubifs_err("error %d while processing entry node and " + "trying to find parent inode node %lu", + err, inum); + goto out_dump; + } + + nlen = le16_to_cpu(dent->nlen); + if (type == UBIFS_XENT_KEY) { + fscki1->calc_xcnt += 1; + fscki1->calc_xsz += CALC_DENT_SIZE(nlen); + fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size); + fscki1->calc_xnms += nlen; + } else { + fscki1->calc_sz += CALC_DENT_SIZE(nlen); + if (dent->type == UBIFS_ITYPE_DIR) + fscki1->calc_cnt += 1; + } + } + +out: + kfree(node); + return 0; + +out_dump: + ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, node); +out_free: + kfree(node); + return err; +} + +/** + * free_inodes - free RB-tree of inodes. 
+ * @fsckd: FS checking information + */ +static void free_inodes(struct fsck_data *fsckd) +{ + struct rb_node *this = fsckd->inodes.rb_node; + struct fsck_inode *fscki; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &fscki->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(fscki); + } + } +} + +/** + * check_inodes - checks all inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which walks the + * RB-tree of inodes after the index scan has been finished, and checks that + * inode nlink, size, etc are correct. Returns zero if inodes are fine, + * %-EINVAL if not, and a negative error code in case of failure. + */ +static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + struct rb_node *this = rb_first(&fsckd->inodes); + + while (this) { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_next(this); + + if (S_ISDIR(fscki->mode)) { + /* + * Directories have to have exactly one reference (they + * cannot have hardlinks), although root inode is an + * exception. + */ + if (fscki->inum != UBIFS_ROOT_INO && + fscki->references != 1) { + ubifs_err("directory inode %lu has %d " + "direntries which refer it, but " + "should be 1", fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->inum == UBIFS_ROOT_INO && + fscki->references != 0) { + ubifs_err("root inode %lu has non-zero (%d) " + "direntries which refer it", + fscki->inum, fscki->references); + goto out_dump; + } + if (fscki->calc_sz != fscki->size) { + ubifs_err("directory inode %lu size is %lld, " + "but calculated size is %lld", + fscki->inum, fscki->size, + fscki->calc_sz); + goto out_dump; + } + if (fscki->calc_cnt != fscki->nlink) { + ubifs_err("directory inode %lu nlink is %d, " + "but calculated nlink is %d", + fscki->inum, fscki->nlink, + fscki->calc_cnt); + goto out_dump; + } + } else { + if (fscki->references != fscki->nlink) { + ubifs_err("inode %lu nlink is %d, but " + "calculated nlink is %d", fscki->inum, + fscki->nlink, fscki->references); + goto out_dump; + } + } + if (fscki->xattr_sz != fscki->calc_xsz) { + ubifs_err("inode %lu has xattr size %u, but " + "calculated size is %lld", + fscki->inum, fscki->xattr_sz, + fscki->calc_xsz); + goto out_dump; + } + if (fscki->xattr_cnt != fscki->calc_xcnt) { + ubifs_err("inode %lu has %u xattrs, but " + "calculated count is %lld", fscki->inum, + fscki->xattr_cnt, fscki->calc_xcnt); + goto out_dump; + } + if (fscki->xattr_nms != fscki->calc_xnms) { + ubifs_err("inode %lu has xattr names' size %u, but " + "calculated names' size is %lld", + fscki->inum, fscki->xattr_nms, + fscki->calc_xnms); + goto out_dump; + } + } + + return 0; + +out_dump: + /* Read the bad inode and dump it */ + ino_key_init(c, &key, fscki->inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", fscki->inum); + return -ENOENT; + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, fscki->inum); + return err; + } + + zbr = &znode->zbranch[n]; + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + err = 
ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return err; + } + + ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", + fscki->inum, zbr->lnum, zbr->offs); + dbg_dump_node(c, ino); + kfree(ino); + return -EINVAL; +} + +/** + * dbg_check_filesystem - check the file-system. + * @c: UBIFS file-system description object + * + * This function checks the file system, namely: + * o makes sure that all leaf nodes exist and their CRCs are correct; + * o makes sure inode nlink, size, xattr size/count are correct (for all + * inodes). + * + * The function reads whole indexing tree and all nodes, so it is pretty + * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if + * not, and a negative error code in case of failure. + */ +int dbg_check_filesystem(struct ubifs_info *c) +{ + int err; + struct fsck_data fsckd; + + if (!(ubifs_chk_flags & UBIFS_CHK_FS)) + return 0; + + fsckd.inodes = RB_ROOT; + err = dbg_walk_index(c, check_leaf, NULL, &fsckd); + if (err) + goto out_free; + + err = check_inodes(c, &fsckd); + if (err) + goto out_free; + + free_inodes(&fsckd); + return 0; + +out_free: + ubifs_err("file-system check failed with error %d", err); + dump_stack(); + free_inodes(&fsckd); + return err; +} + +static int invocation_cnt; + +int dbg_force_in_the_gaps(void) +{ + if (!dbg_force_in_the_gaps_enabled) + return 0; + /* Force in-the-gaps every 8th commit */ + return !((invocation_cnt++) & 0x7); +} + +/* Failure mode for recovery testing */ + +#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d)) + +struct failure_mode_info { + struct list_head list; + struct ubifs_info *c; +}; + +static LIST_HEAD(fmi_list); +static DEFINE_SPINLOCK(fmi_lock); + +static unsigned int next; + +static int simple_rand(void) +{ + if (next == 0) + next = current->pid; + next = next * 1103515245 + 12345; + return (next >> 16) & 32767; +} + +void dbg_failure_mode_registration(struct ubifs_info *c) +{ + struct failure_mode_info *fmi; + + fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); + if (!fmi) { + dbg_err("Failed to register failure mode - no memory"); + return; + } + fmi->c = c; + spin_lock(&fmi_lock); + list_add_tail(&fmi->list, &fmi_list); + spin_unlock(&fmi_lock); +} + +void dbg_failure_mode_deregistration(struct ubifs_info *c) +{ + struct failure_mode_info *fmi, *tmp; + + spin_lock(&fmi_lock); + list_for_each_entry_safe(fmi, tmp, &fmi_list, list) + if (fmi->c == c) { + list_del(&fmi->list); + kfree(fmi); + } + spin_unlock(&fmi_lock); +} + +static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc) +{ + struct failure_mode_info *fmi; + + spin_lock(&fmi_lock); + list_for_each_entry(fmi, &fmi_list, list) + if (fmi->c->ubi == desc) { + struct ubifs_info *c = fmi->c; + + spin_unlock(&fmi_lock); + return c; + } + spin_unlock(&fmi_lock); + return NULL; +} + +static int in_failure_mode(struct ubi_volume_desc *desc) +{ + struct ubifs_info *c = dbg_find_info(desc); + + if (c && dbg_failure_mode) + return c->failure_mode; + return 0; +} + +static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) +{ + struct ubifs_info *c = dbg_find_info(desc); + + if (!c || !dbg_failure_mode) + return 0; + if (c->failure_mode) + return 1; + if (!c->fail_cnt) { + /* First call - decide delay to failure */ + if (chance(1, 2)) { + unsigned int delay = 1 << (simple_rand() >> 11); + + if (chance(1, 2)) { + c->fail_delay = 1; + c->fail_timeout = jiffies + + msecs_to_jiffies(delay); + 
dbg_rcvry("failing after %ums", delay); + } else { + c->fail_delay = 2; + c->fail_cnt_max = delay; + dbg_rcvry("failing after %u calls", delay); + } + } + c->fail_cnt += 1; + } + /* Determine if failure delay has expired */ + if (c->fail_delay == 1) { + if (time_before(jiffies, c->fail_timeout)) + return 0; + } else if (c->fail_delay == 2) + if (c->fail_cnt++ < c->fail_cnt_max) + return 0; + if (lnum == UBIFS_SB_LNUM) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in super block LEB %d", lnum); + } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in master LEB %d", lnum); + } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { + if (write) { + if (chance(99, 100)) + return 0; + } else if (chance(399, 400)) + return 0; + dbg_rcvry("failing in log LEB %d", lnum); + } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { + if (write) { + if (chance(7, 8)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in LPT LEB %d", lnum); + } else if (lnum >= c->orph_first && lnum <= c->orph_last) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(9, 10)) + return 0; + dbg_rcvry("failing in orphan LEB %d", lnum); + } else if (lnum == c->ihead_lnum) { + if (chance(99, 100)) + return 0; + dbg_rcvry("failing in index head LEB %d", lnum); + } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { + if (chance(9, 10)) + return 0; + dbg_rcvry("failing in GC head LEB %d", lnum); + } else if (write && !RB_EMPTY_ROOT(&c->buds) && + !ubifs_search_bud(c, lnum)) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in non-bud LEB %d", lnum); + } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || + c->cmt_state == COMMIT_RUNNING_REQUIRED) { + if (chance(999, 1000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit running", lnum); + } else { + if (chance(9999, 10000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit not running", lnum); + } + ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); + c->failure_mode = 1; + dump_stack(); + return 1; +} + +static void cut_data(const void *buf, int len) +{ + int flen, i; + unsigned char *p = (void *)buf; + + flen = (len * (long long)simple_rand()) >> 15; + for (i = flen; i < len; i++) + p[i] = 0xff; +} + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check) +{ + if (in_failure_mode(desc)) + return -EIO; + return ubi_leb_read(desc, lnum, buf, offset, len, check); +} + +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype) +{ + int err; + + if (in_failure_mode(desc)) + return -EIO; + if (do_fail(desc, lnum, 1)) + cut_data(buf, len); + err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); + if (err) + return err; + if (in_failure_mode(desc)) + return -EIO; + return 0; +} + +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 1)) + return -EIO; + err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 1)) + return -EIO; + return 0; +} + +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_erase(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) 
+{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_unmap(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) +{ + if (in_failure_mode(desc)) + return -EIO; + return ubi_is_mapped(desc, lnum); +} + +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_map(desc, lnum, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h new file mode 100644 index 00000000000..3c4f1e93c9e --- /dev/null +++ b/fs/ubifs/debug.h @@ -0,0 +1,403 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_DEBUG_H__ +#define __UBIFS_DEBUG_H__ + +#ifdef CONFIG_UBIFS_FS_DEBUG + +#define UBIFS_DBG(op) op + +#define ubifs_assert(expr) do { \ + if (unlikely(!(expr))) { \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ + dbg_dump_stack(); \ + } \ +} while (0) + +#define ubifs_assert_cmt_locked(c) do { \ + if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ + up_write(&(c)->commit_sem); \ + printk(KERN_CRIT "commit lock is not locked!\n"); \ + ubifs_assert(0); \ + } \ +} while (0) + +#define dbg_dump_stack() do { \ + if (!dbg_failure_mode) \ + dump_stack(); \ +} while (0) + +/* Generic debugging messages */ +#define dbg_msg(fmt, ...) do { \ + spin_lock(&dbg_lock); \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +#define dbg_do_msg(typ, fmt, ...) do { \ + if (ubifs_msg_flags & typ) \ + dbg_msg(fmt, ##__VA_ARGS__); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + spin_lock(&dbg_lock); \ + ubifs_err(fmt, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +const char *dbg_key_str0(const struct ubifs_info *c, + const union ubifs_key *key); +const char *dbg_key_str1(const struct ubifs_info *c, + const union ubifs_key *key); + +/* + * DBGKEY macros require dbg_lock to be held, which it is in the dbg message + * macros. + */ +#define DBGKEY(key) dbg_key_str0(c, (key)) +#define DBGKEY1(key) dbg_key_str1(c, (key)) + +/* General messages */ +#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) + +/* Additional journal messages */ +#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) + +/* Additional TNC messages */ +#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) + +/* Additional lprops messages */ +#define dbg_lp(fmt, ...) 
dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) + +/* Additional LEB find messages */ +#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) + +/* Additional mount messages */ +#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) + +/* Additional I/O messages */ +#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) + +/* Additional commit messages */ +#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) + +/* Additional budgeting messages */ +#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) + +/* Additional log messages */ +#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) + +/* Additional gc messages */ +#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) + +/* Additional scan messages */ +#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) + +/* Additional recovery messages */ +#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) + +/* + * Debugging message type flags (must match msg_type_names in debug.c). + * + * UBIFS_MSG_GEN: general messages + * UBIFS_MSG_JNL: journal messages + * UBIFS_MSG_MNT: mount messages + * UBIFS_MSG_CMT: commit messages + * UBIFS_MSG_FIND: LEB find messages + * UBIFS_MSG_BUDG: budgeting messages + * UBIFS_MSG_GC: garbage collection messages + * UBIFS_MSG_TNC: TNC messages + * UBIFS_MSG_LP: lprops messages + * UBIFS_MSG_IO: I/O messages + * UBIFS_MSG_LOG: log messages + * UBIFS_MSG_SCAN: scan messages + * UBIFS_MSG_RCVRY: recovery messages + */ +enum { + UBIFS_MSG_GEN = 0x1, + UBIFS_MSG_JNL = 0x2, + UBIFS_MSG_MNT = 0x4, + UBIFS_MSG_CMT = 0x8, + UBIFS_MSG_FIND = 0x10, + UBIFS_MSG_BUDG = 0x20, + UBIFS_MSG_GC = 0x40, + UBIFS_MSG_TNC = 0x80, + UBIFS_MSG_LP = 0x100, + UBIFS_MSG_IO = 0x200, + UBIFS_MSG_LOG = 0x400, + UBIFS_MSG_SCAN = 0x800, + UBIFS_MSG_RCVRY = 0x1000, +}; + +/* Debugging message type flags for each default debug message level */ +#define UBIFS_MSG_LVL_0 0 +#define UBIFS_MSG_LVL_1 0x1 +#define UBIFS_MSG_LVL_2 0x7f +#define UBIFS_MSG_LVL_3 0xffff + +/* + * Debugging check flags (must match chk_names in debug.c). + * + * UBIFS_CHK_GEN: general checks + * UBIFS_CHK_TNC: check TNC + * UBIFS_CHK_IDX_SZ: check index size + * UBIFS_CHK_ORPH: check orphans + * UBIFS_CHK_OLD_IDX: check the old index + * UBIFS_CHK_LPROPS: check lprops + * UBIFS_CHK_FS: check the file-system + */ +enum { + UBIFS_CHK_GEN = 0x1, + UBIFS_CHK_TNC = 0x2, + UBIFS_CHK_IDX_SZ = 0x4, + UBIFS_CHK_ORPH = 0x8, + UBIFS_CHK_OLD_IDX = 0x10, + UBIFS_CHK_LPROPS = 0x20, + UBIFS_CHK_FS = 0x40, +}; + +/* + * Special testing flags (must match tst_names in debug.c). 
+ * + * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method + * UBIFS_TST_RCVRY: failure mode for recovery testing + */ +enum { + UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, + UBIFS_TST_RCVRY = 0x4, +}; + +#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 +#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 +#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 +#else +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 +#endif + +#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS +#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff +#else +#define UBIFS_CHK_FLAGS_DEFAULT 0 +#endif + +extern spinlock_t dbg_lock; + +extern unsigned int ubifs_msg_flags; +extern unsigned int ubifs_chk_flags; +extern unsigned int ubifs_tst_flags; + +/* Dump functions */ + +const char *dbg_ntype(int type); +const char *dbg_cstate(int cmt_state); +const char *dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key); +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); +void dbg_dump_node(const struct ubifs_info *c, const void *node); +void dbg_dump_budget_req(const struct ubifs_budget_req *req); +void dbg_dump_lstats(const struct ubifs_lp_stats *lst); +void dbg_dump_budg(struct ubifs_info *c); +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); +void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void dbg_dump_tnc(struct ubifs_info *c); +void dbg_dump_index(struct ubifs_info *c); + +/* Checking helper functions */ + +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); + +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv); + +/* Checking functions */ + +int dbg_check_lprops(struct ubifs_info *c); + +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); + +int dbg_check_cats(struct ubifs_info *c); + +int dbg_check_ltab(struct ubifs_info *c); + +int dbg_check_synced_i_size(struct inode *inode); + +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); + +int dbg_check_tnc(struct ubifs_info *c, int extra); + +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); + +int dbg_check_filesystem(struct ubifs_info *c); + +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos); + +int dbg_check_lprops(struct ubifs_info *c); +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col); + +/* Force the use of in-the-gaps method for testing */ + +#define dbg_force_in_the_gaps_enabled \ + (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) + +int dbg_force_in_the_gaps(void); + +/* Failure mode for recovery testing */ + +#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) + +void dbg_failure_mode_registration(struct ubifs_info *c); +void dbg_failure_mode_deregistration(struct ubifs_info *c); + +#ifndef UBIFS_DBG_PRESERVE_UBI + +#define ubi_leb_read 
dbg_leb_read +#define ubi_leb_write dbg_leb_write +#define ubi_leb_change dbg_leb_change +#define ubi_leb_erase dbg_leb_erase +#define ubi_leb_unmap dbg_leb_unmap +#define ubi_is_mapped dbg_is_mapped +#define ubi_leb_map dbg_leb_map + +#endif + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check); +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype); +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype); +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); + +static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, + int offset, int len) +{ + return dbg_leb_read(desc, lnum, buf, offset, len, 0); +} + +static inline int dbg_write(struct ubi_volume_desc *desc, int lnum, + const void *buf, int offset, int len) +{ + return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); +} + +static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, + const void *buf, int len) +{ + return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); +} + +#else /* !CONFIG_UBIFS_FS_DEBUG */ + +#define UBIFS_DBG(op) +#define ubifs_assert(expr) ({}) +#define ubifs_assert_cmt_locked(c) +#define dbg_dump_stack() +#define dbg_err(fmt, ...) ({}) +#define dbg_msg(fmt, ...) ({}) +#define dbg_key(c, key, fmt, ...) ({}) + +#define dbg_gen(fmt, ...) ({}) +#define dbg_jnl(fmt, ...) ({}) +#define dbg_tnc(fmt, ...) ({}) +#define dbg_lp(fmt, ...) ({}) +#define dbg_find(fmt, ...) ({}) +#define dbg_mnt(fmt, ...) ({}) +#define dbg_io(fmt, ...) ({}) +#define dbg_cmt(fmt, ...) ({}) +#define dbg_budg(fmt, ...) ({}) +#define dbg_log(fmt, ...) ({}) +#define dbg_gc(fmt, ...) ({}) +#define dbg_scan(fmt, ...) ({}) +#define dbg_rcvry(fmt, ...) 
({}) + +#define dbg_ntype(type) "" +#define dbg_cstate(cmt_state) "" +#define dbg_get_key_dump(c, key) ({}) +#define dbg_dump_inode(c, inode) ({}) +#define dbg_dump_node(c, node) ({}) +#define dbg_dump_budget_req(req) ({}) +#define dbg_dump_lstats(lst) ({}) +#define dbg_dump_budg(c) ({}) +#define dbg_dump_lprop(c, lp) ({}) +#define dbg_dump_lprops(c) ({}) +#define dbg_dump_leb(c, lnum) ({}) +#define dbg_dump_znode(c, znode) ({}) +#define dbg_dump_heap(c, heap, cat) ({}) +#define dbg_dump_pnode(c, pnode, parent, iip) ({}) +#define dbg_dump_tnc(c) ({}) +#define dbg_dump_index(c) ({}) + +#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 + +#define dbg_old_index_check_init(c, zroot) 0 +#define dbg_check_old_index(c, zroot) 0 + +#define dbg_check_cats(c) 0 + +#define dbg_check_ltab(c) 0 + +#define dbg_check_synced_i_size(inode) 0 + +#define dbg_check_dir_size(c, dir) 0 + +#define dbg_check_tnc(c, x) 0 + +#define dbg_check_idx_size(c, idx_size) 0 + +#define dbg_check_filesystem(c) 0 + +#define dbg_check_heap(c, heap, cat, add_pos) ({}) + +#define dbg_check_lprops(c) 0 +#define dbg_check_lpt_nodes(c, cnode, row, col) 0 + +#define dbg_force_in_the_gaps_enabled 0 +#define dbg_force_in_the_gaps() 0 + +#define dbg_failure_mode 0 +#define dbg_failure_mode_registration(c) ({}) +#define dbg_failure_mode_deregistration(c) ({}) + +#endif /* !CONFIG_UBIFS_FS_DEBUG */ + +#endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c new file mode 100644 index 00000000000..e90374be7d3 --- /dev/null +++ b/fs/ubifs/dir.c @@ -0,0 +1,1240 @@ +/* * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + * Zoltan Sogor + */ + +/* + * This file implements directory operations. + * + * All FS operations in this file allocate the budget before writing anything to the + * media. If they fail to allocate it, the error is returned. The only + * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even + * if they are unable to allocate the budget, because a deletion failing with + * %-ENOSPC is not what users are usually ready to get. The UBIFS budgeting + * subsystem has some space reserved for these purposes. + * + * All operations in this file write all inodes which they change straight + * away, instead of marking them dirty. For example, 'ubifs_link()' changes + * @i_size of the parent inode and writes the parent inode together with the + * target inode. This was done to simplify file-system recovery which would + * otherwise be very difficult to do. The only exception is rename, which marks + * the re-named inode dirty (because its @i_ctime is updated) but does not + * write it straight away. + */ + +#include "ubifs.h" + +/** + * inherit_flags - inherit flags of the parent inode. 
+ * @dir: parent inode + * @mode: new inode mode flags + * + * This is a helper function for 'ubifs_new_inode()' which inherits the flags of the + * parent directory inode @dir. UBIFS inodes inherit the following flags: + * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on a + * sub-directory basis; + * o %UBIFS_SYNC_FL - useful for the same reasons; + * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories. + * + * This function returns the inherited flags. + */ +static int inherit_flags(const struct inode *dir, int mode) +{ + int flags; + const struct ubifs_inode *ui = ubifs_inode(dir); + + if (!S_ISDIR(dir->i_mode)) + /* + * The parent is not a directory, which means that an extended + * attribute inode is being created. No flags. + */ + return 0; + + flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL); + if (!S_ISDIR(mode)) + /* The "DIRSYNC" flag only applies to directories */ + flags &= ~UBIFS_DIRSYNC_FL; + return flags; +} + +/** + * ubifs_new_inode - allocate new UBIFS inode object. + * @c: UBIFS file-system description object + * @dir: parent directory inode + * @mode: inode mode flags + * + * This function finds an unused inode number, allocates a new inode and + * initializes it. Returns the new inode in case of success and an error code in + * case of failure. + */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode) +{ + struct inode *inode; + struct ubifs_inode *ui; + + inode = new_inode(c->vfs_sb); + ui = ubifs_inode(inode); + if (!inode) + return ERR_PTR(-ENOMEM); + + /* + * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and + * marking them dirty in the file write path (see 'file_update_time()'). + * UBIFS has to fully control "clean <-> dirty" transitions of inodes + * to make budgeting work. 
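 * ('file_update_time()' checks 'S_NOCMTIME' and returns without touching such inodes, so the VFS cannot dirty them behind the back of UBIFS.)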
+ */ + inode->i_flags |= (S_NOCMTIME); + + inode->i_uid = current->fsuid; + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + inode->i_mtime = inode->i_atime = inode->i_ctime = + ubifs_current_time(inode); + inode->i_mapping->nrpages = 0; + /* Disable readahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ; + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + break; + case S_IFSOCK: + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + inode->i_op = &ubifs_file_inode_operations; + break; + default: + BUG(); + } + + ui->flags = inherit_flags(dir, mode); + ubifs_set_inode_flags(inode); + if (S_ISREG(mode)) + ui->compr_type = c->default_compr; + else + ui->compr_type = UBIFS_COMPR_NONE; + ui->synced_i_size = 0; + + spin_lock(&c->cnt_lock); + /* Inode number overflow is currently not supported */ + if (c->highest_inum >= INUM_WARN_WATERMARK) { + if (c->highest_inum >= INUM_WATERMARK) { + spin_unlock(&c->cnt_lock); + ubifs_err("out of inode numbers"); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(-EINVAL); + } + ubifs_warn("running out of inode numbers (current %lu, max %d)", + c->highest_inum, INUM_WATERMARK); + } + + inode->i_ino = ++c->highest_inum; + inode->i_generation = ++c->vfs_gen; + /* + * The creation sequence number remains with this inode for its + * lifetime. All nodes for this inode have a greater sequence number, + * and so it is possible to distinguish obsolete nodes belonging to a + * previous incarnation of the same inode number - for example, for the + * purpose of rebuilding the index. + */ + ui->creat_sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + return inode; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) +{ + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (le16_to_cpu(dent->nlen) != nm->len) + return -EINVAL; + if (memcmp(dent->name, nm->name, nm->len)) + return -EINVAL; + return 0; +} + +#else + +#define dbg_check_name(dent, nm) 0 + +#endif + +static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int err; + union ubifs_key key; + struct inode *inode = NULL; + struct ubifs_dent_node *dent; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("'%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (dentry->d_name.len > UBIFS_MAX_NLEN) + return ERR_PTR(-ENAMETOOLONG); + + dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); + if (!dent) + return ERR_PTR(-ENOMEM); + + dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + + err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); + if (err) { + /* + * Do not hash the direntry if parent 'i_nlink' is zero, because + * this has side-effects - '->delete_inode()' call will not be + * called for the parent orphan inode, because 'd_count' of its + * direntry will stay 1 (it'll be negative direntry I guess) + * and prevent 'iput_final()' until the dentry is destroyed due + * to unmount or memory pressure. 
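 * (This is why the -ENOENT result is turned into a hashed negative dentry below only when the parent's 'i_nlink' is not zero.)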
+ */ + if (err == -ENOENT && dir->i_nlink != 0) { + dbg_gen("not found"); + goto done; + } + goto out; + } + + if (dbg_check_name(dent, &dentry->d_name)) { + err = -EINVAL; + goto out; + } + + inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); + if (IS_ERR(inode)) { + /* + * This should not happen. Probably the file-system needs + * checking. + */ + err = PTR_ERR(inode); + ubifs_err("dead directory entry '%.*s', error %d", + dentry->d_name.len, dentry->d_name.name, err); + ubifs_ro_mode(c, err); + goto out; + } + +done: + kfree(dent); + /* + * Note, d_splice_alias() would be required instead if we supported + * NFS. + */ + d_add(dentry, inode); + return NULL; + +out: + kfree(dent); + return ERR_PTR(err); +} + +static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1 }; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + + /* + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + ubifs_err("cannot create regular file, error %d", err); + return err; +} + +/** + * vfs_dent_type - get VFS directory entry type. + * @type: UBIFS directory entry type + * + * This function converts UBIFS directory entry type into VFS directory entry + * type. + */ +static unsigned int vfs_dent_type(uint8_t type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return DT_REG; + case UBIFS_ITYPE_DIR: + return DT_DIR; + case UBIFS_ITYPE_LNK: + return DT_LNK; + case UBIFS_ITYPE_BLK: + return DT_BLK; + case UBIFS_ITYPE_CHR: + return DT_CHR; + case UBIFS_ITYPE_FIFO: + return DT_FIFO; + case UBIFS_ITYPE_SOCK: + return DT_SOCK; + default: + BUG(); + } + return 0; +} + +/* + * The classical Unix view for directory is that it is a linear array of + * (name, inode number) entries. Linux/VFS assumes this model as well. + * Particularly, 'readdir()' call wants us to return a directory entry offset + * which later may be used to continue 'readdir()'ing the directory or to + * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this + * model because directory entries are identified by keys, which may collide. + * + * UBIFS uses directory entry hash value for directory offsets, so + * 'seekdir()'/'telldir()' may not always work because of possible key + * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work + * properly by means of saving full directory entry name in the private field + * of the file description object. 
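 * I.e., @file->f_pos holds the hash of the last entry returned, and @file->private_data holds that entry itself, so that the next 'readdir()' can continue from the exact name even if hashes collide.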
+ * + * This means that UBIFS cannot support NFS which requires full + * 'seekdir()'/'telldir()' support. + */ +static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int err, over = 0; + struct qstr nm; + union ubifs_key key; + struct ubifs_dent_node *dent; + struct inode *dir = file->f_path.dentry->d_inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + + if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + /* + * The directory was seek'ed to a senseless position or there + * are no more entries. + */ + return 0; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (file->f_pos == 0) { + ubifs_assert(!file->private_data); + over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); + if (over) + return 0; + file->f_pos = 1; + } + + if (file->f_pos == 1) { + ubifs_assert(!file->private_data); + over = filldir(dirent, "..", 2, 1, + parent_ino(file->f_path.dentry), DT_DIR); + if (over) + return 0; + + /* Find the first entry in TNC and save it */ + lowest_dent_key(c, &key, dir->i_ino); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + dent = file->private_data; + if (!dent) { + /* + * The directory was seek'ed to and is now readdir'ed. + * Find the entry corresponding to @file->f_pos or the + * closest one. + */ + dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + nm.name = NULL; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + } + + while (1) { + dbg_gen("feed '%s', ino %llu, new f_pos %#x", + dent->name, le64_to_cpu(dent->inum), + key_hash_flash(c, &dent->key)); + ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); + + nm.len = le16_to_cpu(dent->nlen); + over = filldir(dirent, dent->name, nm.len, file->f_pos, + le64_to_cpu(dent->inum), + vfs_dent_type(dent->type)); + if (over) + return 0; + + /* Switch to the next entry */ + key_read(c, &dent->key, &key); + nm.name = dent->name; + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + goto out; + } + + kfree(file->private_data); + file->f_pos = key_hash_flash(c, &dent->key); + file->private_data = dent; + cond_resched(); + } + +out: + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + kfree(file->private_data); + file->private_data = NULL; + file->f_pos = 2; + return 0; +} + +/* If a directory is seeked, we have to free saved readdir() state */ +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + kfree(file->private_data); + file->private_data = NULL; + return generic_file_llseek(file, offset, origin); +} + +/* Free saved readdir() state when the directory is closed */ +static int ubifs_dir_release(struct inode *dir, struct file *file) +{ + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * lock_2_inodes - lock two UBIFS inodes. 
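[Editorial note] A user-space model of the hash-based directory position used by the readdir code above: entries kept ordered by (hash, name) stand in for the index, the reported position is the name hash, and the last returned name is remembered so iteration resumes correctly even across hash collisions. Names, hash values and helper functions are invented for the example:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy directory entry: a name and its 32-bit key hash (collisions allowed). */
struct toy_dent {
	uint32_t hash;
	const char *name;
};

/* Entries kept ordered by (hash, name), standing in for the on-flash index. */
static const struct toy_dent dents[] = {
	{ 0x10, "alpha" },
	{ 0x2a, "bravo" },
	{ 0x2a, "charlie" },	/* hash collision with "bravo" */
	{ 0x77, "delta" },
};
#define NDENTS (sizeof(dents) / sizeof(dents[0]))

/*
 * Analogue of "find the next entry": if @name is NULL, return the first entry
 * whose hash is >= @hash; otherwise return the first entry strictly after
 * (@hash, @name).  Returns NULL when the directory is exhausted.
 */
static const struct toy_dent *next_ent(uint32_t hash, const char *name)
{
	for (size_t i = 0; i < NDENTS; i++) {
		if (dents[i].hash < hash)
			continue;
		if (!name)
			return &dents[i];
		if (dents[i].hash > hash || strcmp(dents[i].name, name) > 0)
			return &dents[i];
	}
	return NULL;
}

int main(void)
{
	/* Readdir state: the position is the hash, the saved name breaks ties. */
	uint32_t pos = 0;
	const char *saved = NULL;
	const struct toy_dent *d;
	int count = 0;

	while ((d = next_ent(pos, saved)) != NULL) {
		printf("pos %#x: %s\n", (unsigned)d->hash, d->name);
		pos = d->hash;		/* what telldir() would report */
		saved = d->name;	/* what survives across readdir() calls */
		count++;
	}
	assert(count == (int)NDENTS);	/* both colliding entries were seen */
	return 0;
}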
+ * @inode1: first inode + * @inode2: second inode + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + if (inode1->i_ino < inode2->i_ino) { + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3); + } else { + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3); + } +} + +/** + * unlock_2_inodes - unlock two UBIFS inodes inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); +} + +static int ubifs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = old_dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, + .dirtied_ino_d = ui->data_len }; + + /* + * Budget request settings: new direntry, changing the target inode, + * changing the parent inode. + */ + + dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + lock_2_inodes(dir, inode); + inc_nlink(inode); + atomic_inc(&inode->i_count); + inode->i_ctime = ubifs_current_time(inode); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(inode); + unlock_2_inodes(dir, inode); + ubifs_release_budget(c, &req); + iput(inode); + return err; +} + +static int ubifs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode (+1 for + * @dirtied_ino), changing the parent directory inode. If budgeting + * fails, go ahead anyway because we have extra space reserved for + * deletions. 
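[Editorial note] A sketch of the lock-ordering idea behind lock_2_inodes(): always acquire the lock of the lower-numbered inode first, so any two tasks locking the same pair agree on the order and cannot deadlock against each other. Plain pthread mutexes stand in for ui_mutex, and the lockdep nesting annotations are omitted:

#include <pthread.h>

/* Toy inode: just an identity and a lock. */
struct toy_inode {
	unsigned long ino;
	pthread_mutex_t lock;
};

/* Take the lock of the lower-numbered inode first, as lock_2_inodes() does. */
static void lock_2(struct toy_inode *a, struct toy_inode *b)
{
	if (a->ino > b->ino) {
		struct toy_inode *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(&a->lock);
	pthread_mutex_lock(&b->lock);
}

static void unlock_2(struct toy_inode *a, struct toy_inode *b)
{
	pthread_mutex_unlock(&a->lock);
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct toy_inode dir = { 2, PTHREAD_MUTEX_INITIALIZER };
	struct toy_inode file = { 17, PTHREAD_MUTEX_INITIALIZER };

	/* Same effective order no matter how the arguments are passed. */
	lock_2(&dir, &file);
	unlock_2(&dir, &file);
	lock_2(&file, &dir);
	unlock_2(&file, &dir);
	return 0;
}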
+ */ + + dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, inode->i_ino, + inode->i_nlink, dir->i_ino); + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + err = 0; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + drop_nlink(inode); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_dir_empty - check if a directory is empty or not. + * @c: UBIFS file-system description object + * @dir: VFS inode object of the directory to check + * + * This function checks if directory @dir is empty. Returns zero if the + * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes + * in case of of errors. + */ +static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +{ + struct qstr nm = { .name = NULL }; + struct ubifs_dent_node *dent; + union ubifs_key key; + int err; + + lowest_dent_key(c, &key, dir->i_ino); + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + err = 0; + } else { + kfree(dent); + err = -ENOTEMPTY; + } + return err; +} + +static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, budgeted = 1; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + + /* + * Budget request settings: deletion direntry, deletion inode and + * changing the parent inode. If budgeting fails, go ahead anyway + * because we have extra space reserved for deletions. 
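[Editorial note] A toy model of why the unlink/rmdir paths above proceed even when budgeting fails: a deletion frees more space than its bookkeeping costs, it can fall back on space kept in reserve for deletions, and afterwards the sticky "no space" state is cleared. The numbers and field names are invented:

#include <assert.h>
#include <errno.h>

struct toy_budget {
	long free_bytes;	/* ordinary budgetable space */
	long deletion_reserve;	/* kept back so deletions always succeed */
	int nospace;		/* sticky "no space" flag */
};

static int toy_budget_space(struct toy_budget *b, long need)
{
	if (b->free_bytes < need) {
		b->nospace = 1;
		return -ENOSPC;
	}
	b->free_bytes -= need;
	return 0;
}

/* A deletion is never refused: it ends up freeing space. */
static void toy_delete(struct toy_budget *b, long bookkeeping, long freed)
{
	int budgeted = (toy_budget_space(b, bookkeeping) == 0);

	if (!budgeted) {
		/* Go ahead anyway: the bookkeeping comes from the reserve. */
		assert(b->deletion_reserve >= bookkeeping);
		b->nospace = 0;			/* we are about to free space */
	}
	b->free_bytes += freed;
	if (budgeted)
		b->free_bytes += bookkeeping;	/* release the budget again */
}

int main(void)
{
	struct toy_budget b = { .free_bytes = 10, .deletion_reserve = 64 };
	int err;

	/* An ordinary write cannot be budgeted... */
	err = toy_budget_space(&b, 100);
	assert(err == -ENOSPC && b.nospace);
	/* ...but a deletion still goes ahead and clears the flag. */
	toy_delete(&b, 48, 4096);
	assert(!b.nospace && b.free_bytes == 10 + 4096);
	return 0;
}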
+ */ + + dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, inode->i_ino, dir->i_ino); + + err = check_dir_empty(c, dentry->d_inode); + if (err) + return err; + + err = ubifs_budget_space(c, &req); + if (err) { + if (err != -ENOSPC) + return err; + budgeted = 0; + } + + lock_2_inodes(dir, inode); + inode->i_ctime = ubifs_current_time(dir); + clear_nlink(inode); + drop_nlink(dir); + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + unlock_2_inodes(dir, inode); + + if (budgeted) + ubifs_release_budget(c, &req); + else { + /* We've deleted something - clean the "no space" flags */ + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } + return 0; + +out_cancel: + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + inc_nlink(dir); + inc_nlink(inode); + inc_nlink(inode); + unlock_2_inodes(dir, inode); + if (budgeted) + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino_d = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&dir_ui->ui_mutex); + insert_inode_hash(inode); + inc_nlink(inode); + inc_nlink(dir); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) { + ubifs_err("cannot create directory, error %d", err); + goto out_cancel; + } + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + drop_nlink(dir); + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + union ubifs_dev_desc *dev = NULL; + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, devlen = 0; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = devlen, .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. 
+ */ + + dbg_gen("dent '%.*s' in dir ino %lu", + dentry->d_name.len, dentry->d_name.name, dir->i_ino); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + if (S_ISBLK(mode) || S_ISCHR(mode)) { + dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!dev) + return -ENOMEM; + devlen = ubifs_encode_dev(dev, rdev); + } + + err = ubifs_budget_space(c, &req); + if (err) { + kfree(dev); + return err; + } + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + kfree(dev); + err = PTR_ERR(inode); + goto out_budg; + } + + init_special_inode(inode, inode->i_mode, rdev); + inode->i_size = ubifs_inode(inode)->ui_size = devlen; + ui = ubifs_inode(inode); + ui->data = dev; + ui->data_len = devlen; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +static int ubifs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct ubifs_inode *ui; + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_info *c = dir->i_sb->s_fs_info; + int err, len = strlen(symname); + int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = len, .dirtied_ino = 1 }; + + /* + * Budget request settings: new inode, new direntry and changing parent + * directory inode. + */ + + dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len, + dentry->d_name.name, symname, dir->i_ino); + + if (len > UBIFS_MAX_INO_DATA) + return -ENAMETOOLONG; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + ui = ubifs_inode(inode); + ui->data = kmalloc(len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_inode; + } + + memcpy(ui->data, symname, len); + ((char *)ui->data)[len] = '\0'; + /* + * The terminating zero byte is not written to the flash media and it + * is put just to make later in-memory string processing simpler. Thus, + * data length is @len, not @len + %1. + */ + ui->data_len = len; + inode->i_size = ubifs_inode(inode)->ui_size = len; + + mutex_lock(&dir_ui->ui_mutex); + dir->i_size += sz_change; + dir_ui->ui_size = dir->i_size; + dir->i_mtime = dir->i_ctime = inode->i_ctime; + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + d_instantiate(dentry, inode); + return 0; + +out_cancel: + dir->i_size -= sz_change; + dir_ui->ui_size = dir->i_size; + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * lock_3_inodes - lock three UBIFS inodes for rename. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + * + * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may + * be null. 
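[Editorial note] A small sketch of the symlink-target convention noted in the code above: the string is NUL-terminated in memory so string helpers can use it directly, but the stored length excludes the terminator. Helper names are illustrative only:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

struct toy_symlink {
	char *data;
	size_t data_len;	/* on-media length, excludes the trailing '\0' */
};

static int toy_symlink_set(struct toy_symlink *s, const char *target)
{
	size_t len = strlen(target);

	s->data = malloc(len + 1);
	if (!s->data)
		return -1;
	memcpy(s->data, target, len);
	s->data[len] = '\0';	/* in-memory convenience only */
	s->data_len = len;
	return 0;
}

int main(void)
{
	struct toy_symlink s;
	int err = toy_symlink_set(&s, "/mnt/ubifs/target");

	assert(err == 0);
	assert(s.data_len == strlen("/mnt/ubifs/target"));
	assert(strcmp(s.data, "/mnt/ubifs/target") == 0);
	free(s.data);
	return 0;
}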
+ */ +static void lock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + struct inode *i1, *i2, *i3; + + if (!inode3) { + if (inode1 != inode2) { + lock_2_inodes(inode1, inode2); + return; + } + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + return; + } + + if (inode1 == inode2) { + lock_2_inodes(inode1, inode3); + return; + } + + /* 3 different inodes */ + if (inode1 < inode2) { + i3 = inode2; + if (inode1 < inode3) { + i1 = inode1; + i2 = inode3; + } else { + i1 = inode3; + i2 = inode1; + } + } else { + i3 = inode1; + if (inode2 < inode3) { + i1 = inode2; + i2 = inode3; + } else { + i1 = inode3; + i2 = inode2; + } + } + mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1); + lock_2_inodes(i2, i3); +} + +/** + * unlock_3_inodes - unlock three UBIFS inodes for rename. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + */ +static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + if (inode3) + mutex_unlock(&ubifs_inode(inode3)->ui_mutex); +} + +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ubifs_info *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); + int err, release, sync = 0, move = (new_dir != old_dir); + int is_dir = S_ISDIR(old_inode->i_mode); + int unlink = !!new_inode; + int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); + int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, + .dirtied_ino = 3 }; + struct ubifs_budget_req ino_req = { .dirtied_ino = 1, + .dirtied_ino_d = old_inode_ui->data_len }; + struct timespec time; + + /* + * Budget request settings: deletion direntry, new direntry, removing + * the old inode, and changing old and new parent directory inodes. + * + * However, this operation also marks the target inode as dirty and + * does not write it, so we allocate budget for the target inode + * separately. + */ + + dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " + "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, + old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + + if (unlink && is_dir) { + err = check_dir_empty(c, new_inode); + if (err) + return err; + } + + err = ubifs_budget_space(c, &req); + if (err) + return err; + err = ubifs_budget_space(c, &ino_req); + if (err) { + ubifs_release_budget(c, &req); + return err; + } + + lock_3_inodes(old_dir, new_dir, new_inode); + + /* + * Like most other Unix systems, set the @i_ctime for inodes on a + * rename. + */ + time = ubifs_current_time(old_dir); + old_inode->i_ctime = time; + + /* We must adjust parent link count when renaming directories */ + if (is_dir) { + if (move) { + /* + * @old_dir loses a link because we are moving + * @old_inode to a different directory. + */ + drop_nlink(old_dir); + /* + * @new_dir only gains a link if we are not also + * overwriting an existing directory. + */ + if (!unlink) + inc_nlink(new_dir); + } else { + /* + * @old_inode is not moving to a different directory, + * but @old_dir still loses a link if we are + * overwriting an existing directory. 
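[Editorial note] The parent link-count adjustments made by the rename path above, expressed as a pure function and checked for the four directory cases (cross-directory move or not, overwriting an existing target or not). This only models the parent directories, not the replaced inode's own i_nlink:

#include <assert.h>

/*
 * Parent link-count changes when the renamed object is a directory.
 * @move is non-zero for a cross-directory rename; @overwrite is non-zero when
 * an existing target entry is replaced.
 */
static void dir_nlink_delta(int move, int overwrite,
			    int *old_dir_delta, int *new_dir_delta)
{
	*old_dir_delta = *new_dir_delta = 0;
	if (move) {
		/* ".." of the moved directory leaves old_dir... */
		*old_dir_delta -= 1;
		/* ...and enters new_dir, unless it replaces a directory there. */
		if (!overwrite)
			*new_dir_delta += 1;
	} else if (overwrite) {
		/* Same parent, but an existing subdirectory is overwritten. */
		*old_dir_delta -= 1;
	}
}

int main(void)
{
	int od, nd;

	dir_nlink_delta(1, 0, &od, &nd);  assert(od == -1 && nd == +1);
	dir_nlink_delta(1, 1, &od, &nd);  assert(od == -1 && nd ==  0);
	dir_nlink_delta(0, 1, &od, &nd);  assert(od == -1 && nd ==  0);
	dir_nlink_delta(0, 0, &od, &nd);  assert(od ==  0 && nd ==  0);
	return 0;
}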
+ */ + if (unlink) + drop_nlink(old_dir); + } + } + + old_dir->i_size -= old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + old_dir->i_mtime = old_dir->i_ctime = time; + new_dir->i_mtime = new_dir->i_ctime = time; + + /* + * And finally, if we unlinked a direntry which happened to have the + * same name as the moved direntry, we have to decrement @i_nlink of + * the unlinked inode and change its ctime. + */ + if (unlink) { + /* + * Directories cannot have hard-links, so if this is a + * directory, decrement its @i_nlink twice because an empty + * directory has @i_nlink 2. + */ + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = time; + drop_nlink(new_inode); + } else { + new_dir->i_size += new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + + /* + * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode + * is dirty, because this will be done later on at the end of + * 'ubifs_rename()'. + */ + if (IS_SYNC(old_inode)) { + sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); + if (unlink && IS_SYNC(new_inode)) + sync = 1; + } + err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, + sync); + if (err) + goto out_cancel; + + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &req); + + mutex_lock(&old_inode_ui->ui_mutex); + release = old_inode_ui->dirty; + mark_inode_dirty_sync(old_inode); + mutex_unlock(&old_inode_ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &ino_req); + if (IS_SYNC(old_inode)) + err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + return err; + +out_cancel: + if (unlink) { + if (is_dir) + inc_nlink(new_inode); + inc_nlink(new_inode); + } else { + new_dir->i_size -= new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + old_dir->i_size += old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + if (is_dir) { + if (move) { + inc_nlink(old_dir); + if (!unlink) + drop_nlink(new_dir); + } else { + if (unlink) + inc_nlink(old_dir); + } + } + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &ino_req); + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + loff_t size; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = UBIFS_BLOCK_SIZE; + stat->size = ui->ui_size; + + /* + * Unfortunately, the 'stat()' system call was designed for block + * device based file systems, and it is not appropriate for UBIFS, + * because UBIFS does not have notion of "block". For example, it is + * difficult to tell how many block a directory takes - it actually + * takes less than 300 bytes, but we have to round it to block size, + * which introduces large mistake. This makes utilities like 'du' to + * report completely senseless numbers. This is the reason why UBIFS + * goes the same way as JFFS2 - it reports zero blocks for everything + * but regular files, which makes more sense than reporting completely + * wrong sizes. 
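[Editorial note] A sketch of the st_blocks computation above for regular files: round the data-plus-xattr byte count up to the file-system block size, then report it in the 512-byte units user space expects. The 4 KiB block size is a stand-in value for the example:

#include <assert.h>
#include <stdint.h>

#define TOY_BLOCK_SIZE 4096u	/* stand-in for the file-system block size */

static uint64_t toy_stat_blocks(uint64_t data_bytes, uint64_t xattr_bytes)
{
	uint64_t size = data_bytes + xattr_bytes;

	/* Round up to the block size, then convert to 512-byte units. */
	size = (size + TOY_BLOCK_SIZE - 1) & ~(uint64_t)(TOY_BLOCK_SIZE - 1);
	return size >> 9;
}

int main(void)
{
	assert(toy_stat_blocks(1, 0) == TOY_BLOCK_SIZE / 512);	/* 8 */
	assert(toy_stat_blocks(4096, 0) == 8);
	assert(toy_stat_blocks(4097, 100) == 16);
	return 0;
}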
+ */ + if (S_ISREG(inode->i_mode)) { + size = ui->xattr_size; + size += stat->size; + size = ALIGN(size, UBIFS_BLOCK_SIZE); + /* + * Note, user-space expects 512-byte blocks count irrespectively + * of what was reported in @stat->size. + */ + stat->blocks = size >> 9; + } else + stat->blocks = 0; + mutex_unlock(&ui->ui_mutex); + return 0; +} + +struct inode_operations ubifs_dir_inode_operations = { + .lookup = ubifs_lookup, + .create = ubifs_create, + .link = ubifs_link, + .symlink = ubifs_symlink, + .unlink = ubifs_unlink, + .mkdir = ubifs_mkdir, + .rmdir = ubifs_rmdir, + .mknod = ubifs_mknod, + .rename = ubifs_rename, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +struct file_operations ubifs_dir_operations = { + .llseek = ubifs_dir_llseek, + .release = ubifs_dir_release, + .read = generic_read_dir, + .readdir = ubifs_readdir, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c new file mode 100644 index 00000000000..005a3b854d9 --- /dev/null +++ b/fs/ubifs/file.c @@ -0,0 +1,1275 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements VFS file and inode operations of regular files, device + * nodes and symlinks as well as address space operations. + * + * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the + * page is dirty and is used for budgeting purposes - dirty pages should not be + * budgeted. The PG_checked flag is set if full budgeting is required for the + * page e.g., when it corresponds to a file hole or it is just beyond the file + * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to + * fail in this function, and the budget is released in 'ubifs_write_end()'. So + * the PG_private and PG_checked flags carry the information about how the page + * was budgeted, to make it possible to release the budget properly. + * + * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations + * we implement. However, this is not true for '->writepage()', which might be + * called with 'i_mutex' unlocked. For example, when pdflush is performing + * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the + * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is + * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim + * path'. So, in '->writepage()' we are only guaranteed that the page is + * locked. 
+ * + * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., + * readahead path does not have it locked ("sys_read -> generic_file_aio_read + * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is + * not set as well. However, UBIFS disables readahead. + * + * This, for example means that there might be 2 concurrent '->writepage()' + * calls for the same inode, but different inode dirty pages. + */ + +#include "ubifs.h" +#include + +static int read_block(struct inode *inode, void *addr, unsigned int block, + struct ubifs_data_node *dn) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err, len, out_len; + union ubifs_key key; + unsigned int dlen; + + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_tnc_lookup(c, &key, dn); + if (err) { + if (err == -ENOENT) + /* Not found, so it must be a hole */ + memset(addr, 0, UBIFS_BLOCK_SIZE); + return err; + } + + ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto dump; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto dump; + + /* + * Data length can be less than a full block, even for blocks that are + * not the last in the file (e.g., as a result of making a hole and + * appending data). Ensure that the remainder is zeroed out. + */ + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + return 0; + +dump: + ubifs_err("bad data node (block %u, inode %lu)", + block, inode->i_ino); + dbg_dump_node(c, dn); + return -EINVAL; +} + +static int do_readpage(struct page *page) +{ + void *addr; + int err = 0, i; + unsigned int block, beyond; + struct ubifs_data_node *dn; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + ubifs_assert(!PageChecked(page)); + ubifs_assert(!PagePrivate(page)); + + addr = kmap(page); + + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + if (block >= beyond) { + /* Reading beyond inode */ + SetPageChecked(page); + memset(addr, 0, PAGE_CACHE_SIZE); + goto out; + } + + dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); + if (!dn) { + err = -ENOMEM; + goto error; + } + + i = 0; + while (1) { + int ret; + + if (block >= beyond) { + /* Reading beyond inode */ + err = -ENOENT; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else { + ret = read_block(inode, addr, block, dn); + if (ret) { + err = ret; + if (err != -ENOENT) + break; + } + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += UBIFS_BLOCK_SIZE; + } + if (err) { + if (err == -ENOENT) { + /* Not found, so it must be a hole */ + SetPageChecked(page); + dbg_gen("hole"); + goto out_free; + } + ubifs_err("cannot read page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + goto error; + } + +out_free: + kfree(dn); +out: + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + return 0; + +error: + kfree(dn); + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + return err; +} + +/** + * release_new_page_budget - release budget of a new page. 
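[Editorial note] A user-space model of the page-read logic above: each block of the page is looked up individually, a missing block is treated as a hole and zero-filled, and a short block has its tail zeroed. The lookup callback, block count and sizes are invented for the example:

#include <assert.h>
#include <stddef.h>
#include <string.h>

#define TOY_BLOCK_SIZE      4096
#define TOY_BLOCKS_PER_PAGE 4	/* e.g. a 16 KiB page of 4 KiB blocks */

/* Returns the number of valid bytes written to @buf, or < 0 for "no data". */
typedef int (*toy_lookup_t)(unsigned int block, char *buf);

static void toy_read_page(char *page, unsigned int first_block,
			  toy_lookup_t lookup)
{
	for (int i = 0; i < TOY_BLOCKS_PER_PAGE; i++) {
		char *dst = page + (size_t)i * TOY_BLOCK_SIZE;
		int len = lookup(first_block + i, dst);

		if (len < 0)
			len = 0;			/* hole */
		if (len < TOY_BLOCK_SIZE)		/* zero the remainder */
			memset(dst + len, 0, TOY_BLOCK_SIZE - len);
	}
}

/* A toy index: block 0 holds 5 bytes of data, everything else is a hole. */
static int toy_lookup(unsigned int block, char *buf)
{
	if (block != 0)
		return -1;
	memcpy(buf, "hello", 5);
	return 5;
}

int main(void)
{
	char page[TOY_BLOCK_SIZE * TOY_BLOCKS_PER_PAGE];

	toy_read_page(page, 0, toy_lookup);
	assert(memcmp(page, "hello", 5) == 0);
	assert(page[5] == 0);			/* tail of a short block */
	assert(page[TOY_BLOCK_SIZE] == 0);	/* a hole block */
	return 0;
}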
+ * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of one new page of data. + */ +static void release_new_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; + + ubifs_release_budget(c, &req); +} + +/** + * release_existing_page_budget - release budget of an existing page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of changing one one page of data which already exists on the flash media. + */ +static void release_existing_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .dd_growth = c->page_budget}; + + ubifs_release_budget(c, &req); +} + +static int write_begin_slow(struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct ubifs_budget_req req = { .new_page = 1 }; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + inode->i_ino, pos, len, inode->i_size); + + /* + * At the slow path we have to budget before locking the page, because + * budgeting may force write-back, which would wait on locked pages and + * deadlock if we had the page locked. At this point we do not know + * anything about the page, so assume that this is a new page which is + * written to a hole. This corresponds to largest budget. Later the + * budget will be amended if this is not true. + */ + if (appending) + /* We are appending data, budget for inode change */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) + return err; + + page = __grab_cache_page(mapping, index); + if (unlikely(!page)) { + ubifs_release_budget(c, &req); + return -ENOMEM; + } + + if (!PageUptodate(page)) { + if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + if (PagePrivate(page)) + /* + * The page is dirty, which means it was budgeted twice: + * o first time the budget was allocated by the task which + * made the page dirty and set the PG_private flag; + * o and then we budgeted for it for the second time at the + * very beginning of this function. + * + * So what we have to do is to release the page budget we + * allocated. + */ + release_new_page_budget(c); + else if (!PageChecked(page)) + /* + * We are changing a page which already exists on the media. + * This means that changing the page does not make the amount + * of indexing information larger, and this part of the budget + * which we have already acquired may be released. + */ + ubifs_convert_page_budget(c); + + if (appending) { + struct ubifs_inode *ui = ubifs_inode(inode); + + /* + * 'ubifs_write_end()' is optimized from the fast-path part of + * 'ubifs_write_begin()' and expects the @ui_mutex to be locked + * if data is appended. + */ + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The inode is dirty already, so we may free the + * budget we allocated. + */ + ubifs_release_dirty_inode_budget(c, ui); + } + + *pagep = page; + return 0; +} + +/** + * allocate_budget - allocate budget for 'ubifs_write_begin()'. 
+ * @c: UBIFS file-system description object + * @page: page to allocate budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for 'ubifs_write_begin()' which allocates budget + * for the operation. The budget is allocated differently depending on whether + * this is appending, whether the page is dirty or not, and so on. This + * function leaves the @ui->ui_mutex locked in case of appending. Returns zero + * in case of success and %-ENOSPC in case of failure. + */ +static int allocate_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + struct ubifs_budget_req req = { .fast = 1 }; + + if (PagePrivate(page)) { + if (!appending) + /* + * The page is dirty and we are not appending, which + * means no budget is needed at all. + */ + return 0; + + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The page is dirty and we are appending, so the inode + * has to be marked as dirty. However, it is already + * dirty, so we do not need any budget. We may return, + * but @ui->ui_mutex hast to be left locked because we + * should prevent write-back from flushing the inode + * and freeing the budget. The lock will be released in + * 'ubifs_write_end()'. + */ + return 0; + + /* + * The page is dirty, we are appending, the inode is clean, so + * we need to budget the inode change. + */ + req.dirtied_ino = 1; + } else { + if (PageChecked(page)) + /* + * The page corresponds to a hole and does not + * exist on the media. So changing it makes + * make the amount of indexing information + * larger, and we have to budget for a new + * page. + */ + req.new_page = 1; + else + /* + * Not a hole, the change will not add any new + * indexing information, budget for page + * change. + */ + req.dirtied_page = 1; + + if (appending) { + mutex_lock(&ui->ui_mutex); + if (!ui->dirty) + /* + * The inode is clean but we will have to mark + * it as dirty because we are appending. This + * needs a budget. + */ + req.dirtied_ino = 1; + } + } + + return ubifs_budget_space(c, &req); +} + +/* + * This function is called when a page of data is going to be written. Since + * the page of data will not necessarily go to the flash straight away, UBIFS + * has to reserve space on the media for it, which is done by means of + * budgeting. + * + * This is the hot-path of the file-system and we are trying to optimize it as + * much as possible. For this reasons it is split on 2 parts - slow and fast. + * + * There many budgeting cases: + * o a new page is appended - we have to budget for a new page and for + * changing the inode; however, if the inode is already dirty, there is + * no need to budget for it; + * o an existing clean page is changed - we have budget for it; if the page + * does not exist on the media (a hole), we have to budget for a new + * page; otherwise, we may budget for changing an existing page; the + * difference between these cases is that changing an existing page does + * not introduce anything new to the FS indexing information, so it does + * not grow, and smaller budget is acquired in this case; + * o an existing dirty page is changed - no need to budget at all, because + * the page budget has been acquired by earlier, when the page has been + * marked dirty. + * + * UBIFS budgeting sub-system may force write-back if it thinks there is no + * space to reserve. 
This imposes some locking restrictions and makes it + * impossible to take into account the above cases, and makes it impossible to + * optimize budgeting. + * + * The solution for this is that the fast path of 'ubifs_write_begin()' assumes + * there is a plenty of flash space and the budget will be acquired quickly, + * without forcing write-back. The slow path does not make this assumption. + */ +static int ubifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + + ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); + + if (unlikely(c->ro_media)) + return -EROFS; + + /* Try out the fast-path part first */ + page = __grab_cache_page(mapping, index); + if (unlikely(!page)) + return -ENOMEM; + + if (!PageUptodate(page)) { + /* The page is not loaded from the flash */ + if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + /* + * We change whole page so no need to load it. But we + * have to set the @PG_checked flag to make the further + * code the page is new. This might be not true, but it + * is better to budget more that to read the page from + * the media. + */ + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + err = allocate_budget(c, page, ui, appending); + if (unlikely(err)) { + ubifs_assert(err == -ENOSPC); + /* + * Budgeting failed which means it would have to force + * write-back but didn't, because we set the @fast flag in the + * request. Write-back cannot be done now, while we have the + * page locked, because it would deadlock. Unlock and free + * everything and fall-back to slow-path. + */ + if (appending) { + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + unlock_page(page); + page_cache_release(page); + + return write_begin_slow(mapping, pos, len, pagep); + } + + /* + * Whee, we aquired budgeting quickly - without involving + * garbage-collection, committing or forceing write-back. We return + * with @ui->ui_mutex locked if we are appending pages, and unlocked + * otherwise. This is an optimization (slightly hacky though). + */ + *pagep = page; + return 0; + +} + +/** + * cancel_budget - cancel budget. + * @c: UBIFS file-system description object + * @page: page to cancel budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for a page write operation. It unlocks the + * @ui->ui_mutex in case of appending. 
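[Editorial note] The budgeting decision table implemented by allocate_budget() above, reduced to a pure function over the relevant page and inode state. Field and helper names are illustrative; the slow path and mutex handling are not modelled:

#include <assert.h>

/* What a single write_begin call has to budget for (sketch). */
struct toy_req {
	int new_page;		/* page adds new indexing information */
	int dirtied_page;	/* page already exists on the media */
	int dirtied_ino;	/* inode must be marked dirty (appending) */
};

/*
 * A page that is already dirty was budgeted when it was first dirtied; a
 * clean page needs either a "new page" budget (hole / beyond i_size) or a
 * cheaper "changed page" budget; appending additionally dirties the inode
 * unless the inode is dirty already.
 */
static struct toy_req toy_write_budget(int page_dirty, int page_is_new,
				       int appending, int inode_dirty)
{
	struct toy_req req = { 0, 0, 0 };

	if (!page_dirty) {
		if (page_is_new)
			req.new_page = 1;
		else
			req.dirtied_page = 1;
	}
	if (appending && !inode_dirty)
		req.dirtied_ino = 1;
	return req;
}

int main(void)
{
	struct toy_req r;

	r = toy_write_budget(1, 0, 0, 0);	/* dirty page, no append */
	assert(!r.new_page && !r.dirtied_page && !r.dirtied_ino);

	r = toy_write_budget(0, 1, 1, 0);	/* new page, appending */
	assert(r.new_page && !r.dirtied_page && r.dirtied_ino);

	r = toy_write_budget(0, 0, 0, 1);	/* existing clean page */
	assert(!r.new_page && r.dirtied_page && !r.dirtied_ino);
	return 0;
}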
+ */ +static void cancel_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + if (appending) { + if (!ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + mutex_unlock(&ui->ui_mutex); + } + if (!PagePrivate(page)) { + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + } +} + +static int ubifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + loff_t end_pos = pos + len; + int appending = !!(end_pos > inode->i_size); + + dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + inode->i_ino, pos, page->index, len, copied, inode->i_size); + + if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { + /* + * VFS copied less data to the page that it intended and + * declared in its '->write_begin()' call via the @len + * argument. If the page was not up-to-date, and @len was + * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did + * not load it from the media (for optimization reasons). This + * means that part of the page contains garbage. So read the + * page now. + */ + dbg_gen("copied %d instead of %d, read page and repeat", + copied, len); + cancel_budget(c, page, ui, appending); + + /* + * Return 0 to force VFS to repeat the whole operation, or the + * error code if 'do_readpage()' failes. + */ + copied = do_readpage(page); + goto out; + } + + if (!PagePrivate(page)) { + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (appending) { + i_size_write(inode, end_pos); + ui->ui_size = end_pos; + /* + * Note, we do not set @I_DIRTY_PAGES (which means that the + * inode has dirty pages), this has been done in + * '__set_page_dirty_nobuffers()'. 
+ */ + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + +out: + unlock_page(page); + page_cache_release(page); + return copied; +} + +static int ubifs_readpage(struct file *file, struct page *page) +{ + do_readpage(page); + unlock_page(page); + return 0; +} + +static int do_writepage(struct page *page, int len) +{ + int err = 0, i, blen; + unsigned int block; + void *addr; + union ubifs_key key; + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + +#ifdef UBIFS_DEBUG + spin_lock(&ui->ui_lock); + ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + spin_unlock(&ui->ui_lock); +#endif + + /* Update radix tree tags */ + set_page_writeback(page); + + addr = kmap(page); + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + i = 0; + while (len) { + blen = min_t(int, len, UBIFS_BLOCK_SIZE); + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + if (err) + break; + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += blen; + len -= blen; + } + if (err) { + SetPageError(page); + ubifs_err("cannot write page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + ubifs_ro_mode(c, err); + } + + ubifs_assert(PagePrivate(page)); + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); + + kunmap(page); + unlock_page(page); + end_page_writeback(page); + return err; +} + +/* + * When writing-back dirty inodes, VFS first writes-back pages belonging to the + * inode, then the inode itself. For UBIFS this may cause a problem. Consider a + * situation when a we have an inode with size 0, then a megabyte of data is + * appended to the inode, then write-back starts and flushes some amount of the + * dirty pages, the journal becomes full, commit happens and finishes, and then + * an unclean reboot happens. When the file system is mounted next time, the + * inode size would still be 0, but there would be many pages which are beyond + * the inode size, they would be indexed and consume flash space. Because the + * journal has been committed, the replay would not be able to detect this + * situation and correct the inode size. This means UBIFS would have to scan + * whole index and correct all inode sizes, which is long an unacceptable. + * + * To prevent situations like this, UBIFS writes pages back only if they are + * within last synchronized inode size, i.e. the the size which has been + * written to the flash media last time. Otherwise, UBIFS forces inode + * write-back, thus making sure the on-flash inode contains current inode size, + * and then keeps writing pages back. + * + * Some locking issues explanation. 'ubifs_writepage()' first is called with + * the page locked, and it locks @ui_mutex. However, write-back does take inode + * @i_mutex, which means other VFS operations may be run on this inode at the + * same time. And the problematic one is truncation to smaller size, from where + * we have to call 'vmtruncate()', which first changes @inode->i_size, then + * drops the truncated pages. And while dropping the pages, it takes the page + * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with + * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This + * means that @inode->i_size is changed while @ui_mutex is unlocked. 
+ * + * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond + * inode size. How do we do this if @inode->i_size may became smaller while we + * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the + * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size + * internally and updates it under @ui_mutex. + * + * Q: why we do not worry that if we race with truncation, we may end up with a + * situation when the inode is truncated while we are in the middle of + * 'do_writepage()', so we do write beyond inode size? + * A: If we are in the middle of 'do_writepage()', truncation would be locked + * on the page lock and it would not write the truncated inode node to the + * journal before we have finished. + */ +static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + loff_t i_size = i_size_read(inode), synced_i_size; + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + int err, len = i_size & (PAGE_CACHE_SIZE - 1); + void *kaddr; + + dbg_gen("ino %lu, pg %lu, pg flags %#lx", + inode->i_ino, page->index, page->flags); + ubifs_assert(PagePrivate(page)); + + /* Is the page fully outside @i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !len)) { + err = 0; + goto out_unlock; + } + + spin_lock(&ui->ui_lock); + synced_i_size = ui->synced_i_size; + spin_unlock(&ui->ui_lock); + + /* Is the page fully inside @i_size? */ + if (page->index < end_index) { + if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + goto out_unlock; + /* + * The inode has been written, but the write-buffer has + * not been synchronized, so in case of an unclean + * reboot we may end up with some pages beyond inode + * size, but they would be in the journal (because + * commit flushes write buffers) and recovery would deal + * with this. + */ + } + return do_writepage(page, PAGE_CACHE_SIZE); + } + + /* + * The page straddles @i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (i_size > synced_i_size) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + goto out_unlock; + } + + return do_writepage(page, len); + +out_unlock: + unlock_page(page); + return err; +} + +/** + * do_attr_changes - change inode attributes. 
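[Editorial note] A sketch of how the page straddling i_size is prepared by the writepage path above: only i_size modulo the page size bytes are valid file data, and the remainder is zeroed before write-out because the page may be mmap'ed. A 4 KiB page size is assumed for the example:

#include <assert.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096u

/* Zero the invalid tail and return how many bytes will actually be written. */
static unsigned int toy_prepare_tail_page(char *page, unsigned long long i_size)
{
	unsigned int len = (unsigned int)(i_size & (TOY_PAGE_SIZE - 1));

	memset(page + len, 0, TOY_PAGE_SIZE - len);
	return len;
}

int main(void)
{
	char page[TOY_PAGE_SIZE];
	unsigned int len;

	memset(page, 0xab, sizeof(page));	/* pretend-mmap'ed garbage */
	len = toy_prepare_tail_page(page, 3 * TOY_PAGE_SIZE + 10);

	assert(len == 10);
	assert(page[9] == (char)0xab);		/* valid data untouched */
	assert(page[10] == 0 && page[TOY_PAGE_SIZE - 1] == 0);
	return 0;
}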
+ * @inode: inode to change attributes for + * @attr: describes attributes to change + */ +static void do_attr_changes(struct inode *inode, const struct iattr *attr) +{ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} + +/** + * do_truncation - truncate an inode. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call when the inode is truncated + * to a smaller size. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int do_truncation(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err; + struct ubifs_budget_req req; + loff_t old_size = inode->i_size, new_size = attr->ia_size; + int offset = new_size & (UBIFS_BLOCK_SIZE - 1); + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + memset(&req, 0, sizeof(struct ubifs_budget_req)); + + /* + * If this is truncation to a smaller size, and we do not truncate on a + * block boundary, budget for changing one data block, because the last + * block will be re-written. + */ + if (new_size & (UBIFS_BLOCK_SIZE - 1)) + req.dirtied_page = 1; + + req.dirtied_ino = 1; + /* A funny way to budget for truncation node */ + req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; + err = ubifs_budget_space(c, &req); + if (err) + return err; + + err = vmtruncate(inode, new_size); + if (err) + goto out_budg; + + if (offset) { + pgoff_t index = new_size >> PAGE_CACHE_SHIFT; + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + if (PageDirty(page)) { + /* + * 'ubifs_jnl_truncate()' will try to truncate + * the last data node, but it contains + * out-of-date data because the page is dirty. + * Write the page now, so that + * 'ubifs_jnl_truncate()' will see an already + * truncated (and up to date) data node. + */ + ubifs_assert(PagePrivate(page)); + + clear_page_dirty_for_io(page); + if (UBIFS_BLOCKS_PER_PAGE_SHIFT) + offset = new_size & + (PAGE_CACHE_SIZE - 1); + err = do_writepage(page, offset); + page_cache_release(page); + if (err) + goto out_budg; + /* + * We could now tell 'ubifs_jnl_truncate()' not + * to read the last block. + */ + } else { + /* + * We could 'kmap()' the page and pass the data + * to 'ubifs_jnl_truncate()' to save it from + * having to read it. + */ + unlock_page(page); + page_cache_release(page); + } + } + } + + mutex_lock(&ui->ui_mutex); + ui->ui_size = inode->i_size; + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* The other attributes may be changed at the same time as well */ + do_attr_changes(inode, attr); + + err = ubifs_jnl_truncate(c, inode, old_size, new_size); + mutex_unlock(&ui->ui_mutex); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * do_setattr - change inode attributes. 
+ * @c: UBIFS file-system description object + * @inode: inode to change attributes for + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call for all cases except + * truncations to smaller size. Returns zero in case of success and a negative + * error code in case of failure. + */ +static int do_setattr(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err, release; + loff_t new_size = attr->ia_size; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + dbg_gen("size %lld -> %lld", inode->i_size, new_size); + err = vmtruncate(inode, new_size); + if (err) + goto out; + } + + mutex_lock(&ui->ui_mutex); + if (attr->ia_valid & ATTR_SIZE) { + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* 'vmtruncate()' changed @i_size, update @ui_size */ + ui->ui_size = inode->i_size; + } + + do_attr_changes(inode, attr); + + release = ui->dirty; + if (attr->ia_valid & ATTR_SIZE) + /* + * Inode length changed, so we have to make sure + * @I_DIRTY_DATASYNC is set. + */ + __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + else + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = inode->i_sb->s_op->write_inode(inode, 1); + return err; + +out: + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); + err = inode_change_ok(inode, attr); + if (err) + return err; + + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) + /* Truncation to a smaller size */ + err = do_truncation(c, inode, attr); + else + err = do_setattr(c, inode, attr); + + return err; +} + +static void ubifs_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + ubifs_assert(PagePrivate(page)); + if (offset) + /* Partial page remains dirty */ + return; + + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); +} + +static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); + + nd_set_link(nd, ui->data); + return NULL; +} + +int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + + dbg_gen("syncing inode %lu", inode->i_ino); + + /* + * VFS has already synchronized dirty pages for this inode. Synchronize + * the inode unless this is a 'datasync()' call. + */ + if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + return err; + } + + /* + * Nodes related to this inode may still sit in a write-buffer. Flush + * them. 
+ */ + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + + return 0; +} + +/** + * mctime_update_needed - check if mtime or ctime update is needed. + * @inode: the inode to do the check for + * @now: current time + * + * This helper function checks if the inode mtime/ctime should be updated or + * not. If current values of the time-stamps are within the UBIFS inode time + * granularity, they are not updated. This is an optimization. + */ +static inline int mctime_update_needed(const struct inode *inode, + const struct timespec *now) +{ + if (!timespec_equal(&inode->i_mtime, now) || + !timespec_equal(&inode->i_ctime, now)) + return 1; + return 0; +} + +/** + * update_ctime - update mtime and ctime of an inode. + * @c: UBIFS file-system description object + * @inode: inode to update + * + * This function updates mtime and ctime of the inode if it is not equivalent to + * current time. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int update_mctime(struct ubifs_info *c, struct inode *inode) +{ + struct timespec now = ubifs_current_time(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + + if (mctime_update_needed(inode, &now)) { + int err, release; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_budget(c, &req); + } + + return 0; +} + +static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + int err; + ssize_t ret; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = update_mctime(c, inode); + if (err) + return err; + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (ret < 0) + return ret; + + if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + } + + return ret; +} + +static int ubifs_set_page_dirty(struct page *page) +{ + int ret; + + ret = __set_page_dirty_nobuffers(page); + /* + * An attempt to dirty a page without budgeting for it - should not + * happen. + */ + ubifs_assert(ret == 0); + return ret; +} + +static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + /* + * An attempt to release a dirty page without budgeting for it - should + * not happen. + */ + if (PageWriteback(page)) + return 0; + ubifs_assert(PagePrivate(page)); + ubifs_assert(0); + ClearPagePrivate(page); + ClearPageChecked(page); + return 1; +} + +/* + * mmap()d file has taken write protection fault and is being made + * writable. UBIFS must ensure page is budgeted for. + */ +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct timespec now = ubifs_current_time(inode); + struct ubifs_budget_req req = { .new_page = 1 }; + int err, update_time; + + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + i_size_read(inode)); + ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); + + if (unlikely(c->ro_media)) + return -EROFS; + + /* + * We have not locked @page so far so we may budget for changing the + * page. 
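[Editorial note] A user-space sketch of the mtime/ctime check used by the write path above: the stamps are only updated (and budgeted for) when they differ from the current time truncated to the file-system time granularity. One-second granularity and the helper names are assumptions of the example:

#include <assert.h>
#include <time.h>

static struct timespec toy_current_time(void)
{
	/* Truncated to 1 s granularity, like a coarse on-flash time stamp. */
	struct timespec now = { .tv_sec = time(NULL), .tv_nsec = 0 };
	return now;
}

static int toy_mctime_update_needed(const struct timespec *mtim,
				    const struct timespec *ctim,
				    const struct timespec *now)
{
	return mtim->tv_sec != now->tv_sec || mtim->tv_nsec != now->tv_nsec ||
	       ctim->tv_sec != now->tv_sec || ctim->tv_nsec != now->tv_nsec;
}

int main(void)
{
	struct timespec now = toy_current_time();
	struct timespec mtim = now, ctim = now;

	assert(!toy_mctime_update_needed(&mtim, &ctim, &now));
	mtim.tv_sec -= 5;		/* stale stamp -> update needed */
	assert(toy_mctime_update_needed(&mtim, &ctim, &now));
	return 0;
}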
Note, we cannot do this after we locked the page, because + * budgeting may cause write-back which would cause deadlock. + * + * At the moment we do not know whether the page is dirty or not, so we + * assume that it is not and budget for a new page. We could look at + * the @PG_private flag and figure this out, but we may race with write + * back and the page state may change by the time we lock it, so this + * would need additional care. We do not bother with this at the + * moment, although it might be good idea to do. Instead, we allocate + * budget for a new page and amend it later on if the page was in fact + * dirty. + * + * The budgeting-related logic of this function is similar to what we + * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there + * for more comments. + */ + update_time = mctime_update_needed(inode, &now); + if (update_time) + /* + * We have to change inode time stamp which requires extra + * budgeting. + */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) { + if (err == -ENOSPC) + ubifs_warn("out of space for mmapped file " + "(inode number %lu)", inode->i_ino); + return err; + } + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode))) { + /* Page got truncated out from underneath us */ + err = -EINVAL; + goto out_unlock; + } + + if (PagePrivate(page)) + release_new_page_budget(c); + else { + if (!PageChecked(page)) + ubifs_convert_page_budget(c); + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (update_time) { + int release; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_dirty_inode_budget(c, ui); + } + + unlock_page(page); + return 0; + +out_unlock: + unlock_page(page); + ubifs_release_budget(c, &req); + return err; +} + +static struct vm_operations_struct ubifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ubifs_vm_page_mkwrite, +}; + +static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + + /* 'generic_file_mmap()' takes care of NOMMU case */ + err = generic_file_mmap(file, vma); + if (err) + return err; + vma->vm_ops = &ubifs_file_vm_ops; + return 0; +} + +struct address_space_operations ubifs_file_address_operations = { + .readpage = ubifs_readpage, + .writepage = ubifs_writepage, + .write_begin = ubifs_write_begin, + .write_end = ubifs_write_end, + .invalidatepage = ubifs_invalidatepage, + .set_page_dirty = ubifs_set_page_dirty, + .releasepage = ubifs_releasepage, +}; + +struct inode_operations ubifs_file_inode_operations = { + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +struct inode_operations ubifs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ubifs_follow_link, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +}; + +struct file_operations ubifs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ubifs_aio_write, + .mmap = ubifs_file_mmap, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, + .splice_read = 
generic_file_splice_read, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c new file mode 100644 index 00000000000..10394c54836 --- /dev/null +++ b/fs/ubifs/find.c @@ -0,0 +1,975 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains functions for finding LEBs for various purposes e.g. + * garbage collection. In general, lprops category heaps and lists are used + * for fast access, falling back on scanning the LPT as a last resort. + */ + +#include +#include "ubifs.h" + +/** + * struct scan_data - data provided to scan callback functions + * @min_space: minimum number of bytes for which to scan + * @pick_free: whether it is OK to scan for empty LEBs + * @lnum: LEB number found is returned here + * @exclude_index: whether to exclude index LEBs + */ +struct scan_data { + int min_space; + int pick_free; + int lnum; + int exclude_index; +}; + +/** + * valuable - determine whether LEB properties are valuable. + * @c: the UBIFS file-system description object + * @lprops: LEB properties + * + * This function return %1 if the LEB properties should be added to the LEB + * properties tree in memory. Otherwise %0 is returned. + */ +static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) +{ + int n, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (heap->cnt < heap->max_cnt) + return 1; + if (lprops->free + lprops->dirty >= c->dark_wm) + return 1; + return 0; + case LPROPS_EMPTY: + n = c->lst.empty_lebs + c->freeable_cnt - + c->lst.taken_empty_lebs; + if (n < c->lsave_cnt) + return 1; + return 0; + case LPROPS_FREEABLE: + return 1; + case LPROPS_FRDI_IDX: + return 1; + } + return 0; +} + +/** + * scan_for_dirty_cb - dirty space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). 
+ */ +static int scan_for_dirty_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < data->min_space) + return ret; + /* If specified, exclude index LEBs */ + if (data->exclude_index && lprops->flags & LPROPS_INDEX) + return ret; + /* If specified, exclude empty or freeable LEBs */ + if (lprops->free + lprops->dirty == c->leb_size) { + if (!data->pick_free) + return ret; + /* Exclude LEBs with too little dirty space (unless it is empty) */ + } else if (lprops->dirty < c->dead_wm) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_dirty - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: if it is OK to return a free or freeable LEB + * @exclude_index: whether to exclude index LEBs + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, + int min_space, int pick_free, + int exclude_index) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + /* There may be an LEB with enough dirty space on the free heap */ + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free + lprops->dirty < min_space) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the dirty heap, and ended + * up as uncategorized even though it has enough dirty space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. 
+ */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->free + lprops->dirty < min_space) + continue; + if (exclude_index && (lprops->flags & LPROPS_INDEX)) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + data.exclude_index = exclude_index; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_dirty_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= min_space); + ubifs_assert(lprops->dirty >= c->dead_wm || + (pick_free && + lprops->free + lprops->dirty == c->leb_size)); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector. + * @c: the UBIFS file-system description object + * @ret_lp: LEB properties are returned here on exit + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: controls whether it is OK to pick empty or index LEBs + * + * This function tries to find a dirty logical eraseblock which has at least + * @min_space free and dirty space. It prefers to take an LEB from the dirty or + * dirty index heap, and it falls-back to LPT scanning if the heaps are empty + * or do not have an LEB which satisfies the @min_space criteria. + * + * Note: + * o LEBs which have less than dead watermark of dirty space are never picked + * by this function; + * + * Returns zero and the LEB properties of + * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a + * negative error code in case of other failures. The returned LEB is marked as + * "taken". + * + * The additional @pick_free argument controls if this function has to return a + * free or freeable LEB if one is present. For example, GC must to set it to %1, + * when called from the journal space reservation function, because the + * appearance of free space may coincide with the loss of enough dirty space + * for GC to succeed anyway. + * + * In contrast, if the Garbage Collector is called from budgeting, it should + * just make free space, not return LEBs which are already free or freeable. + * + * In addition @pick_free is set to %2 by the recovery process in order to + * recover gc_lnum in which case an index LEB must not be returned. + */ +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free) +{ + int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0; + const struct ubifs_lprops *lp = NULL, *idx_lp = NULL; + struct ubifs_lpt_heap *heap, *idx_heap; + + ubifs_get_lprops(c); + + if (pick_free) { + int lebs, rsvd_idx_lebs = 0; + + spin_lock(&c->space_lock); + lebs = c->lst.empty_lebs; + lebs += c->freeable_cnt - c->lst.taken_empty_lebs; + + /* + * Note, the index may consume more LEBs than have been reserved + * for it. It is OK because it might be consolidated by GC. 
+ * But if the index takes fewer LEBs than it is reserved for it, + * this function must avoid picking those reserved LEBs. + */ + if (c->min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + exclude_index = 1; + } + spin_unlock(&c->space_lock); + + /* Check if there are enough free LEBs for the index */ + if (rsvd_idx_lebs < lebs) { + /* OK, try to find an empty LEB */ + lp = ubifs_fast_find_empty(c); + if (lp) + goto found; + + /* Or a freeable LEB */ + lp = ubifs_fast_find_freeable(c); + if (lp) + goto found; + } else + /* + * We cannot pick free/freeable LEBs in the below code. + */ + pick_free = 0; + } else { + spin_lock(&c->space_lock); + exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); + spin_unlock(&c->space_lock); + } + + /* Look on the dirty and dirty index heaps */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + + if (idx_heap->cnt && !exclude_index) { + idx_lp = idx_heap->arr[0]; + sum = idx_lp->free + idx_lp->dirty; + /* + * Since we reserve twice as more space for the index than it + * actually takes, it does not make sense to pick indexing LEBs + * with less than half LEB of dirty space. + */ + if (sum < min_space || sum < c->half_leb_size) + idx_lp = NULL; + } + + if (heap->cnt) { + lp = heap->arr[0]; + if (lp->dirty + lp->free < min_space) + lp = NULL; + } + + /* Pick the LEB with most space */ + if (idx_lp && lp) { + if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty) + lp = idx_lp; + } else if (idx_lp && !lp) + lp = idx_lp; + + if (lp) { + ubifs_assert(lp->dirty >= c->dead_wm); + goto found; + } + + /* Did not find a dirty LEB on the dirty heaps, have to scan */ + dbg_find("scanning LPT for a dirty LEB"); + lp = scan_for_dirty(c, min_space, pick_free, exclude_index); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(lp->dirty >= c->dead_wm || + (pick_free && lp->free + lp->dirty == c->leb_size)); + +found: + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lp->lnum, lp->free, lp->dirty, lp->flags); + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + memcpy(ret_lp, lp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_free_cb - free space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). 
+ */ +static int scan_for_free_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBs */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free < data->min_space) + return ret; + /* If specified, exclude empty LEBs */ + if (!data->pick_free && lprops->free == c->leb_size) + return ret; + /* + * LEBs that have only free and dirty space must not be allocated + * because they may have been unmapped already or they may have data + * that is obsolete only because of nodes that are still sitting in a + * wbuf. + */ + if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * do_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of free space required + * @pick_free: whether it is OK to scan for empty LEBs + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function returns a pointer to the LEB properties found or a negative + * error code. + */ +static +const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, + int min_space, int pick_free, + int squeeze) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + if (squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + if (pick_free) { + lprops = ubifs_fast_find_empty(c); + if (lprops) + return lprops; + } + if (!squeeze) { + lprops = ubifs_fast_find_free(c); + if (lprops && lprops->free >= min_space) + return lprops; + } + /* There may be an LEB with enough free space on the dirty heap */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free >= min_space) + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the free heap, and ended + * up as uncategorized even though it has enough free space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. 
+ */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->flags & LPROPS_INDEX) + continue; + if (lprops->free >= min_space) + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_free_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free >= min_space); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_space - find a data LEB with free space. + * @c: the UBIFS file-system description object + * @min_space: minimum amount of required free space + * @free: contains amount of free space in the LEB on exit + * @squeeze: whether to try to find space in a non-empty LEB first + * + * This function looks for an LEB with at least @min_space bytes of free space. + * It tries to find an empty LEB if possible. If no empty LEBs are available, + * this function searches for a non-empty data LEB. The returned LEB is marked + * as "taken". + * + * This function returns found LEB number in case of success, %-ENOSPC if it + * failed to find a LEB with @min_space bytes of free space and other a negative + * error codes in case of failure. + */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, + int squeeze) +{ + const struct ubifs_lprops *lprops; + int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags; + + dbg_find("min_space %d", min_space); + ubifs_get_lprops(c); + + /* Check if there are enough empty LEBs for commit */ + spin_lock(&c->space_lock); + if (c->min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs); + if (rsvd_idx_lebs < lebs) + /* + * OK to allocate an empty LEB, but we still don't want to go + * looking for one if there aren't any. + */ + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + pick_free = 1; + /* + * Because we release the space lock, we must account + * for this allocation here. After the LEB properties + * flags have been updated, we subtract one. Note, the + * result of this is that lprops also decreases + * @taken_empty_lebs in 'ubifs_change_lp()', so it is + * off by one for a short period of time which may + * introduce a small disturbance to budgeting + * calculations, but this is harmless because at the + * worst case this would make the budgeting subsystem + * be more pessimistic than needed. + * + * Fundamentally, this is about serialization of the + * budgeting and lprops subsystems. We could make the + * @space_lock a mutex and avoid dropping it before + * calling 'ubifs_change_lp()', but mutex is more + * heavy-weight, and we want budgeting to be as fast as + * possible. 
+ */ + c->lst.taken_empty_lebs += 1; + } + spin_unlock(&c->space_lock); + + lprops = do_find_free_space(c, min_space, pick_free, squeeze); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + lnum = lprops->lnum; + flags = lprops->flags | LPROPS_TAKEN; + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + + *free = lprops->free; + ubifs_release_lprops(c); + + if (*free == c->leb_size) { + /* + * Ensure that empty LEBs have been unmapped. They may not have + * been, for example, because of an unclean unmount. Also + * LEBs that were freeable LEBs (free + dirty == leb_size) will + * not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + + dbg_find("found LEB %d, free %d", lnum, *free); + ubifs_assert(*free >= min_space); + return lnum; + +out: + if (pick_free) { + spin_lock(&c->space_lock); + c->lst.taken_empty_lebs -= 1; + spin_unlock(&c->space_lock); + } + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_idx_cb - callback used by the scan for a free LEB for the index. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_for_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude index LEBS */ + if (lprops->flags & LPROPS_INDEX) + return ret; + /* Exclude LEBs that cannot be made empty */ + if (lprops->free + lprops->dirty != c->leb_size) + return ret; + /* + * We are allocating for the index so it is safe to allocate LEBs with + * only free and dirty space, because write buffers are sync'd at commit + * start. + */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * scan_for_leb_for_idx - scan for a free LEB for the index. + * @c: the UBIFS file-system description object + */ +static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct scan_data data; + int err; + + data.lnum = -1; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_idx_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_free_leb_for_idx - find a free LEB for the index. 
+ * @c: the UBIFS file-system description object + * + * This function looks for a free LEB and returns that LEB number. The returned + * LEB is marked as "taken", "index". + * + * Only empty LEBs are allocated. This is for two reasons. First, the commit + * calculates the number of LEBs to allocate based on the assumption that they + * will be empty. Secondly, free space at the end of an index LEB is not + * guaranteed to be empty because it may have been used by the in-the-gaps + * method prior to an unclean unmount. + * + * If no LEB is found %-ENOSPC is returned. For other failures another negative + * error code is returned. + */ +int ubifs_find_free_leb_for_idx(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + int lnum = -1, err, flags; + + ubifs_get_lprops(c); + + lprops = ubifs_fast_find_empty(c); + if (!lprops) { + lprops = ubifs_fast_find_freeable(c); + if (!lprops) { + ubifs_assert(c->freeable_cnt == 0); + if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + lprops = scan_for_leb_for_idx(c); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + } + } + } + + if (!lprops) { + err = -ENOSPC; + goto out; + } + + lnum = lprops->lnum; + + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lnum, lprops->free, lprops->dirty, lprops->flags); + + flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX; + lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + ubifs_release_lprops(c); + + /* + * Ensure that empty LEBs have been unmapped. They may not have been, + * for example, because of an unclean unmount. Also LEBs that were + * freeable LEBs (free + dirty == leb_size) will not have been unmapped. + */ + err = ubifs_leb_unmap(c, lnum); + if (err) { + ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN | LPROPS_INDEX, 0); + return err; + } + + return lnum; + +out: + ubifs_release_lprops(c); + return err; +} + +static int cmp_dirty_idx(const struct ubifs_lprops **a, + const struct ubifs_lprops **b) +{ + const struct ubifs_lprops *lpa = *a; + const struct ubifs_lprops *lpb = *b; + + return lpa->dirty + lpa->free - lpb->dirty - lpb->free; +} + +static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b, + int size) +{ + struct ubifs_lprops *t = *a; + + *a = *b; + *b = t; +} + +/** + * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos. + * @c: the UBIFS file-system description object + * + * This function is called each commit to create an array of LEB numbers of + * dirty index LEBs sorted in order of dirty and free space. This is used by + * the in-the-gaps method of TNC commit. 
+ */ +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) +{ + int i; + + ubifs_get_lprops(c); + /* Copy the LPROPS_DIRTY_IDX heap */ + c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt; + memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr, + sizeof(void *) * c->dirty_idx.cnt); + /* Sort it so that the dirtiest is now at the end */ + sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), + (int (*)(const void *, const void *))cmp_dirty_idx, + (void (*)(void *, void *, int))swap_dirty_idx); + dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); + if (c->dirty_idx.cnt) + dbg_find("dirtiest index LEB is %d with dirty %d and free %d", + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty, + c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free); + /* Replace the lprops pointers with LEB numbers */ + for (i = 0; i < c->dirty_idx.cnt; i++) + c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum; + ubifs_release_lprops(c); + return 0; +} + +/** + * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_dirty_idx_cb(struct ubifs_info *c, + const struct ubifs_lprops *lprops, int in_tree, + struct scan_data *data) +{ + int ret = LPT_SCAN_CONTINUE; + + /* Exclude LEBs that are currently in use */ + if (lprops->flags & LPROPS_TAKEN) + return LPT_SCAN_CONTINUE; + /* Determine whether to add these LEB properties to the tree */ + if (!in_tree && valuable(c, lprops)) + ret |= LPT_SCAN_ADD; + /* Exclude non-index LEBs */ + if (!(lprops->flags & LPROPS_INDEX)) + return ret; + /* Exclude LEBs with too little space */ + if (lprops->free + lprops->dirty < c->min_idx_node_sz) + return ret; + /* Finally we found space */ + data->lnum = lprops->lnum; + return LPT_SCAN_ADD | LPT_SCAN_STOP; +} + +/** + * find_dirty_idx_leb - find a dirty index LEB. + * @c: the UBIFS file-system description object + * + * This function returns LEB number upon success and a negative error code upon + * failure. In particular, -ENOSPC is returned if a dirty index LEB is not + * found. + * + * Note that this function scans the entire LPT but it is called very rarely. 
+ */ +static int find_dirty_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i, ret; + + /* Check all structures in memory first */ + data.lnum = -1; + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + list_for_each_entry(lprops, &c->uncat_list, list) { + ret = scan_dirty_idx_cb(c, lprops, 1, &data); + if (ret & LPT_SCAN_STOP) + goto found; + } + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return -ENOSPC; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + &data); + if (err) + return err; +found: + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + + dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x", + lprops->lnum, lprops->free, lprops->dirty, lprops->flags); + + lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, + lprops->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lprops)) + return PTR_ERR(lprops); + + return lprops->lnum; +} + +/** + * get_idx_gc_leb - try to get a LEB number from trivial GC. + * @c: the UBIFS file-system description object + */ +static int get_idx_gc_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, lnum; + + err = ubifs_get_idx_gc_leb(c); + if (err < 0) + return err; + lnum = err; + /* + * The LEB was due to be unmapped after the commit but + * it is needed now for this commit. + */ + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (unlikely(IS_ERR(lp))) + return PTR_ERR(lp); + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_INDEX, -1); + if (unlikely(IS_ERR(lp))) + return PTR_ERR(lp); + dbg_find("LEB %d, dirty %d and free %d flags %#x", + lp->lnum, lp->dirty, lp->free, lp->flags); + return lnum; +} + +/** + * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array. + * @c: the UBIFS file-system description object + */ +static int find_dirtiest_idx_leb(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int lnum; + + while (1) { + if (!c->dirty_idx.cnt) + return -ENOSPC; + /* The lprops pointers were replaced by LEB numbers */ + lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt]; + lp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lp)) + return PTR_ERR(lp); + if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX)) + continue; + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) + return PTR_ERR(lp); + break; + } + dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, + lp->free, lp->flags); + ubifs_assert(lp->flags | LPROPS_TAKEN); + ubifs_assert(lp->flags | LPROPS_INDEX); + return lnum; +} + +/** + * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit. 
+ * @c: the UBIFS file-system description object
+ *
+ * This function attempts to find an untaken index LEB with the most free and
+ * dirty space that can be used without overwriting index nodes that were in the
+ * last index committed.
+ */
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c)
+{
+	int err;
+
+	ubifs_get_lprops(c);
+
+	/*
+	 * We made an array of the dirtiest index LEB numbers as at the start of
+	 * last commit. Try that array first.
+	 */
+	err = find_dirtiest_idx_leb(c);
+
+	/* Next try scanning the entire LPT */
+	if (err == -ENOSPC)
+		err = find_dirty_idx_leb(c);
+
+	/* Finally take any index LEBs awaiting trivial GC */
+	if (err == -ENOSPC)
+		err = get_idx_gc_leb(c);
+
+	ubifs_release_lprops(c);
+	return err;
+}
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
new file mode 100644
index 00000000000..d0f3dac2908
--- /dev/null
+++ b/fs/ubifs/gc.c
@@ -0,0 +1,773 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements garbage collection. The procedure for garbage collection
+ * is different depending on whether a LEB is an index LEB (contains index
+ * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
+ * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
+ * nodes to the journal, at which point the garbage-collected LEB is free to be
+ * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
+ * dirty in the TNC, and after the next commit, the garbage-collected LEB is
+ * free to be reused. Garbage collection will cause the number of dirty index
+ * nodes to grow; however, sufficient space is reserved for the index to ensure
+ * the commit will never run out of space.
+ */
+
+#include 
+#include "ubifs.h"
+
+/*
+ * GC tries to optimize the way it fits nodes to the available space, and it
+ * sorts nodes a little. The below constants are watermarks which define
+ * "large", "medium", and "small" nodes.
+ */
+#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
+#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
+
+/*
+ * GC may need to move more than one LEB to make progress. The below constants
+ * define "soft" and "hard" limits on the number of LEBs the garbage collector
+ * may move.
+ */
+#define SOFT_LEBS_LIMIT 4
+#define HARD_LEBS_LIMIT 32
+
+/**
+ * switch_gc_head - switch the garbage collection journal head.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to write
+ * @len: length of the buffer to write
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ *
+ * This function switches the GC head to the next LEB which is reserved in
+ * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
+ * and other negative error codes in case of failures.
+ */ +static int switch_gc_head(struct ubifs_info *c) +{ + int err, gc_lnum = c->gc_lnum; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert(gc_lnum != -1); + dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)", + wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum, + c->leb_size - wbuf->offs - wbuf->used); + + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + return err; + + /* + * The GC write-buffer was synchronized, we may safely unmap + * 'c->gc_lnum'. + */ + err = ubifs_leb_unmap(c, gc_lnum); + if (err) + return err; + + err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); + if (err) + return err; + + c->gc_lnum = -1; + err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes nodes to move + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. The obsolete nodes are dropped. + * + * When moving nodes we have to deal with classical bin-packing problem: the + * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", + * where the nodes in the @sleb->nodes list are the elements which should be + * fit optimally to the bins. This function uses the "first fit decreasing" + * strategy, although it does not really sort the nodes but just split them on + * 3 classes - large, medium, and small, so they are roughly sorted. + * + * This function returns zero in case of success, %-EAGAIN if commit is + * required, and other negative error codes in case of other failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod, *tmp; + struct list_head large, medium, small; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + int avail, err, min = INT_MAX; + + INIT_LIST_HEAD(&large); + INIT_LIST_HEAD(&medium); + INIT_LIST_HEAD(&small); + + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + struct list_head *lst; + + ubifs_assert(snod->type != UBIFS_IDX_NODE); + ubifs_assert(snod->type != UBIFS_REF_NODE); + ubifs_assert(snod->type != UBIFS_CS_NODE); + + err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, + snod->offs, 0); + if (err < 0) + goto out; + + lst = &snod->list; + list_del(lst); + if (!err) { + /* The node is obsolete, remove it from the list */ + kfree(snod); + continue; + } + + /* + * Sort the list of nodes so that large nodes go first, and + * small nodes go last. + */ + if (snod->len > MEDIUM_NODE_WM) + list_add(lst, &large); + else if (snod->len > SMALL_NODE_WM) + list_add(lst, &medium); + else + list_add(lst, &small); + + /* And find the smallest node */ + if (snod->len < min) + min = snod->len; + } + + /* + * Join the tree lists so that we'd have one roughly sorted list + * ('large' will be the head of the joined list). + */ + list_splice(&medium, large.prev); + list_splice(&small, large.prev); + + if (wbuf->lnum == -1) { + /* + * The GC journal head is not set, because it is the first GC + * invocation since mount. + */ + err = switch_gc_head(c); + if (err) + goto out; + } + + /* Write nodes to their new location. 
Use the first-fit strategy */ + while (1) { + avail = c->leb_size - wbuf->offs - wbuf->used; + list_for_each_entry_safe(snod, tmp, &large, list) { + int new_lnum, new_offs; + + if (avail < min) + break; + + if (snod->len > avail) + /* This node does not fit */ + continue; + + cond_resched(); + + new_lnum = wbuf->lnum; + new_offs = wbuf->offs + wbuf->used; + err = ubifs_wbuf_write_nolock(wbuf, snod->node, + snod->len); + if (err) + goto out; + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + if (err) + goto out; + + avail = c->leb_size - wbuf->offs - wbuf->used; + list_del(&snod->list); + kfree(snod); + } + + if (list_empty(&large)) + break; + + /* + * Waste the rest of the space in the LEB and switch to the + * next LEB. + */ + err = switch_gc_head(c); + if (err) + goto out; + } + + return 0; + +out: + list_for_each_entry_safe(snod, tmp, &large, list) { + list_del(&snod->list); + kfree(snod); + } + return err; +} + +/** + * gc_sync_wbufs - sync write-buffers for GC. + * @c: UBIFS file-system description object + * + * We must guarantee that obsoleting nodes are on flash. Unfortunately they may + * be in a write-buffer instead. That is, a node could be written to a + * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is + * erased before the write-buffer is sync'd and then there is an unclean + * unmount, then an existing node is lost. To avoid this, we sync all + * write-buffers. + * + * This function returns %0 on success or a negative error code on failure. + */ +static int gc_sync_wbufs(struct ubifs_info *c) +{ + int err, i; + + for (i = 0; i < c->jhead_cnt; i++) { + if (i == GCHD) + continue; + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + return 0; +} + +/** + * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock. + * @c: UBIFS file-system description object + * @lp: describes the LEB to garbage collect + * + * This function garbage-collects an LEB and returns one of the @LEB_FREED, + * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is + * required, and other negative error codes in case of failures. + */ +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + int err = 0, lnum = lp->lnum; + + ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 || + c->need_recovery); + ubifs_assert(c->gc_lnum != lnum); + ubifs_assert(wbuf->lnum != lnum); + + /* + * We scan the entire LEB even though we only really need to scan up to + * (c->leb_size - lp->free). 
+ */ + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + ubifs_assert(!list_empty(&sleb->nodes)); + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_gced_idx_leb *idx_gc; + + dbg_gc("indexing LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx = snod->node; + int level = le16_to_cpu(idx->level); + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + key_read(c, ubifs_idx_key(c, idx), &snod->key); + err = ubifs_dirty_idx_node(c, &snod->key, level, lnum, + snod->offs); + if (err) + goto out; + } + + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + + idx_gc->lnum = lnum; + idx_gc->unmap = 0; + list_add(&idx_gc->list, &c->idx_gc); + + /* + * Don't release the LEB until after the next commit, because + * it may contain date which is needed for recovery. So + * although we freed this LEB, it will become usable only after + * the commit. + */ + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, + LPROPS_INDEX, 1); + if (err) + goto out; + err = LEB_FREED_IDX; + } else { + dbg_gc("data LEB %d (free %d, dirty %d)", + lnum, lp->free, lp->dirty); + + err = move_nodes(c, sleb); + if (err) + goto out; + + err = gc_sync_wbufs(c); + if (err) + goto out; + + err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); + if (err) + goto out; + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + err = LEB_RETAINED; + } else { + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out; + + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + + err = LEB_FREED; + } + } + +out: + ubifs_scan_destroy(sleb); + return err; +} + +/** + * ubifs_garbage_collect - UBIFS garbage collector. + * @c: UBIFS file-system description object + * @anyway: do GC even if there are free LEBs + * + * This function does out-of-place garbage collection. The return codes are: + * o positive LEB number if the LEB has been freed and may be used; + * o %-EAGAIN if the caller has to run commit; + * o %-ENOSPC if GC failed to make any progress; + * o other negative error codes in case of other errors. + * + * Garbage collector writes data to the journal when GC'ing data LEBs, and just + * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point + * commit may be required. But commit cannot be run from inside GC, because the + * caller might be holding the commit lock, so %-EAGAIN is returned instead; + * And this error code means that the caller has to run commit, and re-run GC + * if there is still no free space. + * + * There are many reasons why this function may return %-EAGAIN: + * o the log is full and there is no space to write an LEB reference for + * @c->gc_lnum; + * o the journal is too large and exceeds size limitations; + * o GC moved indexing LEBs, but they can be used only after the commit; + * o the shrinker fails to find clean znodes to free and requests the commit; + * o etc. + * + * Note, if the file-system is close to be full, this function may return + * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of + * the function. E.g., this happens if the limits on the journal size are too + * tough and GC writes too much to the journal before an LEB is freed. 
This + * might also mean that the journal is too large, and the TNC becomes to big, + * so that the shrinker is constantly called, finds not clean znodes to free, + * and requests commit. Well, this may also happen if the journal is all right, + * but another kernel process consumes too much memory. Anyway, infinite + * %-EAGAIN may happen, but in some extreme/misconfiguration cases. + */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway) +{ + int i, err, ret, min_space = c->dead_wm; + struct ubifs_lprops lp; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert_cmt_locked(c); + + if (ubifs_gc_should_commit(c)) + return -EAGAIN; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_media) { + ret = -EROFS; + goto out_unlock; + } + + /* We expect the write-buffer to be empty on entry */ + ubifs_assert(!wbuf->used); + + for (i = 0; ; i++) { + int space_before = c->leb_size - wbuf->offs - wbuf->used; + int space_after; + + cond_resched(); + + /* Give the commit an opportunity to run */ + if (ubifs_gc_should_commit(c)) { + ret = -EAGAIN; + break; + } + + if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) { + /* + * We've done enough iterations. Indexing LEBs were + * moved and will be available after the commit. + */ + dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + break; + } + + if (i > HARD_LEBS_LIMIT) { + /* + * We've moved too many LEBs and have not made + * progress, give up. + */ + dbg_gc("hard limit, -ENOSPC"); + ret = -ENOSPC; + break; + } + + /* + * Empty and freeable LEBs can turn up while we waited for + * the wbuf lock, or while we have been running GC. In that + * case, we should just return one of those instead of + * continuing to GC dirty LEBs. Hence we request + * 'ubifs_find_dirty_leb()' to return an empty LEB if it can. + */ + ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1); + if (ret) { + if (ret == -ENOSPC) + dbg_gc("no more dirty LEBs"); + break; + } + + dbg_gc("found LEB %d: free %d, dirty %d, sum %d " + "(min. space %d)", lp.lnum, lp.free, lp.dirty, + lp.free + lp.dirty, min_space); + + if (lp.free + lp.dirty == c->leb_size) { + /* An empty LEB was returned */ + dbg_gc("LEB %d is free, return it", lp.lnum); + /* + * ubifs_find_dirty_leb() doesn't return freeable index + * LEBs. + */ + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + if (lp.free != c->leb_size) { + /* + * Write buffers must be sync'd before + * unmapping freeable LEBs, because one of them + * may contain data which obsoletes something + * in 'lp.pnum'. + */ + ret = gc_sync_wbufs(c); + if (ret) + goto out; + ret = ubifs_change_one_lp(c, lp.lnum, + c->leb_size, 0, 0, 0, + 0); + if (ret) + goto out; + } + ret = ubifs_leb_unmap(c, lp.lnum); + if (ret) + goto out; + ret = lp.lnum; + break; + } + + space_before = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum == -1) + space_before = 0; + + ret = ubifs_garbage_collect_leb(c, &lp); + if (ret < 0) { + if (ret == -EAGAIN || ret == -ENOSPC) { + /* + * These codes are not errors, so we have to + * return the LEB to lprops. But if the + * 'ubifs_return_leb()' function fails, its + * failure code is propagated to the caller + * instead of the original '-EAGAIN' or + * '-ENOSPC'. 
+ */ + err = ubifs_return_leb(c, lp.lnum); + if (err) + ret = err; + break; + } + goto out; + } + + if (ret == LEB_FREED) { + /* An LEB has been freed and is ready for use */ + dbg_gc("LEB %d freed, return", lp.lnum); + ret = lp.lnum; + break; + } + + if (ret == LEB_FREED_IDX) { + /* + * This was an indexing LEB and it cannot be + * immediately used. And instead of requesting the + * commit straight away, we try to garbage collect some + * more. + */ + dbg_gc("indexing LEB %d freed, continue", lp.lnum); + continue; + } + + ubifs_assert(ret == LEB_RETAINED); + space_after = c->leb_size - wbuf->offs - wbuf->used; + dbg_gc("LEB %d retained, freed %d bytes", lp.lnum, + space_after - space_before); + + if (space_after > space_before) { + /* GC makes progress, keep working */ + min_space >>= 1; + if (min_space < c->dead_wm) + min_space = c->dead_wm; + continue; + } + + dbg_gc("did not make progress"); + + /* + * GC moved an LEB bud have not done any progress. This means + * that the previous GC head LEB contained too few free space + * and the LEB which was GC'ed contained only large nodes which + * did not fit that space. + * + * We can do 2 things: + * 1. pick another LEB in a hope it'll contain a small node + * which will fit the space we have at the end of current GC + * head LEB, but there is no guarantee, so we try this out + * unless we have already been working for too long; + * 2. request an LEB with more dirty space, which will force + * 'ubifs_find_dirty_leb()' to start scanning the lprops + * table, instead of just picking one from the heap + * (previously it already picked the dirtiest LEB). + */ + if (i < SOFT_LEBS_LIMIT) { + dbg_gc("try again"); + continue; + } + + min_space <<= 1; + if (min_space > c->dark_wm) + min_space = c->dark_wm; + dbg_gc("set min. space to %d", min_space); + } + + if (ret == -ENOSPC && !list_empty(&c->idx_gc)) { + dbg_gc("no space, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + if (!err) + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) { + ret = err; + goto out; + } +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return ret; + +out: + ubifs_assert(ret < 0); + ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); + ubifs_ro_mode(c, ret); + ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + ubifs_return_leb(c, lp.lnum); + return ret; +} + +/** + * ubifs_gc_start_commit - garbage collection at start of commit. + * @c: UBIFS file-system description object + * + * If a LEB has only dirty and free space, then we may safely unmap it and make + * it free. Note, we cannot do this with indexing LEBs because dirty space may + * correspond index nodes that are required for recovery. In that case, the + * LEB cannot be unmapped until after the next commit. + * + * This function returns %0 upon success and a negative error code upon failure. + */ +int ubifs_gc_start_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + const struct ubifs_lprops *lp; + int err = 0, flags; + + ubifs_get_lprops(c); + + /* + * Unmap (non-index) freeable LEBs. Note that recovery requires that all + * wbufs are sync'd before this, which is done in 'do_commit()'. 
+ */ + while (1) { + lp = ubifs_fast_find_freeable(c); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + goto out; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + } + + /* Mark GC'd index LEBs OK to unmap after this commit finishes */ + list_for_each_entry(idx_gc, &c->idx_gc, list) + idx_gc->unmap = 1; + + /* Record index freeable LEBs for unmapping after commit */ + while (1) { + lp = ubifs_fast_find_frdi_idx(c); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(lp->flags & LPROPS_INDEX); + /* Don't release the LEB until after the next commit */ + flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); + if (unlikely(IS_ERR(lp))) { + err = PTR_ERR(lp); + kfree(idx_gc); + goto out; + } + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + idx_gc->lnum = lp->lnum; + idx_gc->unmap = 1; + list_add(&idx_gc->list, &c->idx_gc); + } +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_gc_end_commit - garbage collection at end of commit. + * @c: UBIFS file-system description object + * + * This function completes out-of-place garbage collection of index LEBs. + */ +int ubifs_gc_end_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc, *tmp; + struct ubifs_wbuf *wbuf; + int err = 0; + + wbuf = &c->jheads[GCHD].wbuf; + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list) + if (idx_gc->unmap) { + dbg_gc("LEB %d", idx_gc->lnum); + err = ubifs_leb_unmap(c, idx_gc->lnum); + if (err) + goto out; + err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC, + LPROPS_NC, 0, LPROPS_TAKEN, -1); + if (err) + goto out; + list_del(&idx_gc->list); + kfree(idx_gc); + } +out: + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_destroy_idx_gc - destroy idx_gc list. + * @c: UBIFS file-system description object + * + * This function destroys the idx_gc list. It is called when unmounting or + * remounting read-only so locks are not needed. + */ +void ubifs_destroy_idx_gc(struct ubifs_info *c) +{ + while (!list_empty(&c->idx_gc)) { + struct ubifs_gced_idx_leb *idx_gc; + + idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, + list); + c->idx_gc_cnt -= 1; + list_del(&idx_gc->list); + kfree(idx_gc); + } + +} + +/** + * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list. + * @c: UBIFS file-system description object + * + * Called during start commit so locks are not needed. 
+ */
+int ubifs_get_idx_gc_leb(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *idx_gc;
+	int lnum;
+
+	if (list_empty(&c->idx_gc))
+		return -ENOSPC;
+	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
+	lnum = idx_gc->lnum;
+	/* c->idx_gc_cnt is updated by the caller when lprops are updated */
+	list_del(&idx_gc->list);
+	kfree(idx_gc);
+	return lnum;
+}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
new file mode 100644
index 00000000000..3374f91b670
--- /dev/null
+++ b/fs/ubifs/io.c
@@ -0,0 +1,914 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ *          Zoltan Sogor
+ */
+
+/*
+ * This file implements the UBIFS I/O subsystem which provides various
+ * I/O-related helper functions (reading/writing/checking/validating nodes) and
+ * implements write-buffering support. Write buffers help to save space which
+ * otherwise would have been wasted for padding to the nearest minimal I/O unit
+ * boundary. Instead, data first goes to the write-buffer and is flushed when
+ * the buffer is full or when it has not been used for some time (by timer).
+ * This is similar to the mechanism used by JFFS2.
+ *
+ * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
+ * mutexes defined inside these objects. Since sometimes upper-level code
+ * has to lock the write-buffer (e.g. journal space reservation code), many
+ * functions related to write-buffers have a "nolock" suffix, which means that
+ * the caller has to lock the write-buffer before calling them.
+ *
+ * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
+ * aligned, UBIFS starts the next node from the aligned address, and the padded
+ * bytes may contain any rubbish. In other words, UBIFS does not put padding
+ * bytes in those small gaps. Common headers of nodes store real node lengths,
+ * not aligned lengths. Indexing nodes also store real lengths in branches.
+ *
+ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
+ * uses padding nodes or padding bytes, if the padding node does not fit.
+ *
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes
+ * every time they are read from the flash media.
+ */
+
+#include 
+#include "ubifs.h"
+
+/**
+ * ubifs_check_node - check node.
+ * @c: UBIFS file-system description object
+ * @buf: node to check
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ *
+ * This function checks node magic number and CRC checksum. This function also
+ * validates node length to prevent UBIFS from becoming crazy when an attacker
+ * feeds it a file-system image with incorrect nodes. For example, too large
For example, too large + * node length in the common header could cause UBIFS to read memory outside of + * allocated buffer when checking the CRC checksum. + * + * This function returns zero in case of success %-EUCLEAN in case of bad CRC + * or magic. + */ +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet) +{ + int err = -EINVAL, type, node_len; + uint32_t crc, node_crc, magic; + const struct ubifs_ch *ch = buf; + + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + + magic = le32_to_cpu(ch->magic); + if (magic != UBIFS_NODE_MAGIC) { + if (!quiet) + ubifs_err("bad magic %#08x, expected %#08x", + magic, UBIFS_NODE_MAGIC); + err = -EUCLEAN; + goto out; + } + + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + if (!quiet) + ubifs_err("bad node type %d", type); + goto out; + } + + node_len = le32_to_cpu(ch->len); + if (node_len + offs > c->leb_size) + goto out_len; + + if (c->ranges[type].max_len == 0) { + if (node_len != c->ranges[type].len) + goto out_len; + } else if (node_len < c->ranges[type].min_len || + node_len > c->ranges[type].max_len) + goto out_len; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) { + if (!quiet) + ubifs_err("bad CRC: calculated %#08x, read %#08x", + crc, node_crc); + err = -EUCLEAN; + goto out; + } + + return 0; + +out_len: + if (!quiet) + ubifs_err("bad node length %d", node_len); +out: + if (!quiet) { + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + return err; +} + +/** + * ubifs_pad - pad flash space. + * @c: UBIFS file-system description object + * @buf: buffer to put padding to + * @pad: how many bytes to pad + * + * The flash media obliges us to write only in chunks of %c->min_io_size and + * when we have to write less data we add padding node to the write-buffer and + * pad it to the next minimal I/O unit's boundary. Padding nodes help when the + * media is being scanned. If the amount of wasted space is not enough to fit a + * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes + * pattern (%UBIFS_PADDING_BYTE). + * + * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is + * used. + */ +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad) +{ + uint32_t crc; + + ubifs_assert(pad >= 0 && !(pad & 7)); + + if (pad >= UBIFS_PAD_NODE_SZ) { + struct ubifs_ch *ch = buf; + struct ubifs_pad_node *pad_node = buf; + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->node_type = UBIFS_PAD_NODE; + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->padding[0] = ch->padding[1] = 0; + ch->sqnum = 0; + ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ); + pad -= UBIFS_PAD_NODE_SZ; + pad_node->pad_len = cpu_to_le32(pad); + crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8); + ch->crc = cpu_to_le32(crc); + memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); + } else if (pad > 0) + /* Too little space, padding node won't fit */ + memset(buf, UBIFS_PADDING_BYTE, pad); +} + +/** + * next_sqnum - get next sequence number. 
+ * @c: UBIFS file-system description object + */ +static unsigned long long next_sqnum(struct ubifs_info *c) +{ + unsigned long long sqnum; + + spin_lock(&c->cnt_lock); + sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + + if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { + if (sqnum >= SQNUM_WATERMARK) { + ubifs_err("sequence number overflow %llu, end of life", + sqnum); + ubifs_ro_mode(c, -EINVAL); + } + ubifs_warn("running out of sequence numbers, end of life soon"); + } + + return sqnum; +} + +/** + * ubifs_prepare_node - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. + */ +void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); + + if (pad) { + len = ALIGN(len, 8); + pad = ALIGN(len, c->min_io_size) - len; + ubifs_pad(c, node + len, pad); + } +} + +/** + * ubifs_prep_grp_node - prepare node of a group to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @last: indicates the last node of the group + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC and fills the common header. + */ +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + if (last) + ch->group_type = UBIFS_LAST_OF_NODE_GROUP; + else + ch->group_type = UBIFS_IN_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); +} + +/** + * wbuf_timer_callback - write-buffer timer callback function. + * @data: timer data (write-buffer descriptor) + * + * This function is called when the write-buffer timer expires. + */ +static void wbuf_timer_callback_nolock(unsigned long data) +{ + struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data; + + wbuf->need_sync = 1; + wbuf->c->need_wbuf_sync = 1; + ubifs_wake_up_bgt(wbuf->c); +} + +/** + * new_wbuf_timer - start new write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + ubifs_assert(!timer_pending(&wbuf->timer)); + + if (!wbuf->timeout) + return; + + wbuf->timer.expires = jiffies + wbuf->timeout; + add_timer(&wbuf->timer); +} + +/** + * cancel_wbuf_timer - cancel write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + /* + * If the syncer is waiting for the lock (from the background thread's + * context) and another task is changing write-buffer then the syncing + * should be canceled. 
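+ *
+ * Clearing 'need_sync' makes 'ubifs_bg_wbufs_sync()' skip this write-buffer,
+ * and deleting the timer stops further wake-ups until it is re-armed by
+ * 'new_wbuf_timer_nolock()'.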
+ */ + wbuf->need_sync = 0; + del_timer(&wbuf->timer); +} + +/** + * ubifs_wbuf_sync_nolock - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This function synchronizes write-buffer @buf and returns zero in case of + * success or a negative error code in case of failure. + */ +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) +{ + struct ubifs_info *c = wbuf->c; + int err, dirt; + + cancel_wbuf_timer_nolock(wbuf); + if (!wbuf->used || wbuf->lnum == -1) + /* Write-buffer is empty or not seeked */ + return 0; + + dbg_io("LEB %d:%d, %d bytes", + wbuf->lnum, wbuf->offs, wbuf->used); + ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); + ubifs_assert(!(wbuf->avail & 7)); + ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); + + if (c->ro_media) + return -EROFS; + + ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + c->min_io_size, wbuf->dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d", + c->min_io_size, wbuf->lnum, wbuf->offs); + dbg_dump_stack(); + return err; + } + + dirt = wbuf->avail; + + spin_lock(&wbuf->lock); + wbuf->offs += c->min_io_size; + wbuf->avail = c->min_io_size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + + if (wbuf->sync_callback) + err = wbuf->sync_callback(c, wbuf->lnum, + c->leb_size - wbuf->offs, dirt); + return err; +} + +/** + * ubifs_wbuf_seek_nolock - seek write-buffer. + * @wbuf: write-buffer + * @lnum: logical eraseblock number to seek to + * @offs: logical eraseblock offset to seek to + * @dtype: data type + * + * This function targets the write buffer to logical eraseblock @lnum:@offs. + * The write-buffer is synchronized if it is not empty. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype) +{ + const struct ubifs_info *c = wbuf->c; + + dbg_io("LEB %d:%d", lnum, offs); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); + ubifs_assert(offs >= 0 && offs <= c->leb_size); + ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); + ubifs_assert(lnum != wbuf->lnum); + + if (wbuf->used > 0) { + int err = ubifs_wbuf_sync_nolock(wbuf); + + if (err) + return err; + } + + spin_lock(&wbuf->lock); + wbuf->lnum = lnum; + wbuf->offs = offs; + wbuf->avail = c->min_io_size; + wbuf->used = 0; + spin_unlock(&wbuf->lock); + wbuf->dtype = dtype; + + return 0; +} + +/** + * ubifs_bg_wbufs_sync - synchronize write-buffers. + * @c: UBIFS file-system description object + * + * This function is called by background thread to synchronize write-buffers. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_bg_wbufs_sync(struct ubifs_info *c) +{ + int err, i; + + if (!c->need_wbuf_sync) + return 0; + c->need_wbuf_sync = 0; + + if (c->ro_media) { + err = -EROFS; + goto out_timers; + } + + dbg_io("synchronize"); + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + cond_resched(); + + /* + * If the mutex is locked then wbuf is being changed, so + * synchronization is not necessary. 
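+ *
+ * The task that holds the mutex either flushes the buffer itself or
+ * leaves data in it and re-arms the timer, so the buffer will still be
+ * synchronized eventually.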
+ */ + if (mutex_is_locked(&wbuf->io_mutex)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (!wbuf->need_sync) { + mutex_unlock(&wbuf->io_mutex); + continue; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + if (err) { + ubifs_err("cannot sync write-buffer, error %d", err); + ubifs_ro_mode(c, err); + goto out_timers; + } + } + + return 0; + +out_timers: + /* Cancel all timers to prevent repeated errors */ + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + cancel_wbuf_timer_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + } + return err; +} + +/** + * ubifs_wbuf_write_nolock - write data to flash via write-buffer. + * @wbuf: write-buffer + * @buf: node to write + * @len: node length + * + * This function writes data to flash via write-buffer @wbuf. This means that + * the last piece of the node won't reach the flash media immediately if it + * does not take whole minimal I/O unit. Instead, the node will sit in RAM + * until the write-buffer is synchronized (e.g., by timer). + * + * This function returns zero in case of success and a negative error code in + * case of failure. If the node cannot be written because there is no more + * space in this logical eraseblock, %-ENOSPC is returned. + */ +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) +{ + struct ubifs_info *c = wbuf->c; + int err, written, n, aligned_len = ALIGN(len, 8), offs; + + dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum, + wbuf->offs + wbuf->used); + ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); + ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); + ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); + ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); + + if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { + err = -ENOSPC; + goto out; + } + + cancel_wbuf_timer_nolock(wbuf); + + if (c->ro_media) + return -EROFS; + + if (aligned_len <= wbuf->avail) { + /* + * The node is not very large and fits entirely within + * write-buffer. + */ + memcpy(wbuf->buf + wbuf->used, buf, len); + + if (aligned_len == wbuf->avail) { + dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, + wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, + wbuf->offs, c->min_io_size, + wbuf->dtype); + if (err) + goto out; + + spin_lock(&wbuf->lock); + wbuf->offs += c->min_io_size; + wbuf->avail = c->min_io_size; + wbuf->used = 0; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + } else { + spin_lock(&wbuf->lock); + wbuf->avail -= aligned_len; + wbuf->used += aligned_len; + spin_unlock(&wbuf->lock); + } + + goto exit; + } + + /* + * The node is large enough and does not fit entirely within current + * minimal I/O unit. We have to fill and flush write-buffer and switch + * to the next min. I/O unit. + */ + dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + c->min_io_size, wbuf->dtype); + if (err) + goto out; + + offs = wbuf->offs + c->min_io_size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written = wbuf->avail; + + /* + * The remaining data may take more whole min. I/O units, so write the + * remains multiple to min. I/O unit size directly to the flash media. 
+ * We align node length to 8-byte boundary because we anyway flash wbuf + * if the remaining space is less than 8 bytes. + */ + n = aligned_len >> c->min_io_shift; + if (n) { + n <<= c->min_io_shift; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, + wbuf->dtype); + if (err) + goto out; + offs += n; + aligned_len -= n; + len -= n; + written += n; + } + + spin_lock(&wbuf->lock); + if (aligned_len) + /* + * And now we have what's left and what does not take whole + * min. I/O unit, so write it to the write-buffer and we are + * done. + */ + memcpy(wbuf->buf, buf + written, len); + + wbuf->offs = offs; + wbuf->used = aligned_len; + wbuf->avail = c->min_io_size - aligned_len; + wbuf->next_ino = 0; + spin_unlock(&wbuf->lock); + +exit: + if (wbuf->sync_callback) { + int free = c->leb_size - wbuf->offs - wbuf->used; + + err = wbuf->sync_callback(c, wbuf->lnum, free, 0); + if (err) + goto out; + } + + if (wbuf->used) + new_wbuf_timer_nolock(wbuf); + + return 0; + +out: + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + len, wbuf->lnum, wbuf->offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + dbg_dump_leb(c, wbuf->lnum); + return err; +} + +/** + * ubifs_write_node - write node to the media. + * @c: UBIFS file-system description object + * @buf: the node to write + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) + * + * This function automatically fills node magic number, assigns sequence + * number, and calculates node CRC checksum. The length of the @buf buffer has + * to be aligned to the minimal I/O unit size. This function automatically + * appends padding node and padding bytes if needed. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int dtype) +{ + int err, buf_len = ALIGN(len, c->min_io_size); + + dbg_io("LEB %d:%d, %s, length %d (aligned %d)", + lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len, + buf_len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); + + if (c->ro_media) + return -EROFS; + + ubifs_prepare_node(c, buf, len, 1); + err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + buf_len, lnum, offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + + return err; +} + +/** + * ubifs_read_node_wbuf - read node from the media or write-buffer. + * @wbuf: wbuf to check for un-written data + * @buf: buffer to read to + * @type: node type + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and length, checks it and stores + * in @buf. If the node partially or fully sits in the write-buffer, this + * function takes data from the buffer, otherwise it reads the flash media. + * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative + * error code in case of failure. 
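+ *
+ * For example (illustration only), if the write-buffer is positioned at
+ * LEB 7 offset 2048 and a 512-byte node is requested from LEB 7 offset 1792,
+ * the first 256 bytes are read from the flash media and the remaining
+ * 256 bytes are copied from the beginning of the write-buffer.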
+ */ +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs) +{ + const struct ubifs_info *c = wbuf->c; + int err, rlen, overlap; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_read_node(c, buf, type, len, lnum, offs); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) { + /* Read everything that goes before write-buffer */ + err = ubi_read(c->ubi, lnum, buf, offs, rlen); + if (err && err != -EBADMSG) { + ubifs_err("failed to read node %d from LEB %d:%d, " + "error %d", type, lnum, offs, err); + dbg_dump_stack(); + return err; + } + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + rlen = le32_to_cpu(ch->len); + if (rlen != len) { + ubifs_err("bad node length %d, expected %d", rlen, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_read_node - read node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and and length, checks it and + * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched + * and a negative error code in case of failure. + */ +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs) +{ + int err, l; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) { + ubifs_err("cannot read node %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + l = le32_to_cpu(ch->len); + if (l != len) { + ubifs_err("bad node length %d, expected %d", l, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_wbuf_init - initialize write-buffer. 
+ * @c: UBIFS file-system description object + * @wbuf: write-buffer to initialize + * + * This function initializes write buffer. Returns zero in case of success + * %-ENOMEM in case of failure. + */ +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) +{ + size_t size; + + wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + if (!wbuf->buf) + return -ENOMEM; + + size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + wbuf->inodes = kmalloc(size, GFP_KERNEL); + if (!wbuf->inodes) { + kfree(wbuf->buf); + wbuf->buf = NULL; + return -ENOMEM; + } + + wbuf->used = 0; + wbuf->lnum = wbuf->offs = -1; + wbuf->avail = c->min_io_size; + wbuf->dtype = UBI_UNKNOWN; + wbuf->sync_callback = NULL; + mutex_init(&wbuf->io_mutex); + spin_lock_init(&wbuf->lock); + + wbuf->c = c; + init_timer(&wbuf->timer); + wbuf->timer.function = wbuf_timer_callback_nolock; + wbuf->timer.data = (unsigned long)wbuf; + wbuf->timeout = DEFAULT_WBUF_TIMEOUT; + wbuf->next_ino = 0; + + return 0; +} + +/** + * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. + * @wbuf: the write-buffer whereto add + * @inum: the inode number + * + * This function adds an inode number to the inode array of the write-buffer. + */ +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum) +{ + if (!wbuf->buf) + /* NOR flash or something similar */ + return; + + spin_lock(&wbuf->lock); + if (wbuf->used) + wbuf->inodes[wbuf->next_ino++] = inum; + spin_unlock(&wbuf->lock); +} + +/** + * wbuf_has_ino - returns if the wbuf contains data from the inode. + * @wbuf: the write-buffer + * @inum: the inode number + * + * This function returns with %1 if the write-buffer contains some data from the + * given inode otherwise it returns with %0. + */ +static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum) +{ + int i, ret = 0; + + spin_lock(&wbuf->lock); + for (i = 0; i < wbuf->next_ino; i++) + if (inum == wbuf->inodes[i]) { + ret = 1; + break; + } + spin_unlock(&wbuf->lock); + + return ret; +} + +/** + * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode. + * @c: UBIFS file-system description object + * @inode: inode to synchronize + * + * This function synchronizes write-buffers which contain nodes belonging to + * @inode. Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode) +{ + int i, err = 0; + + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + if (i == GCHD) + /* + * GC head is special, do not look at it. Even if the + * head contains something related to this inode, it is + * a _copy_ of corresponding on-flash node which sits + * somewhere else. + */ + continue; + + if (!wbuf_has_ino(wbuf, inode->i_ino)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (wbuf_has_ino(wbuf, inode->i_ino)) + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + + if (err) { + ubifs_ro_mode(c, err); + return err; + } + } + return 0; +} diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c new file mode 100644 index 00000000000..5e82cffe969 --- /dev/null +++ b/fs/ubifs/ioctl.c @@ -0,0 +1,204 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. 
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Zoltan Sogor + * Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements EXT2-compatible extended attribute ioctl() calls */ + +#include +#include +#include +#include "ubifs.h" + +/** + * ubifs_set_inode_flags - set VFS inode flags. + * @inode: VFS inode to set flags for + * + * This function propagates flags from UBIFS inode object to VFS inode object. + */ +void ubifs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = ubifs_inode(inode)->flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + if (flags & UBIFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & UBIFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & UBIFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & UBIFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* + * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. + * @ioctl_flags: flags to convert + * + * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * (@UBIFS_COMPR_FL, etc). + */ +static int ioctl2ubifs(int ioctl_flags) +{ + int ubifs_flags = 0; + + if (ioctl_flags & FS_COMPR_FL) + ubifs_flags |= UBIFS_COMPR_FL; + if (ioctl_flags & FS_SYNC_FL) + ubifs_flags |= UBIFS_SYNC_FL; + if (ioctl_flags & FS_APPEND_FL) + ubifs_flags |= UBIFS_APPEND_FL; + if (ioctl_flags & FS_IMMUTABLE_FL) + ubifs_flags |= UBIFS_IMMUTABLE_FL; + if (ioctl_flags & FS_DIRSYNC_FL) + ubifs_flags |= UBIFS_DIRSYNC_FL; + + return ubifs_flags; +} + +/* + * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. + * @ubifs_flags: flags to convert + * + * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags + * (@FS_COMPR_FL, etc). + */ +static int ubifs2ioctl(int ubifs_flags) +{ + int ioctl_flags = 0; + + if (ubifs_flags & UBIFS_COMPR_FL) + ioctl_flags |= FS_COMPR_FL; + if (ubifs_flags & UBIFS_SYNC_FL) + ioctl_flags |= FS_SYNC_FL; + if (ubifs_flags & UBIFS_APPEND_FL) + ioctl_flags |= FS_APPEND_FL; + if (ubifs_flags & UBIFS_IMMUTABLE_FL) + ioctl_flags |= FS_IMMUTABLE_FL; + if (ubifs_flags & UBIFS_DIRSYNC_FL) + ioctl_flags |= FS_DIRSYNC_FL; + + return ioctl_flags; +} + +static int setflags(struct inode *inode, int flags) +{ + int oldflags, err, release; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. 
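+ * In this case that is CAP_LINUX_IMMUTABLE, which is checked below.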
+ */ + mutex_lock(&ui->ui_mutex); + oldflags = ubifs2ioctl(ui->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock; + } + } + + ui->flags = ioctl2ubifs(flags); + ubifs_set_inode_flags(inode); + inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = write_inode_now(inode, 1); + return err; + +out_unlock: + ubifs_err("can't modify inode %lu attributes", inode->i_ino); + mutex_unlock(&ui->ui_mutex); + ubifs_release_budget(c, &req); + return err; +} + +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int flags, err; + struct inode *inode = file->f_path.dentry->d_inode; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ubifs2ioctl(ubifs_inode(inode)->flags); + + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + if (IS_RDONLY(inode)) + return -EROFS; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~FS_DIRSYNC_FL; + + /* + * Make sure the file-system is read-write and make sure it + * will not become read-only while we are changing the flags. + */ + err = mnt_want_write(file->f_path.mnt); + if (err) + return err; + err = setflags(inode, flags); + mnt_drop_write(file->f_path.mnt); + return err; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c new file mode 100644 index 00000000000..283155abe5f --- /dev/null +++ b/fs/ubifs/journal.c @@ -0,0 +1,1387 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS journal. + * + * The journal consists of 2 parts - the log and bud LEBs. The log has fixed + * length and position, while a bud logical eraseblock is any LEB in the main + * area. Buds contain file system data - data nodes, inode nodes, etc. The log + * contains only references to buds and some other stuff like commit + * start node. The idea is that when we commit the journal, we do + * not copy the data, the buds just become indexed. Since after the commit the + * nodes in bud eraseblocks become leaf nodes of the file system index tree, we + * use term "bud". 
Analogy is obvious, bud eraseblocks contain nodes which will + * become leafs in the future. + * + * The journal is multi-headed because we want to write data to the journal as + * optimally as possible. It is nice to have nodes belonging to the same inode + * in one LEB, so we may write data owned by different inodes to different + * journal heads, although at present only one data head is used. + * + * For recovery reasons, the base head contains all inode nodes, all directory + * entry nodes and all truncate nodes. This means that the other heads contain + * only data nodes. + * + * Bud LEBs may be half-indexed. For example, if the bud was not full at the + * time of commit, the bud is retained to continue to be used in the journal, + * even though the "front" of the LEB is now indexed. In that case, the log + * reference contains the offset where the bud starts for the purposes of the + * journal. + * + * The journal size has to be limited, because the larger is the journal, the + * longer it takes to mount UBIFS (scanning the journal) and the more memory it + * takes (indexing in the TNC). + * + * All the journal write operations like 'ubifs_jnl_update()' here, which write + * multiple UBIFS nodes to the journal at one go, are atomic with respect to + * unclean reboots. Should the unclean reboot happen, the recovery code drops + * all the nodes. + */ + +#include "ubifs.h" + +/** + * zero_ino_node_unused - zero out unused fields of an on-flash inode node. + * @ino: the inode to zero out + */ +static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) +{ + memset(ino->padding1, 0, 4); + memset(ino->padding2, 0, 26); +} + +/** + * zero_dent_node_unused - zero out unused fields of an on-flash directory + * entry node. + * @dent: the directory entry to zero out + */ +static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) +{ + dent->padding1 = 0; + memset(dent->padding2, 0, 4); +} + +/** + * zero_data_node_unused - zero out unused fields of an on-flash data node. + * @data: the data node to zero out + */ +static inline void zero_data_node_unused(struct ubifs_data_node *data) +{ + memset(data->padding, 0, 2); +} + +/** + * zero_trun_node_unused - zero out unused fields of an on-flash truncation + * node. + * @trun: the truncation node to zero out + */ +static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) +{ + memset(trun->padding, 0, 12); +} + +/** + * reserve_space - reserve space in the journal. + * @c: UBIFS file-system description object + * @jhead: journal head number + * @len: node length + * + * This function reserves space in journal head @head. If the reservation + * succeeded, the journal head stays locked and later has to be unlocked using + * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock + * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and + * other negative error codes in case of other failures. + */ +static int reserve_space(struct ubifs_info *c, int jhead, int len) +{ + int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + /* + * Typically, the base head has smaller nodes written to it, so it is + * better to try to allocate space at the ends of eraseblocks. This is + * what the squeeze parameter does. 
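+ *
+ * ('squeeze' is passed to 'ubifs_find_free_space()' below and makes it
+ * prefer LEBs which are already mostly full, so small base-head nodes fill
+ * up nearly-full eraseblocks instead of starting fresh ones.)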
+ */ + squeeze = (jhead == BASEHD); +again: + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_media) { + err = -EROFS; + goto out_unlock; + } + + avail = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum != -1 && avail >= len) + return 0; + + /* + * Write buffer wasn't seek'ed or there is no enough space - look for an + * LEB with some empty space. + */ + lnum = ubifs_find_free_space(c, len, &free, squeeze); + if (lnum >= 0) { + /* Found an LEB, add it to the journal head */ + offs = c->leb_size - free; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; + /* A new bud was successfully allocated and added to the log */ + goto out; + } + + err = lnum; + if (err != -ENOSPC) + goto out_unlock; + + /* + * No free space, we have to run garbage collector to make + * some. But the write-buffer mutex has to be unlocked because + * GC also takes it. + */ + dbg_jnl("no free space jhead %d, run GC", jhead); + mutex_unlock(&wbuf->io_mutex); + + lnum = ubifs_garbage_collect(c, 0); + if (lnum < 0) { + err = lnum; + if (err != -ENOSPC) + return err; + + /* + * GC could not make a free LEB. But someone else may + * have allocated new bud for this journal head, + * because we dropped @wbuf->io_mutex, so try once + * again. + */ + dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead); + if (retries++ < 2) { + dbg_jnl("retry (%d)", retries); + goto again; + } + + dbg_jnl("return -ENOSPC"); + return err; + } + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + dbg_jnl("got LEB %d for jhead %d", lnum, jhead); + avail = c->leb_size - wbuf->offs - wbuf->used; + + if (wbuf->lnum != -1 && avail >= len) { + /* + * Someone else has switched the journal head and we have + * enough space now. This happens when more then one process is + * trying to write to the same journal head at the same time. + */ + dbg_jnl("return LEB %d back, already have LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs + wbuf->used); + err = ubifs_return_leb(c, lnum); + if (err) + goto out_unlock; + return 0; + } + + err = ubifs_add_bud_to_log(c, jhead, lnum, 0); + if (err) + goto out_return; + offs = 0; + +out: + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); + if (err) + goto out_unlock; + + return 0; + +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return err; + +out_return: + /* An error occurred and the LEB has to be returned to lprops */ + ubifs_assert(err < 0); + err1 = ubifs_return_leb(c, lnum); + if (err1 && err == -EAGAIN) + /* + * Return original error code only if it is not %-EAGAIN, + * which is not really an error. Otherwise, return the error + * code of 'ubifs_return_leb()'. + */ + err = err1; + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * write_node - write node to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @node: node to write + * @len: node length + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function writes a node to reserved space of journal head @jhead. + * Returns zero in case of success and a negative error code in case of + * failure. 
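+ *
+ * The journal head must already be locked and have space reserved in it
+ * ('make_reservation()' does both); the position at which the node will be
+ * written is returned in @lnum and @offs.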
+ */ +static int write_node(struct ubifs_info *c, int jhead, void *node, int len, + int *lnum, int *offs) +{ + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + + dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + ubifs_prepare_node(c, node, len, 0); + + return ubifs_wbuf_write_nolock(wbuf, node, len); +} + +/** + * write_head - write data to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @buf: buffer to write + * @len: length to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * @sync: non-zero if the write-buffer has to by synchronized + * + * This function is the same as 'write_node()' but it does not assume the + * buffer it is writing is a node, so it does not prepare it (which means + * initializing common header and calculating CRC). + */ +static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, + int *lnum, int *offs, int sync) +{ + int err; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + + err = ubifs_wbuf_write_nolock(wbuf, buf, len); + if (err) + return err; + if (sync) + err = ubifs_wbuf_sync_nolock(wbuf); + return err; +} + +/** + * make_reservation - reserve journal space. + * @c: UBIFS file-system description object + * @jhead: journal head + * @len: how many bytes to reserve + * + * This function makes space reservation in journal head @jhead. The function + * takes the commit lock and locks the journal head, and the caller has to + * unlock the head and finish the reservation with 'finish_reservation()'. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, the journal head may be unlocked as soon as the data is written, while + * the commit lock has to be released after the data has been added to the + * TNC. + */ +static int make_reservation(struct ubifs_info *c, int jhead, int len) +{ + int err, cmt_retries = 0, nospc_retries = 0; + +again: + down_read(&c->commit_sem); + err = reserve_space(c, jhead, len); + if (!err) + return 0; + up_read(&c->commit_sem); + + if (err == -ENOSPC) { + /* + * GC could not make any progress. We should try to commit + * once because it could make some dirty space and GC would + * make progress, so make the error -EAGAIN so that the below + * will commit and re-try. + */ + if (nospc_retries++ < 2) { + dbg_jnl("no space, retry"); + err = -EAGAIN; + } + + /* + * This means that the budgeting is incorrect. We always have + * to be able to write to the media, because all operations are + * budgeted. Deletions are not budgeted, though, but we reserve + * an extra LEB for them. + */ + } + + if (err != -EAGAIN) + goto out; + + /* + * -EAGAIN means that the journal is full or too large, or the above + * code wants to do one commit. Do this and re-try. + */ + if (cmt_retries > 128) { + /* + * This should not happen unless the journal size limitations + * are too tough. 
+ */ + ubifs_err("stuck in space allocation"); + err = -ENOSPC; + goto out; + } else if (cmt_retries > 32) + ubifs_warn("too many space allocation re-tries (%d)", + cmt_retries); + + dbg_jnl("-EAGAIN, commit and retry (retried %d times)", + cmt_retries); + cmt_retries += 1; + + err = ubifs_run_commit(c); + if (err) + return err; + goto again; + +out: + ubifs_err("cannot reserve %d bytes in jhead %d, error %d", + len, jhead, err); + if (err == -ENOSPC) { + /* This are some budgeting problems, print useful information */ + down_write(&c->commit_sem); + spin_lock(&c->space_lock); + dbg_dump_stack(); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + cmt_retries = dbg_check_lprops(c); + up_write(&c->commit_sem); + } + return err; +} + +/** + * release_head - release a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * + * This function releases journal head @jhead which was locked by + * the 'make_reservation()' function. It has to be called after each successful + * 'make_reservation()' invocation. + */ +static inline void release_head(struct ubifs_info *c, int jhead) +{ + mutex_unlock(&c->jheads[jhead].wbuf.io_mutex); +} + +/** + * finish_reservation - finish a reservation. + * @c: UBIFS file-system description object + * + * This function finishes journal space reservation. It must be called after + * 'make_reservation()'. + */ +static void finish_reservation(struct ubifs_info *c) +{ + up_read(&c->commit_sem); +} + +/** + * get_dent_type - translate VFS inode mode to UBIFS directory entry type. + * @mode: inode mode + */ +static int get_dent_type(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return UBIFS_ITYPE_REG; + case S_IFDIR: + return UBIFS_ITYPE_DIR; + case S_IFLNK: + return UBIFS_ITYPE_LNK; + case S_IFBLK: + return UBIFS_ITYPE_BLK; + case S_IFCHR: + return UBIFS_ITYPE_CHR; + case S_IFIFO: + return UBIFS_ITYPE_FIFO; + case S_IFSOCK: + return UBIFS_ITYPE_SOCK; + default: + BUG(); + } + return 0; +} + +/** + * pack_inode - pack an inode node. 
+ * @c: UBIFS file-system description object + * @ino: buffer in which to pack inode node + * @inode: inode to pack + * @last: indicates the last node of the group + * @last_reference: non-zero if this is a deletion inode + */ +static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, + const struct inode *inode, int last, + int last_reference) +{ + int data_len = 0; + struct ubifs_inode *ui = ubifs_inode(inode); + + ino->ch.node_type = UBIFS_INO_NODE; + ino_key_init_flash(c, &ino->key, inode->i_ino); + ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); + ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); + ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); + ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); + ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->uid = cpu_to_le32(inode->i_uid); + ino->gid = cpu_to_le32(inode->i_gid); + ino->mode = cpu_to_le32(inode->i_mode); + ino->flags = cpu_to_le32(ui->flags); + ino->size = cpu_to_le64(ui->ui_size); + ino->nlink = cpu_to_le32(inode->i_nlink); + ino->compr_type = cpu_to_le16(ui->compr_type); + ino->data_len = cpu_to_le32(ui->data_len); + ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt); + ino->xattr_size = cpu_to_le32(ui->xattr_size); + ino->xattr_names = cpu_to_le32(ui->xattr_names); + zero_ino_node_unused(ino); + + /* + * Drop the attached data if this is a deletion inode, the data is not + * needed anymore. + */ + if (!last_reference) { + memcpy(ino->data, ui->data, ui->data_len); + data_len = ui->data_len; + } + + ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last); +} + +/** + * mark_inode_clean - mark UBIFS inode as clean. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to mark as clean + * + * This helper function marks UBIFS inode @ui as clean by cleaning the + * @ui->dirty flag and releasing its budget. Note, VFS may still treat the + * inode as dirty and try to write it back, but 'ubifs_write_inode()' would + * just do nothing. + */ +static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) +{ + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + ui->dirty = 0; +} + +/** + * ubifs_jnl_update - update inode. + * @c: UBIFS file-system description object + * @dir: parent inode or host inode in case of extended attributes + * @nm: directory entry name + * @inode: inode to update + * @deletion: indicates a directory entry deletion i.e unlink or rmdir + * @xent: non-zero if the directory entry is an extended attribute entry + * + * This function updates an inode by writing a directory entry (or extended + * attribute entry), the inode itself, and the parent directory inode (or the + * host inode) to the journal. + * + * The function writes the host inode @dir last, which is important in case of + * extended attributes. Indeed, then we guarantee that if the host inode gets + * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed, + * the extended attribute inode gets flushed too. And this is exactly what the + * user expects - synchronizing the host inode synchronizes its extended + * attributes. Similarly, this guarantees that if @dir is synchronized, its + * directory entry corresponding to @nm gets synchronized too. + * + * If the inode (@inode) or the parent directory (@dir) are synchronous, this + * function synchronizes the write-buffer. 
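+ *
+ * On the media the three nodes are written to the base head as a single
+ * group, in this order: the directory entry (or extended attribute entry),
+ * then @inode, then @dir, with the first two padded up to 8-byte boundaries.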
+ * + * This function marks the @dir and @inode inodes as clean and returns zero on + * success. In case of failure, a negative error code is returned. + */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent) +{ + int err, dlen, ilen, len, lnum, ino_offs, dent_offs; + int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); + int last_reference = !!(deletion && inode->i_nlink == 0); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_dent_node *dent; + struct ubifs_ino_node *ino; + union ubifs_key dent_key, ino_key; + + dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + ubifs_assert(dir_ui->data_len == 0); + ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex)); + + dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + ilen = UBIFS_INO_NODE_SZ; + + /* + * If the last reference to the inode is being deleted, then there is + * no need to attach and write inode data, it is being deleted anyway. + * And if the inode is being deleted, no need to synchronize + * write-buffer even if the inode is synchronous. + */ + if (!last_reference) { + ilen += ui->data_len; + sync |= IS_SYNC(inode); + } + + aligned_dlen = ALIGN(dlen, 8); + aligned_ilen = ALIGN(ilen, 8); + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + if (!xent) { + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init(c, &dent_key, dir->i_ino, nm); + } else { + dent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &dent_key, dir->i_ino, nm); + } + + key_write(c, &dent_key, dent->key); + dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); + dent->type = get_dent_type(inode->i_mode); + dent->nlen = cpu_to_le16(nm->len); + memcpy(dent->name, nm->name, nm->len); + dent->name[nm->len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen, 0); + + ino = (void *)dent + aligned_dlen; + pack_inode(c, ino, inode, 0, last_reference); + ino = (void *)ino + aligned_ilen; + pack_inode(c, ino, dir, 1, 0); + + if (last_reference) { + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + } + + err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino); + } + release_head(c, BASEHD); + kfree(dent); + + if (deletion) { + err = ubifs_tnc_remove_nm(c, &dent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, dlen); + } else + err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm); + if (err) + goto out_ro; + + /* + * Note, we do not remove the inode from TNC even if the last reference + * to it has just been deleted, because the inode may still be opened. + * Instead, the inode has been added to orphan lists and the orphan + * subsystem will take further care about it. 
+ */ + ino_key_init(c, &ino_key, inode->i_ino); + ino_offs = dent_offs + aligned_dlen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen); + if (err) + goto out_ro; + + ino_key_init(c, &ino_key, dir->i_ino); + ino_offs += aligned_ilen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + mark_inode_clean(c, dir_ui); + return 0; + +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_write_data - write a data node to the journal. + * @c: UBIFS file-system description object + * @inode: inode the data node belongs to + * @key: node key + * @buf: buffer to write + * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) + * + * This function writes a data node to the journal. Returns %0 if the data node + * was successfully written, and a negative error code in case of failure. + */ +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len) +{ + struct ubifs_data_node *data; + int err, lnum, offs, compr_type, out_len; + int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key), + key_block(c, key), len, DBGKEY(key)); + ubifs_assert(len <= UBIFS_BLOCK_SIZE); + + data = kmalloc(dlen, GFP_NOFS); + if (!data) + return -ENOMEM; + + data->ch.node_type = UBIFS_DATA_NODE; + key_write(c, key, &data->key); + data->size = cpu_to_le32(len); + zero_data_node_unused(data); + + if (!(ui->flags && UBIFS_COMPR_FL)) + /* Compression is disabled for this inode */ + compr_type = UBIFS_COMPR_NONE; + else + compr_type = ui->compr_type; + + out_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(buf, len, &data->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + + dlen = UBIFS_DATA_NODE_SZ + out_len; + data->compr_type = cpu_to_le16(compr_type); + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, DATAHD, dlen); + if (err) + goto out_free; + + err = write_node(c, DATAHD, data, dlen, &lnum, &offs); + if (err) + goto out_release; + ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); + release_head(c, DATAHD); + + err = ubifs_tnc_add(c, key, lnum, offs, dlen); + if (err) + goto out_ro; + + finish_reservation(c); + kfree(data); + return 0; + +out_release: + release_head(c, DATAHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(data); + return err; +} + +/** + * ubifs_jnl_write_inode - flush inode to the journal. + * @c: UBIFS file-system description object + * @inode: inode to flush + * @deletion: inode has been deleted + * + * This function writes inode @inode to the journal. If the inode is + * synchronous, it also synchronizes the write-buffer. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, + int deletion) +{ + int err, len, lnum, offs, sync = 0; + struct ubifs_ino_node *ino; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_jnl("ino %lu%s", inode->i_ino, + deletion ? 
" (last reference)" : ""); + if (deletion) + ubifs_assert(inode->i_nlink == 0); + + len = UBIFS_INO_NODE_SZ; + /* + * If the inode is being deleted, do not write the attached data. No + * need to synchronize the write-buffer either. + */ + if (!deletion) { + len += ui->data_len; + sync = IS_SYNC(inode); + } + ino = kmalloc(len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 1, deletion); + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + inode->i_ino); + release_head(c, BASEHD); + + if (deletion) { + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + goto out_ro; + ubifs_delete_orphan(c, inode->i_ino); + err = ubifs_add_dirt(c, lnum, len); + } else { + union ubifs_key key; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len); + } + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +/** + * ubifs_jnl_rename - rename a directory entry. + * @c: UBIFS file-system description object + * @old_dir: parent inode of directory entry to rename + * @old_dentry: directory entry to rename + * @new_dir: parent inode of directory entry to rename + * @new_dentry: new directory entry (or directory entry to replace) + * @sync: non-zero if the write-buffer has to be synchronized + * + * This function implements the re-name operation which may involve writing up + * to 3 inodes and 2 directory entries. It marks the written inodes as clean + * and returns zero on success. In case of failure, a negative error code is + * returned. 
+ */ +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync) +{ + void *p; + union ubifs_key key; + struct ubifs_dent_node *dent, *dent2; + int err, dlen1, dlen2, ilen, lnum, offs, len; + const struct inode *old_inode = old_dentry->d_inode; + const struct inode *new_inode = new_dentry->d_inode; + int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; + int last_reference = !!(new_inode && new_inode->i_nlink == 0); + int move = (old_dir != new_dir); + struct ubifs_inode *uninitialized_var(new_ui); + + dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu", + old_dentry->d_name.len, old_dentry->d_name.name, + old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(ubifs_inode(old_dir)->data_len == 0); + ubifs_assert(ubifs_inode(new_dir)->data_len == 0); + ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); + ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); + + dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; + dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + if (new_inode) { + new_ui = ubifs_inode(new_inode); + ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); + ilen = UBIFS_INO_NODE_SZ; + if (!last_reference) + ilen += new_ui->data_len; + } else + ilen = 0; + + aligned_dlen1 = ALIGN(dlen1, 8); + aligned_dlen2 = ALIGN(dlen2, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + if (old_dir != new_dir) + len += plen; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + /* Make new dent */ + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent->inum = cpu_to_le64(old_inode->i_ino); + dent->type = get_dent_type(old_inode->i_mode); + dent->nlen = cpu_to_le16(new_dentry->d_name.len); + memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); + dent->name[new_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen1, 0); + + /* Make deletion dent */ + dent2 = (void *)dent + aligned_dlen1; + dent2->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, + &old_dentry->d_name); + dent2->inum = 0; + dent2->type = DT_UNKNOWN; + dent2->nlen = cpu_to_le16(old_dentry->d_name.len); + memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); + dent2->name[old_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent2); + ubifs_prep_grp_node(c, dent2, dlen2, 0); + + p = (void *)dent2 + aligned_dlen2; + if (new_inode) { + pack_inode(c, p, new_inode, 0, last_reference); + p += ALIGN(ilen, 8); + } + + if (!move) + pack_inode(c, p, old_dir, 1, 0); + else { + pack_inode(c, p, old_dir, 0, 0); + p += ALIGN(plen, 8); + pack_inode(c, p, new_dir, 1, 0); + } + + if (last_reference) { + err = ubifs_add_orphan(c, new_inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + } + + err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino); + if (new_inode) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + new_inode->i_ino); + } + 
release_head(c, BASEHD); + + dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, dlen2); + if (err) + goto out_ro; + + dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); + err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + if (err) + goto out_ro; + + offs += aligned_dlen1 + aligned_dlen2; + if (new_inode) { + ino_key_init(c, &key, new_inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, ilen); + if (err) + goto out_ro; + offs += ALIGN(ilen, 8); + } + + ino_key_init(c, &key, old_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + + if (old_dir != new_dir) { + offs += ALIGN(plen, 8); + ino_key_init(c, &key, new_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + } + + finish_reservation(c); + if (new_inode) { + mark_inode_clean(c, new_ui); + spin_lock(&new_ui->ui_lock); + new_ui->synced_i_size = new_ui->ui_size; + spin_unlock(&new_ui->ui_lock); + } + mark_inode_clean(c, ubifs_inode(old_dir)); + if (move) + mark_inode_clean(c, ubifs_inode(new_dir)); + kfree(dent); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, new_inode->i_ino); +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; +} + +/** + * recomp_data_node - re-compress a truncated data node. + * @dn: data node to re-compress + * @new_len: new length + * + * This function is used when an inode is truncated and the last data node of + * the inode has to be re-compressed and re-written. + */ +static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) +{ + void *buf; + int err, len, compr_type, out_len; + + out_len = le32_to_cpu(dn->size); + buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + compr_type = le16_to_cpu(dn->compr_type); + err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + dn->compr_type = cpu_to_le16(compr_type); + dn->size = cpu_to_le32(*new_len); + *new_len = UBIFS_DATA_NODE_SZ + out_len; +out: + kfree(buf); + return err; +} + +/** + * ubifs_jnl_truncate - update the journal for a truncation. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @old_size: old size + * @new_size: new size + * + * When the size of a file decreases due to truncation, a truncation node is + * written, the journal tree is updated, and the last data block is re-written + * if it has been affected. The inode is also updated in order to synchronize + * the new inode size. + * + * This function marks the inode as clean and returns zero on success. In case + * of failure, a negative error code is returned. 
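As a concrete illustration of the block bookkeeping described above, the following standalone sketch (not UBIFS code) mirrors the arithmetic ubifs_jnl_truncate() uses to decide which data block must be re-written and which whole blocks are simply removed from the TNC, assuming the 4KiB UBIFS block size.

#include <stdio.h>

#define BLOCK_SIZE  4096 /* UBIFS_BLOCK_SIZE */
#define BLOCK_SHIFT 12   /* UBIFS_BLOCK_SHIFT */

int main(void)
{
        long long old_size = 20000, new_size = 9000;

        /* Bytes kept in the (possibly re-written) last data block. */
        int last_blk_bytes = new_size & (BLOCK_SIZE - 1);
        unsigned last_blk  = new_size >> BLOCK_SHIFT;

        /* Range of whole blocks whose data nodes are removed from the TNC. */
        unsigned first_removed = (new_size >> BLOCK_SHIFT) +
                                 ((new_size & (BLOCK_SIZE - 1)) ? 1 : 0);
        unsigned last_removed  = (old_size >> BLOCK_SHIFT) -
                                 ((old_size & (BLOCK_SIZE - 1)) ? 0 : 1);

        printf("re-write block %u keeping %d bytes\n", last_blk, last_blk_bytes);
        printf("remove data nodes for blocks %u..%u\n",
               first_removed, last_removed);
        return 0;
}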
+ */ +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size) +{ + union ubifs_key key, to_key; + struct ubifs_ino_node *ino; + struct ubifs_trun_node *trun; + struct ubifs_data_node *uninitialized_var(dn); + int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + ino_t inum = inode->i_ino; + unsigned int blk; + + dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size); + ubifs_assert(!ui->data_len); + ubifs_assert(S_ISREG(inode->i_mode)); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + ino = kmalloc(sz, GFP_NOFS); + if (!ino) + return -ENOMEM; + + trun = (void *)ino + UBIFS_INO_NODE_SZ; + trun->ch.node_type = UBIFS_TRUN_NODE; + trun->inum = cpu_to_le32(inum); + trun->old_size = cpu_to_le64(old_size); + trun->new_size = cpu_to_le64(new_size); + zero_trun_node_unused(trun); + + dlen = new_size & (UBIFS_BLOCK_SIZE - 1); + if (dlen) { + /* Get last data block so it can be truncated */ + dn = (void *)trun + UBIFS_TRUN_NODE_SZ; + blk = new_size >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &key, inum, blk); + dbg_jnl("last block key %s", DBGKEY(&key)); + err = ubifs_tnc_lookup(c, &key, dn); + if (err == -ENOENT) + dlen = 0; /* Not found (so it is a hole) */ + else if (err) + goto out_free; + else { + if (le32_to_cpu(dn->size) <= dlen) + dlen = 0; /* Nothing to do */ + else { + int compr_type = le16_to_cpu(dn->compr_type); + + if (compr_type != UBIFS_COMPR_NONE) { + err = recomp_data_node(dn, &dlen); + if (err) + goto out_free; + } else { + dn->size = cpu_to_le32(dlen); + dlen += UBIFS_DATA_NODE_SZ; + } + zero_data_node_unused(dn); + } + } + } + + /* Must make reservation before allocating sequence numbers */ + len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ; + if (dlen) + len += dlen; + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 0, 0); + ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); + if (dlen) + ubifs_prep_grp_node(c, dn, dlen, 1); + + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum); + release_head(c, BASEHD); + + if (dlen) { + sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ; + err = ubifs_tnc_add(c, &key, lnum, sz, dlen); + if (err) + goto out_ro; + } + + ino_key_init(c, &key, inum); + err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ); + if (err) + goto out_ro; + + bit = new_size & (UBIFS_BLOCK_SIZE - 1); + blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0); + data_key_init(c, &key, inum, blk); + + bit = old_size & (UBIFS_BLOCK_SIZE - 1); + blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); + data_key_init(c, &to_key, inum, blk); + + err = ubifs_tnc_remove_range(c, &key, &to_key); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#ifdef CONFIG_UBIFS_FS_XATTR + +/** + * ubifs_jnl_delete_xattr - delete an extended attribute. 
+ * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @nm: extended attribute entry name + * + * This function delete an extended attribute which is very similar to + * un-linking regular files - it writes a deletion xentry, a deletion inode and + * updates the target inode. Returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm) +{ + int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; + struct ubifs_dent_node *xent; + struct ubifs_ino_node *ino; + union ubifs_key xent_key, key1, key2; + int sync = IS_DIRSYNC(host); + struct ubifs_inode *host_ui = ubifs_inode(host); + + dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + host->i_ino, inode->i_ino, nm->name, + ubifs_inode(inode)->data_len); + ubifs_assert(inode->i_nlink == 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + /* + * Since we are deleting the inode, we do not bother to attach any data + * to it and assume its length is %UBIFS_INO_NODE_SZ. + */ + xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + aligned_xlen = ALIGN(xlen, 8); + hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; + len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); + + xent = kmalloc(len, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) { + kfree(xent); + return err; + } + + xent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &xent_key, host->i_ino, nm); + key_write(c, &xent_key, xent->key); + xent->inum = 0; + xent->type = get_dent_type(inode->i_mode); + xent->nlen = cpu_to_le16(nm->len); + memcpy(xent->name, nm->name, nm->len); + xent->name[nm->len] = '\0'; + zero_dent_node_unused(xent); + ubifs_prep_grp_node(c, xent, xlen, 0); + + ino = (void *)xent + aligned_xlen; + pack_inode(c, ino, inode, 0, 1); + ino = (void *)ino + UBIFS_INO_NODE_SZ; + pack_inode(c, ino, host, 1, 0); + + err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); + if (!sync && !err) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino); + release_head(c, BASEHD); + kfree(xent); + if (err) + goto out_ro; + + /* Remove the extended attribute entry from TNC */ + err = ubifs_tnc_remove_nm(c, &xent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, xlen); + if (err) + goto out_ro; + + /* + * Remove all nodes belonging to the extended attribute inode from TNC. + * Well, there actually must be only one node - the inode itself. + */ + lowest_ino_key(c, &key1, inode->i_ino); + highest_ino_key(c, &key2, inode->i_ino); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + /* And update TNC with the new host inode position */ + ino_key_init(c, &key1, host->i_ino); + err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_change_xattr - change an extended attribute. 
+ * @c: UBIFS file-system description object + * @inode: extended attribute inode + * @host: host inode + * + * This function writes the updated version of an extended attribute inode and + * the host inode tho the journal (to the base head). The host inode is written + * after the extended attribute inode in order to guarantee that the extended + * attribute will be flushed when the inode is synchronized by 'fsync()' and + * consequently, the write-buffer is synchronized. This function returns zero + * in case of success and a negative error code in case of failure. + */ +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, + const struct inode *host) +{ + int err, len1, len2, aligned_len, aligned_len1, lnum, offs; + struct ubifs_inode *host_ui = ubifs_inode(inode); + struct ubifs_ino_node *ino; + union ubifs_key key; + int sync = IS_DIRSYNC(host); + + dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); + ubifs_assert(host->i_nlink > 0); + ubifs_assert(inode->i_nlink > 0); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); + + len1 = UBIFS_INO_NODE_SZ + host_ui->data_len; + len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len; + aligned_len1 = ALIGN(len1, 8); + aligned_len = aligned_len1 + ALIGN(len2, 8); + + ino = kmalloc(aligned_len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, aligned_len); + if (err) + goto out_free; + + pack_inode(c, ino, host, 0, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); + + err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); + if (!sync && !err) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + } + release_head(c, BASEHD); + if (err) + goto out_ro; + + ino_key_init(c, &key, host->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len1); + if (err) + goto out_ro; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&host_ui->ui_lock); + host_ui->synced_i_size = host_ui->ui_size; + spin_unlock(&host_ui->ui_lock); + mark_inode_clean(c, host_ui); + kfree(ino); + return 0; + +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +#endif /* CONFIG_UBIFS_FS_XATTR */ diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h new file mode 100644 index 00000000000..8f747600754 --- /dev/null +++ b/fs/ubifs/key.h @@ -0,0 +1,533 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This header contains various key-related definitions and helper function. 
+ * UBIFS allows several key schemes, so we access key fields only via these + * helpers. At the moment only one key scheme is supported. + * + * Simple key scheme + * ~~~~~~~~~~~~~~~~~ + * + * Keys are 64-bits long. First 32-bits are inode number (parent inode number + * in case of direntry key). Next 3 bits are node type. The last 29 bits are + * 4KiB offset in case of inode node, and direntry hash in case of a direntry + * node. We use "r5" hash borrowed from reiserfs. + */ + +#ifndef __UBIFS_KEY_H__ +#define __UBIFS_KEY_H__ + +/** + * key_r5_hash - R5 hash function (borrowed from reiserfs). + * @s: direntry name + * @len: name length + */ +static inline uint32_t key_r5_hash(const char *s, int len) +{ + uint32_t a = 0; + const signed char *str = (const signed char *)s; + + while (*str) { + a += *str << 4; + a += *str >> 4; + a *= 11; + str++; + } + + a &= UBIFS_S_KEY_HASH_MASK; + + /* + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" + * marker. + */ + if (unlikely(a >= 0 && a <= 2)) + a += 3; + return a; +} + +/** + * key_test_hash - testing hash function. + * @str: direntry name + * @len: name length + */ +static inline uint32_t key_test_hash(const char *str, int len) +{ + uint32_t a = 0; + + len = min_t(uint32_t, len, 4); + memcpy(&a, str, len); + a &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(a >= 0 && a <= 2)) + a += 3; + return a; +} + +/** + * ino_key_init - initialize inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void ino_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * ino_key_init_flash - initialize on-flash inode key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + */ +static inline void ino_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum) +{ + union ubifs_key *key = k; + + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_ino_key - get the lowest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void lowest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0; +} + +/** + * highest_ino_key - get the highest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0xffffffff; +} + +/** + * dent_key_init - initialize directory entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_hash - initialize directory entry key without re-calculating + * hash function. 
+ * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @hash: direntry name hash + */ +static inline void dent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_flash - initialize on-flash directory entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_dent_key - get the lowest possible directory entry key. + * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: parent inode number + */ +static inline void lowest_dent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * xent_key_init - initialize extended attribute entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * xent_key_init_hash - initialize extended attribute entry key without + * re-calculating hash function. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: host inode number + * @hash: extended attribute entry name hash + */ +static inline void xent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * xent_key_init_flash - initialize on-flash extended attribute entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: host inode number + * @nm: extended attribute entry name and length + */ +static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_xent_key - get the lowest possible extended attribute entry key. 
+ * @c: UBIFS file-system description object + * @key: where to store the lowest key + * @inum: host inode number + */ +static inline void lowest_xent_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS; +} + +/** + * data_key_init - initialize data key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * @block: block number + */ +static inline void data_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + unsigned int block) +{ + ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); + key->u32[0] = inum; + key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS); +} + +/** + * data_key_init_flash - initialize on-flash data key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + * @block: block number + */ +static inline void data_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, unsigned int block) +{ + union ubifs_key *key = k; + + ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(block | + (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * trun_key_init - initialize truncation node key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + * + * Note, UBIFS does not have truncation keys on the media and this function is + * only used for purposes of replay. + */ +static inline void trun_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_type - get key type. + * @c: UBIFS file-system description object + * @key: key to get type of + */ +static inline int key_type(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_type_flash - get type of a on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to get type of + */ +static inline int key_type_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * key_inum - fetch inode number from key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return key->u32[0]; +} + +/** + * key_inum_flash - fetch inode number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: key to fetch inode number from + */ +static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[0]); +} + +/** + * key_hash - get directory entry hash. + * @c: UBIFS file-system description object + * @key: the key to get hash from + */ +static inline int key_hash(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_hash_flash - get directory entry hash from an on-flash formatted key. 
+ * @c: UBIFS file-system description object + * @k: the key to get hash from + */ +static inline int key_hash_flash(const struct ubifs_info *c, const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK; +} + +/** + * key_block - get data block number. + * @c: UBIFS file-system description object + * @key: the key to get the block number from + */ +static inline unsigned int key_block(const struct ubifs_info *c, + const union ubifs_key *key) +{ + return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_block_flash - get data block number from an on-flash formatted key. + * @c: UBIFS file-system description object + * @k: the key to get the block number from + */ +static inline unsigned int key_block_flash(const struct ubifs_info *c, + const void *k) +{ + const union ubifs_key *key = k; + + return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK; +} + +/** + * key_read - transform a key to in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_read(const struct ubifs_info *c, const void *from, + union ubifs_key *to) +{ + const union ubifs_key *f = from; + + to->u32[0] = le32_to_cpu(f->j32[0]); + to->u32[1] = le32_to_cpu(f->j32[1]); +} + +/** + * key_write - transform a key from in-memory format. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); + memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * key_write_idx - transform a key from in-memory format for the index. + * @c: UBIFS file-system description object + * @from: the key to transform + * @to: the key to store the result + */ +static inline void key_write_idx(const struct ubifs_info *c, + const union ubifs_key *from, void *to) +{ + union ubifs_key *t = to; + + t->j32[0] = cpu_to_le32(from->u32[0]); + t->j32[1] = cpu_to_le32(from->u32[1]); +} + +/** + * key_copy - copy a key. + * @c: UBIFS file-system description object + * @from: the key to copy from + * @to: the key to copy to + */ +static inline void key_copy(const struct ubifs_info *c, + const union ubifs_key *from, union ubifs_key *to) +{ + to->u64[0] = from->u64[0]; +} + +/** + * keys_cmp - compare keys. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %-1 if @key1 is less than + * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2. + */ +static inline int keys_cmp(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] < key2->u32[0]) + return -1; + if (key1->u32[0] > key2->u32[0]) + return 1; + if (key1->u32[1] < key2->u32[1]) + return -1; + if (key1->u32[1] > key2->u32[1]) + return 1; + + return 0; +} + +/** + * is_hash_key - is a key vulnerable to hash collisions. + * @c: UBIFS file-system description object + * @key: key + * + * This function returns %1 if @key is a hashed key or %0 otherwise. 
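Since several of the helpers above just pack and unpack the two 32-bit words of a simple-scheme key, a tiny standalone sketch (not UBIFS code) may help visualize the layout: 32 bits of inode number, 3 bits of key type and a 29-bit hash or block number. The type values below are placeholders for the real UBIFS_*_KEY constants.

#include <stdio.h>
#include <stdint.h>

#define KEY_HASH_BITS 29
#define KEY_HASH_MASK ((1u << KEY_HASH_BITS) - 1)

enum { KEY_INO = 0, KEY_DATA, KEY_DENT, KEY_XENT }; /* placeholder types */

struct simple_key { uint32_t w0, w1; };

static struct simple_key make_key(uint32_t inum, int type,
                                  uint32_t hash_or_block)
{
        struct simple_key k;

        k.w0 = inum;                              /* (parent) inode number */
        k.w1 = (hash_or_block & KEY_HASH_MASK) |  /* 29-bit hash or block */
               ((uint32_t)type << KEY_HASH_BITS); /* 3-bit node type */
        return k;
}

int main(void)
{
        struct simple_key k = make_key(42, KEY_DENT, 0x1234567);

        printf("inum %u, type %u, hash %#x\n",
               k.w0, k.w1 >> KEY_HASH_BITS, k.w1 & KEY_HASH_MASK);
        return 0;
}

Because directory-entry and extended-attribute keys carry only a 29-bit name hash, two different names can end up with the same key, which is exactly the collision risk is_hash_key() reports.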
+ */ +static inline int is_hash_key(const struct ubifs_info *c, + const union ubifs_key *key) +{ + int type = key_type(c, key); + + return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY; +} + +/** + * key_max_inode_size - get maximum file size allowed by current key format. + * @c: UBIFS file-system description object + */ +static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) +{ + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE; + default: + return 0; + } +} +#endif /* !__UBIFS_KEY_H__ */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c new file mode 100644 index 00000000000..36857b9ed59 --- /dev/null +++ b/fs/ubifs/log.c @@ -0,0 +1,805 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file is a part of UBIFS journal implementation and contains various + * functions which manipulate the log. The log is a fixed area on the flash + * which does not contain any data but refers to buds. The log is a part of the + * journal. + */ + +#include "ubifs.h" + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_bud_bytes(struct ubifs_info *c); +#else +#define dbg_check_bud_bytes(c) 0 +#endif + +/** + * ubifs_search_bud - search bud LEB. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This function searches bud LEB @lnum. Returns bud description object in case + * of success and %NULL if there is no bud with this LEB number. + */ +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + spin_unlock(&c->buds_lock); + return bud; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number to search + * + * This functions returns the wbuf for @lnum or %NULL if there is not one. + */ +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) +{ + struct rb_node *p; + struct ubifs_bud *bud; + int jhead; + + if (!c->jheads) + return NULL; + + spin_lock(&c->buds_lock); + p = c->buds.rb_node; + while (p) { + bud = rb_entry(p, struct ubifs_bud, rb); + if (lnum < bud->lnum) + p = p->rb_left; + else if (lnum > bud->lnum) + p = p->rb_right; + else { + jhead = bud->jhead; + spin_unlock(&c->buds_lock); + return &c->jheads[jhead].wbuf; + } + } + spin_unlock(&c->buds_lock); + return NULL; +} + +/** + * next_log_lnum - switch to the next log LEB. 
+ * @c: UBIFS file-system description object + * @lnum: current log LEB + */ +static inline int next_log_lnum(const struct ubifs_info *c, int lnum) +{ + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; +} + +/** + * empty_log_bytes - calculate amount of empty space in the log. + * @c: UBIFS file-system description object + */ +static inline long long empty_log_bytes(const struct ubifs_info *c) +{ + long long h, t; + + h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; + t = (long long)c->ltail_lnum * c->leb_size; + + if (h >= t) + return c->log_bytes - h + t; + else + return t - h; +} + +/** + * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list. + * @c: UBIFS file-system description object + * @bud: the bud to add + */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct rb_node **p, *parent = NULL; + struct ubifs_bud *b; + struct ubifs_jhead *jhead; + + spin_lock(&c->buds_lock); + p = &c->buds.rb_node; + while (*p) { + parent = *p; + b = rb_entry(parent, struct ubifs_bud, rb); + ubifs_assert(bud->lnum != b->lnum); + if (bud->lnum < b->lnum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&bud->rb, parent, p); + rb_insert_color(&bud->rb, &c->buds); + if (c->jheads) { + jhead = &c->jheads[bud->jhead]; + list_add_tail(&bud->list, &jhead->buds_list); + } else + ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); + + /* + * Note, although this is a new bud, we anyway account this space now, + * before any data has been written to it, because this is about to + * guarantee fixed mount time, and this bud will anyway be read and + * scanned. + */ + c->bud_bytes += c->leb_size - bud->start; + + dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum, + bud->start, bud->jhead, c->bud_bytes); + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_create_buds_lists - create journal head buds lists for remount rw. + * @c: UBIFS file-system description object + */ +void ubifs_create_buds_lists(struct ubifs_info *c) +{ + struct rb_node *p; + + spin_lock(&c->buds_lock); + p = rb_first(&c->buds); + while (p) { + struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb); + struct ubifs_jhead *jhead = &c->jheads[bud->jhead]; + + list_add_tail(&bud->list, &jhead->buds_list); + p = rb_next(p); + } + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_add_bud_to_log - add a new bud to the log. + * @c: UBIFS file-system description object + * @jhead: journal head the bud belongs to + * @lnum: LEB number of the bud + * @offs: starting offset of the bud + * + * This function writes reference node for the new bud LEB @lnum it to the log, + * and adds it to the buds tress. It also makes sure that log size does not + * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success, + * %-EAGAIN if commit is required, and a negative error codes in case of + * failure. 
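The two limits mentioned above can be pictured with a standalone sketch (not UBIFS code): one check keeps the log itself from filling up, the other keeps the total bud (journal) size below the bound that guarantees mount time. The field names mirror the UBIFS ones, but the geometry values are invented.

#include <stdio.h>

struct log_state {
        long long lhead_lnum, lhead_offs;   /* log head position */
        long long ltail_lnum;               /* log tail LEB */
        long long log_bytes;                /* total log size in bytes */
        long long leb_size;
        long long bud_bytes, max_bud_bytes; /* journal (bud) accounting */
        long long min_log_bytes, ref_node_alsz;
};

static long long empty_log_bytes(const struct log_state *c)
{
        long long h = c->lhead_lnum * c->leb_size + c->lhead_offs;
        long long t = c->ltail_lnum * c->leb_size;

        return h >= t ? c->log_bytes - h + t : t - h;
}

/*
 * bud_free_bytes plays the role of "c->leb_size - offs" for the new bud.
 * Returns 0 if the bud may be added, -1 if a commit is required first.
 */
static int may_add_bud(const struct log_state *c, long long bud_free_bytes)
{
        if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes)
                return -1; /* the log itself is too full */
        if (c->bud_bytes + bud_free_bytes > c->max_bud_bytes)
                return -1; /* journal too big for the mount-time limit */
        return 0;
}

int main(void)
{
        struct log_state c = {
                .lhead_lnum = 5, .lhead_offs = 1024, .ltail_lnum = 3,
                .log_bytes = 8 * 128 * 1024, .leb_size = 128 * 1024,
                .bud_bytes = 900 * 1024, .max_bud_bytes = 1024 * 1024,
                .min_log_bytes = 128 * 1024, .ref_node_alsz = 64,
        };

        printf("commit required: %s\n",
               may_add_bud(&c, 128 * 1024) ? "yes" : "no");
        return 0;
}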
+ */ +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) +{ + int err; + struct ubifs_bud *bud; + struct ubifs_ref_node *ref; + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS); + if (!bud) + return -ENOMEM; + ref = kzalloc(c->ref_node_alsz, GFP_NOFS); + if (!ref) { + kfree(bud); + return -ENOMEM; + } + + mutex_lock(&c->log_mutex); + + if (c->ro_media) { + err = -EROFS; + goto out_unlock; + } + + /* Make sure we have enough space in the log */ + if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) { + dbg_log("not enough log space - %lld, required %d", + empty_log_bytes(c), c->min_log_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * Make sure the the amount of space in buds will not exceed + * 'c->max_bud_bytes' limit, because we want to guarantee mount time + * limits. + * + * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes + * because we are holding @c->log_mutex. All @c->bud_bytes take place + * when both @c->log_mutex and @c->bud_bytes are locked. + */ + if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) { + dbg_log("bud bytes %lld (%lld max), require commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_commit_required(c); + err = -EAGAIN; + goto out_unlock; + } + + /* + * If the journal is full enough - start background commit. Note, it is + * OK to read 'c->cmt_state' without spinlock because integer reads + * are atomic in the kernel. + */ + if (c->bud_bytes >= c->bg_bud_bytes && + c->cmt_state == COMMIT_RESTING) { + dbg_log("bud bytes %lld (%lld max), initiate BG commit", + c->bud_bytes, c->max_bud_bytes); + ubifs_request_bg_commit(c); + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(bud->lnum); + ref->offs = cpu_to_le32(bud->start); + ref->jhead = cpu_to_le32(jhead); + + if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { + c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next log LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out_unlock; + } + + if (bud->start == 0) { + /* + * Before writing the LEB reference which refers an empty LEB + * to the log, we have to make sure it is mapped, because + * otherwise we'd risk to refer an LEB with garbage in case of + * an unclean reboot, because the target LEB might have been + * unmapped, but not yet physically erased. + */ + err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM); + if (err) + goto out_unlock; + } + + dbg_log("write ref LEB %d:%d", + c->lhead_lnum, c->lhead_offs); + err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, + c->lhead_offs, UBI_SHORTTERM); + if (err) + goto out_unlock; + + c->lhead_offs += c->ref_node_alsz; + + ubifs_add_bud(c, bud); + + mutex_unlock(&c->log_mutex); + kfree(ref); + return 0; + +out_unlock: + mutex_unlock(&c->log_mutex); + kfree(ref); + kfree(bud); + return err; +} + +/** + * remove_buds - remove used buds. + * @c: UBIFS file-system description object + * + * This function removes use buds from the buds tree. It does not remove the + * buds which are pointed to by journal heads. 
+ */ +static void remove_buds(struct ubifs_info *c) +{ + struct rb_node *p; + + ubifs_assert(list_empty(&c->old_buds)); + c->cmt_bud_bytes = 0; + spin_lock(&c->buds_lock); + p = rb_first(&c->buds); + while (p) { + struct rb_node *p1 = p; + struct ubifs_bud *bud; + struct ubifs_wbuf *wbuf; + + p = rb_next(p); + bud = rb_entry(p1, struct ubifs_bud, rb); + wbuf = &c->jheads[bud->jhead].wbuf; + + if (wbuf->lnum == bud->lnum) { + /* + * Do not remove buds which are pointed to by journal + * heads (non-closed buds). + */ + c->cmt_bud_bytes += wbuf->offs - bud->start; + dbg_log("preserve %d:%d, jhead %d, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + bud->jhead, wbuf->offs - bud->start, + c->cmt_bud_bytes); + bud->start = wbuf->offs; + } else { + c->cmt_bud_bytes += c->leb_size - bud->start; + dbg_log("remove %d:%d, jhead %d, bud bytes %d, " + "cmt_bud_bytes %lld", bud->lnum, bud->start, + bud->jhead, c->leb_size - bud->start, + c->cmt_bud_bytes); + rb_erase(p1, &c->buds); + list_del(&bud->list); + /* + * If the commit does not finish, the recovery will need + * to replay the journal, in which case the old buds + * must be unchanged. Do not release them until post + * commit i.e. do not allow them to be garbage + * collected. + */ + list_add(&bud->list, &c->old_buds); + } + } + spin_unlock(&c->buds_lock); +} + +/** + * ubifs_log_start_commit - start commit. + * @c: UBIFS file-system description object + * @ltail_lnum: return new log tail LEB number + * + * The commit operation starts with writing "commit start" node to the log and + * reference nodes for all journal heads which will define new journal after + * the commit has been finished. The commit start and reference nodes are + * written in one go to the nearest empty log LEB (hence, when commit is + * finished UBIFS may safely unmap all the previous log LEBs). This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) +{ + void *buf; + struct ubifs_cs_node *cs; + struct ubifs_ref_node *ref; + int err, i, max_len, len; + + err = dbg_check_bud_bytes(c); + if (err) + return err; + + max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ; + max_len = ALIGN(max_len, c->min_io_size); + buf = cs = kmalloc(max_len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + cs->cmt_no = cpu_to_le64(c->cmt_no + 1); + ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); + + /* + * Note, we do not lock 'c->log_mutex' because this is the commit start + * phase and we are exclusively using the log. And we do not lock + * write-buffer because nobody can write to the file-system at this + * phase. 
+ */ + + len = UBIFS_CS_NODE_SZ; + for (i = 0; i < c->jhead_cnt; i++) { + int lnum = c->jheads[i].wbuf.lnum; + int offs = c->jheads[i].wbuf.offs; + + if (lnum == -1 || offs == c->leb_size) + continue; + + dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i); + ref = buf + len; + ref->ch.node_type = UBIFS_REF_NODE; + ref->lnum = cpu_to_le32(lnum); + ref->offs = cpu_to_le32(offs); + ref->jhead = cpu_to_le32(i); + + ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0); + len += UBIFS_REF_NODE_SZ; + } + + ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len); + + /* Switch to the next log LEB */ + if (c->lhead_offs) { + c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + if (c->lhead_offs == 0) { + /* Must ensure next LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->lhead_lnum); + if (err) + goto out; + } + + len = ALIGN(len, c->min_io_size); + dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM); + if (err) + goto out; + + *ltail_lnum = c->lhead_lnum; + + c->lhead_offs += len; + if (c->lhead_offs == c->leb_size) { + c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_offs = 0; + } + + remove_buds(c); + + /* + * We have started the commit and now users may use the rest of the log + * for new writes. + */ + c->min_log_bytes = 0; + +out: + kfree(buf); + return err; +} + +/** + * ubifs_log_end_commit - end commit. + * @c: UBIFS file-system description object + * @ltail_lnum: new log tail LEB number + * + * This function is called on when the commit operation was finished. It + * moves log tail to new position and unmaps LEBs which contain obsolete data. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) +{ + int err; + + /* + * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS + * writes during commit. Its only short "commit" start phase when + * writers are blocked. + */ + mutex_lock(&c->log_mutex); + + dbg_log("old tail was LEB %d:0, new tail is LEB %d:0", + c->ltail_lnum, ltail_lnum); + + c->ltail_lnum = ltail_lnum; + /* + * The commit is finished and from now on it must be guaranteed that + * there is always enough space for the next commit. + */ + c->min_log_bytes = c->leb_size; + + spin_lock(&c->buds_lock); + c->bud_bytes -= c->cmt_bud_bytes; + spin_unlock(&c->buds_lock); + + err = dbg_check_bud_bytes(c); + + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * ubifs_log_post_commit - things to do after commit is completed. + * @c: UBIFS file-system description object + * @old_ltail_lnum: old log tail LEB number + * + * Release buds only after commit is completed, because they must be unchanged + * if recovery is needed. + * + * Unmap log LEBs only after commit is completed, because they may be needed for + * recovery. + * + * This function returns %0 on success and a negative error code on failure. 
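The "unmap obsolete log LEBs" step walks the log circularly from the old tail up to, but not including, the new tail. A minimal standalone sketch (not UBIFS code), with made-up log bounds, shows the wrap-around:

#include <stdio.h>

#define LOG_FIRST 3  /* first log LEB (stands in for UBIFS_LOG_LNUM) */
#define LOG_LAST  10 /* last log LEB (stands in for c->log_last) */

static int next_log_lnum(int lnum)
{
        lnum += 1;
        return lnum > LOG_LAST ? LOG_FIRST : lnum; /* wrap around */
}

int main(void)
{
        int old_ltail = 9, new_ltail = 5, lnum;

        for (lnum = old_ltail; lnum != new_ltail; lnum = next_log_lnum(lnum))
                printf("unmap obsolete log LEB %d\n", lnum);
        return 0;
}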
+ */ +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) +{ + int lnum, err = 0; + + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + err = ubifs_return_leb(c, bud->lnum); + if (err) + return err; + list_del(&bud->list); + kfree(bud); + } + mutex_lock(&c->log_mutex); + for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; + lnum = next_log_lnum(c, lnum)) { + dbg_log("unmap log LEB %d", lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + goto out; + } +out: + mutex_unlock(&c->log_mutex); + return err; +} + +/** + * struct done_ref - references that have been done. + * @rb: rb-tree node + * @lnum: LEB number + */ +struct done_ref { + struct rb_node rb; + int lnum; +}; + +/** + * done_already - determine if a reference has been done already. + * @done_tree: rb-tree to store references that have been done + * @lnum: LEB number of reference + * + * This function returns %1 if the reference has been done, %0 if not, otherwise + * a negative error code is returned. + */ +static int done_already(struct rb_root *done_tree, int lnum) +{ + struct rb_node **p = &done_tree->rb_node, *parent = NULL; + struct done_ref *dr; + + while (*p) { + parent = *p; + dr = rb_entry(parent, struct done_ref, rb); + if (lnum < dr->lnum) + p = &(*p)->rb_left; + else if (lnum > dr->lnum) + p = &(*p)->rb_right; + else + return 1; + } + + dr = kzalloc(sizeof(struct done_ref), GFP_NOFS); + if (!dr) + return -ENOMEM; + + dr->lnum = lnum; + + rb_link_node(&dr->rb, parent, p); + rb_insert_color(&dr->rb, done_tree); + + return 0; +} + +/** + * destroy_done_tree - destroy the done tree. + * @done_tree: done tree to destroy + */ +static void destroy_done_tree(struct rb_root *done_tree) +{ + struct rb_node *this = done_tree->rb_node; + struct done_ref *dr; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + dr = rb_entry(this, struct done_ref, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &dr->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(dr); + } +} + +/** + * add_node - add a node to the consolidated log. + * @c: UBIFS file-system description object + * @buf: buffer to which to add + * @lnum: LEB number to which to write is passed and returned here + * @offs: offset to where to write is passed and returned here + * @node: node to add + * + * This function returns %0 on success and a negative error code on failure. + */ +static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, + void *node) +{ + struct ubifs_ch *ch = node; + int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs; + + if (len > remains) { + int sz = ALIGN(*offs, c->min_io_size), err; + + ubifs_pad(c, buf + *offs, sz - *offs); + err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); + if (err) + return err; + *lnum = next_log_lnum(c, *lnum); + *offs = 0; + } + memcpy(buf + *offs, node, len); + *offs += ALIGN(len, 8); + return 0; +} + +/** + * ubifs_consolidate_log - consolidate the log. + * @c: UBIFS file-system description object + * + * Repeated failed commits could cause the log to be full, but at least 1 LEB is + * needed for commit. This function rewrites the reference nodes in the log + * omitting duplicates, and failed CS nodes, and leaving no gaps. + * + * This function returns %0 on success and a negative error code on failure. 
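Stripped of the on-flash details, the consolidation pass reduces to: keep the first commit-start node, keep each bud reference once, drop everything else. The standalone sketch below (not UBIFS code) shows that filtering with a plain bitmap where the real code uses the done_already() rb-tree.

#include <stdio.h>
#include <stdbool.h>

enum { NODE_CS, NODE_REF };

struct node { int type; int ref_lnum; };

int main(void)
{
        /* A log left behind by repeated failed commits. */
        struct node log[] = {
                { NODE_CS, 0 }, { NODE_REF, 12 }, { NODE_REF, 14 },
                { NODE_CS, 0 }, { NODE_REF, 12 }, { NODE_REF, 17 },
        };
        bool seen[32] = { false };
        bool first_cs = true;
        size_t i;

        for (i = 0; i < sizeof(log) / sizeof(log[0]); i++) {
                if (log[i].type == NODE_CS) {
                        if (first_cs)
                                printf("keep commit start node\n");
                        first_cs = false;
                } else if (!seen[log[i].ref_lnum]) {
                        seen[log[i].ref_lnum] = true;
                        printf("keep ref to bud LEB %d\n", log[i].ref_lnum);
                }
        }
        return 0;
}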
+ */ +int ubifs_consolidate_log(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct rb_root done_tree = RB_ROOT; + int lnum, err, first = 1, write_lnum, offs = 0; + void *buf; + + dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum, + c->lhead_lnum); + buf = vmalloc(c->leb_size); + if (!buf) + return -ENOMEM; + lnum = c->ltail_lnum; + write_lnum = lnum; + while (1) { + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + goto out_free; + } + list_for_each_entry(snod, &sleb->nodes, list) { + switch (snod->type) { + case UBIFS_REF_NODE: { + struct ubifs_ref_node *ref = snod->node; + int ref_lnum = le32_to_cpu(ref->lnum); + + err = done_already(&done_tree, ref_lnum); + if (err < 0) + goto out_scan; + if (err != 1) { + err = add_node(c, buf, &write_lnum, + &offs, snod->node); + if (err) + goto out_scan; + } + break; + } + case UBIFS_CS_NODE: + if (!first) + break; + err = add_node(c, buf, &write_lnum, &offs, + snod->node); + if (err) + goto out_scan; + first = 0; + break; + } + } + ubifs_scan_destroy(sleb); + if (lnum == c->lhead_lnum) + break; + lnum = next_log_lnum(c, lnum); + } + if (offs) { + int sz = ALIGN(offs, c->min_io_size); + + ubifs_pad(c, buf + offs, sz - offs); + err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM); + if (err) + goto out_free; + offs = ALIGN(offs, c->min_io_size); + } + destroy_done_tree(&done_tree); + vfree(buf); + if (write_lnum == c->lhead_lnum) { + ubifs_err("log is too full"); + return -EINVAL; + } + /* Unmap remaining LEBs */ + lnum = write_lnum; + do { + lnum = next_log_lnum(c, lnum); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } while (lnum != c->lhead_lnum); + c->lhead_lnum = write_lnum; + c->lhead_offs = offs; + dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs); + return 0; + +out_scan: + ubifs_scan_destroy(sleb); +out_free: + destroy_done_tree(&done_tree); + vfree(buf); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_bud_bytes - make sure bud bytes calculation are all right. + * @c: UBIFS file-system description object + * + * This function makes sure the amount of flash space used by closed buds + * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in + * case of failure. + */ +static int dbg_check_bud_bytes(struct ubifs_info *c) +{ + int i, err = 0; + struct ubifs_bud *bud; + long long bud_bytes = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + spin_lock(&c->buds_lock); + for (i = 0; i < c->jhead_cnt; i++) + list_for_each_entry(bud, &c->jheads[i].buds_list, list) + bud_bytes += c->leb_size - bud->start; + + if (c->bud_bytes != bud_bytes) { + ubifs_err("bad bud_bytes %lld, calculated %lld", + c->bud_bytes, bud_bytes); + err = -EINVAL; + } + spin_unlock(&c->buds_lock); + + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c new file mode 100644 index 00000000000..2ba93da71b6 --- /dev/null +++ b/fs/ubifs/lprops.c @@ -0,0 +1,1357 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the functions that access LEB properties and their + * categories. LEBs are categorized based on the needs of UBIFS, and the + * categories are stored as either heaps or lists to provide a fast way of + * finding a LEB in a particular category. For example, UBIFS may need to find + * an empty LEB for the journal, or a very dirty LEB for garbage collection. + */ + +#include "ubifs.h" + +/** + * get_heap_comp_val - get the LEB properties value for heap comparisons. + * @lprops: LEB properties + * @cat: LEB category + */ +static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_FREE: + return lprops->free; + case LPROPS_DIRTY_IDX: + return lprops->free + lprops->dirty; + default: + return lprops->dirty; + } +} + +/** + * move_up_lpt_heap - move a new heap entry up as far as possible. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @cat: LEB category + * + * New entries to a heap are added at the bottom and then moved up until the + * parent's value is greater. In the case of LPT's category heaps, the value + * is either the amount of free space or the amount of dirty space, depending + * on the category. + */ +static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int cat) +{ + int val1, val2, hpos; + + hpos = lprops->hpos; + if (!hpos) + return; /* Already top of the heap */ + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater, move up the heap */ + do { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val2 >= val1) + return; + /* Greater than parent so move up */ + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + } while (hpos); +} + +/** + * adjust_lpt_heap - move a changed heap entry up or down the heap. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @hpos: heap position of @lprops + * @cat: LEB category + * + * Changed entries in a heap are moved up or down until the parent's value is + * greater. In the case of LPT's category heaps, the value is either the amount + * of free space or the amount of dirty space, depending on the category. 
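The category heaps are ordinary array-based binary max-heaps, so the "move up or down until the parent is greater" rule is the classic sift operation with parent (i - 1) / 2 and children 2i + 1 and 2i + 2. Below is a minimal standalone sift-down on plain integers (not UBIFS code); adjust_lpt_heap() does the same index arithmetic, only with the comparison value taken from the LEB's free or dirty space and with the hpos back-pointers kept in sync.

#include <stdio.h>

static void sift_down(int *arr, int cnt, int hpos)
{
        while (1) {
                int cpos = 2 * hpos + 1; /* left child */

                if (cpos >= cnt)
                        return;
                if (cpos + 1 < cnt && arr[cpos + 1] > arr[cpos])
                        cpos += 1; /* right child is bigger */
                if (arr[hpos] >= arr[cpos])
                        return;    /* heap property restored */
                /* Swap with the bigger child and continue below it. */
                int tmp = arr[hpos];
                arr[hpos] = arr[cpos];
                arr[cpos] = tmp;
                hpos = cpos;
        }
}

int main(void)
{
        int heap[] = { 10, 80, 70, 40, 50, 60 }; /* root just became smaller */
        int i;

        sift_down(heap, 6, 0);
        for (i = 0; i < 6; i++)
                printf("%d ", heap[i]);
        printf("\n");
        return 0;
}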
+ */ +static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int hpos, int cat) +{ + int val1, val2, val3, cpos; + + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater than parent, move up the heap */ + if (hpos) { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 > val2) { + /* Greater than parent so move up */ + while (1) { + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + if (!hpos) + return; + ppos = (hpos - 1) / 2; + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 <= val2) + return; + /* Still greater than parent so keep going */ + } + } + } + /* Not greater than parent, so compare to children */ + while (1) { + /* Compare to left child */ + cpos = hpos * 2 + 1; + if (cpos >= heap->cnt) + return; + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val2) { + /* Less than left child, so promote biggest child */ + if (cpos + 1 < heap->cnt) { + val3 = get_heap_comp_val(heap->arr[cpos + 1], + cat); + if (val3 > val2) + cpos += 1; /* Right child is bigger */ + } + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + /* Compare to right child */ + cpos += 1; + if (cpos >= heap->cnt) + return; + val3 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val3) { + /* Less than right child, so promote right child */ + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + return; + } +} + +/** + * add_to_lpt_heap - add LEB properties to a LEB category heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category + * + * This function returns %1 if @lprops is added to the heap for LEB category + * @cat, otherwise %0 is returned because the heap is full. + */ +static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if (heap->cnt >= heap->max_cnt) { + const int b = LPT_HEAP_SZ / 2 - 1; + int cpos, val1, val2; + + /* Compare to some other LEB on the bottom of heap */ + /* Pick a position kind of randomly */ + cpos = (((size_t)lprops >> 4) & b) + b; + ubifs_assert(cpos >= b); + ubifs_assert(cpos < LPT_HEAP_SZ); + ubifs_assert(cpos < heap->cnt); + + val1 = get_heap_comp_val(lprops, cat); + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 > val2) { + struct ubifs_lprops *lp; + + lp = heap->arr[cpos]; + lp->flags &= ~LPROPS_CAT_MASK; + lp->flags |= LPROPS_UNCAT; + list_add(&lp->list, &c->uncat_list); + lprops->hpos = cpos; + heap->arr[cpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } + dbg_check_heap(c, heap, cat, -1); + return 0; /* Not added to heap */ + } else { + lprops->hpos = heap->cnt++; + heap->arr[lprops->hpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } +} + +/** + * remove_from_lpt_heap - remove LEB properties from a LEB category heap. 
+ * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category + */ +static void remove_from_lpt_heap(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + ubifs_assert(hpos >= 0 && hpos < heap->cnt); + ubifs_assert(heap->arr[hpos] == lprops); + heap->cnt -= 1; + if (hpos < heap->cnt) { + heap->arr[hpos] = heap->arr[heap->cnt]; + heap->arr[hpos]->hpos = hpos; + adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat); + } + dbg_check_heap(c, heap, cat, -1); +} + +/** + * lpt_heap_replace - replace lprops in a category heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * @cat: LEB category + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * the category heaps to those lprops must be updated to point to the new + * lprops. This function does that. + */ +static void lpt_heap_replace(struct ubifs_info *c, + struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = new_lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + heap->arr[hpos] = new_lprops; +} + +/** + * ubifs_add_to_cat - add LEB properties to a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category to which to add + * + * LEB properties are categorized to enable fast find operations. + */ +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + if (add_to_lpt_heap(c, lprops, cat)) + break; + /* No more room on heap so make it uncategorized */ + cat = LPROPS_UNCAT; + /* Fall through */ + case LPROPS_UNCAT: + list_add(&lprops->list, &c->uncat_list); + break; + case LPROPS_EMPTY: + list_add(&lprops->list, &c->empty_list); + break; + case LPROPS_FREEABLE: + list_add(&lprops->list, &c->freeable_list); + c->freeable_cnt += 1; + break; + case LPROPS_FRDI_IDX: + list_add(&lprops->list, &c->frdi_idx_list); + break; + default: + ubifs_assert(0); + } + lprops->flags &= ~LPROPS_CAT_MASK; + lprops->flags |= cat; +} + +/** + * ubifs_remove_from_cat - remove LEB properties from a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category from which to remove + * + * LEB properties are categorized to enable fast find operations. + */ +static void ubifs_remove_from_cat(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + remove_from_lpt_heap(c, lprops, cat); + break; + case LPROPS_FREEABLE: + c->freeable_cnt -= 1; + ubifs_assert(c->freeable_cnt >= 0); + /* Fall through */ + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FRDI_IDX: + ubifs_assert(!list_empty(&lprops->list)); + list_del(&lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_replace_cat - replace lprops in a category list or heap. 
+ * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops) +{ + int cat; + + cat = new_lprops->flags & LPROPS_CAT_MASK; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + lpt_heap_replace(c, old_lprops, new_lprops, cat); + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_replace(&old_lprops->list, &new_lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_ensure_cat - ensure LEB properties are categorized. + * @c: UBIFS file-system description object + * @lprops: LEB properties + * + * A LEB may have fallen off of the bottom of a heap, and ended up as + * uncategorized even though it has enough space for us now. If that is the case + * this function will put the LEB back onto a heap. + */ +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int cat = lprops->flags & LPROPS_CAT_MASK; + + if (cat != LPROPS_UNCAT) + return; + cat = ubifs_categorize_lprops(c, lprops); + if (cat == LPROPS_UNCAT) + return; + ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT); + ubifs_add_to_cat(c, lprops, cat); +} + +/** + * ubifs_categorize_lprops - categorize LEB properties. + * @c: UBIFS file-system description object + * @lprops: LEB properties to categorize + * + * LEB properties are categorized to enable fast find operations. This function + * returns the LEB category to which the LEB properties belong. Note however + * that if the LEB category is stored as a heap and the heap is full, the + * LEB properties may have their category changed to %LPROPS_UNCAT. + */ +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops) +{ + if (lprops->flags & LPROPS_TAKEN) + return LPROPS_UNCAT; + + if (lprops->free == c->leb_size) { + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return LPROPS_EMPTY; + } + + if (lprops->free + lprops->dirty == c->leb_size) { + if (lprops->flags & LPROPS_INDEX) + return LPROPS_FRDI_IDX; + else + return LPROPS_FREEABLE; + } + + if (lprops->flags & LPROPS_INDEX) { + if (lprops->dirty + lprops->free >= c->min_idx_node_sz) + return LPROPS_DIRTY_IDX; + } else { + if (lprops->dirty >= c->dead_wm && + lprops->dirty > lprops->free) + return LPROPS_DIRTY; + if (lprops->free > 0) + return LPROPS_FREE; + } + + return LPROPS_UNCAT; +} + +/** + * change_category - change LEB properties category. + * @c: UBIFS file-system description object + * @lprops: LEB properties to recategorize + * + * LEB properties are categorized to enable fast find operations. When the LEB + * properties change they must be recategorized. 
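+ *
+ * If the category is unchanged but is one of the heap categories, the lprops
+ * may still need to be moved up or down the heap to keep the heap ordered;
+ * otherwise the lprops are removed from the old category and added to the
+ * new one.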
+ */
+static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+	int old_cat = lprops->flags & LPROPS_CAT_MASK;
+	int new_cat = ubifs_categorize_lprops(c, lprops);
+
+	if (old_cat == new_cat) {
+		struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1];
+
+		/* lprops on a heap now must be moved up or down */
+		if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
+			return; /* Not on a heap */
+		heap = &c->lpt_heap[new_cat - 1];
+		adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat);
+	} else {
+		ubifs_remove_from_cat(c, lprops, old_cat);
+		ubifs_add_to_cat(c, lprops, new_cat);
+	}
+}
+
+/**
+ * ubifs_get_lprops - get reference to LEB properties.
+ * @c: the UBIFS file-system description object
+ *
+ * This function locks lprops. Lprops have to be unlocked by
+ * 'ubifs_release_lprops()'.
+ */
+void ubifs_get_lprops(struct ubifs_info *c)
+{
+	mutex_lock(&c->lp_mutex);
+}
+
+/**
+ * calc_dark - calculate LEB dark space size.
+ * @c: the UBIFS file-system description object
+ * @spc: amount of free and dirty space in the LEB
+ *
+ * This function calculates the amount of dark space in an LEB which has @spc
+ * bytes of free and dirty space, and returns the result.
+ *
+ * Dark space is the space which is not always usable - it depends on which
+ * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
+ * it is dark space, because it cannot fit a large data node. So UBIFS cannot
+ * count on this LEB and treat these 512 bytes as usable because it is not true
+ * if, for example, only big chunks of uncompressible data will be written to
+ * the FS.
+ */
+static int calc_dark(struct ubifs_info *c, int spc)
+{
+	ubifs_assert(!(spc & 7));
+
+	if (spc < c->dark_wm)
+		return spc;
+
+	/*
+	 * If we have slightly more space than the dark space watermark, we can
+	 * anyway safely assume we will be able to write a node of the smallest
+	 * size there.
+	 */
+	if (spc - c->dark_wm < MIN_WRITE_SZ)
+		return spc - MIN_WRITE_SZ;
+
+	return c->dark_wm;
+}
+
+/**
+ * is_lprops_dirty - determine if LEB properties are dirty.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to test
+ */
+static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+	struct ubifs_pnode *pnode;
+	int pos;
+
+	pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1);
+	pnode = (struct ubifs_pnode *)container_of(lprops - pos,
+						   struct ubifs_pnode,
+						   lprops[0]);
+	return !test_bit(COW_ZNODE, &pnode->flags) &&
+	       test_bit(DIRTY_CNODE, &pnode->flags);
+}
+
+/**
+ * ubifs_change_lp - change LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lp: LEB properties to change
+ * @free: new free space amount
+ * @dirty: new dirty space amount
+ * @flags: new flags
+ * @idx_gc_cnt: change to the count of idx_gc list
+ *
+ * This function changes LEB properties. This function does not change a LEB
+ * property (@free, @dirty or @flags) if the value passed is %LPROPS_NC.
+ *
+ * This function returns a pointer to the updated LEB properties on success
+ * and a negative error code on failure. N.B. the LEB properties may have had
+ * to be copied (due to COW) and consequently the pointer returned may not be
+ * the same as the pointer passed.
+ */
+const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
+					   const struct ubifs_lprops *lp,
+					   int free, int dirty, int flags,
+					   int idx_gc_cnt)
+{
+	/*
+	 * This is the only function that is allowed to change lprops, so we
+	 * discard the const qualifier.
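+	 *
+	 * The lprops are handed out as 'const' pointers (see for example the
+	 * 'ubifs_fast_find_*()' helpers) so that callers cannot modify them
+	 * directly and bypass the space accounting done below.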
+ */ + struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; + + dbg_lp("LEB %d, free %d, dirty %d, flags %d", + lprops->lnum, free, dirty, flags); + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + ubifs_assert(c->freeable_cnt >= 0); + ubifs_assert(c->freeable_cnt <= c->main_lebs); + ubifs_assert(c->lst.taken_empty_lebs >= 0); + ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs); + ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7)); + ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7)); + ubifs_assert(!(c->lst.total_used & 7)); + ubifs_assert(free == LPROPS_NC || free >= 0); + ubifs_assert(dirty == LPROPS_NC || dirty >= 0); + + if (!is_lprops_dirty(c, lprops)) { + lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum); + if (IS_ERR(lprops)) + return lprops; + } else + ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum)); + + ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); + + spin_lock(&c->space_lock); + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs -= 1; + + if (!(lprops->flags & LPROPS_INDEX)) { + int old_spc; + + old_spc = lprops->free + lprops->dirty; + if (old_spc < c->dead_wm) + c->lst.total_dead -= old_spc; + else + c->lst.total_dark -= calc_dark(c, old_spc); + + c->lst.total_used -= c->leb_size - old_spc; + } + + if (free != LPROPS_NC) { + free = ALIGN(free, 8); + c->lst.total_free += free - lprops->free; + + /* Increase or decrease empty LEBs counter if needed */ + if (free == c->leb_size) { + if (lprops->free != c->leb_size) + c->lst.empty_lebs += 1; + } else if (lprops->free == c->leb_size) + c->lst.empty_lebs -= 1; + lprops->free = free; + } + + if (dirty != LPROPS_NC) { + dirty = ALIGN(dirty, 8); + c->lst.total_dirty += dirty - lprops->dirty; + lprops->dirty = dirty; + } + + if (flags != LPROPS_NC) { + /* Take care about indexing LEBs counter if needed */ + if ((lprops->flags & LPROPS_INDEX)) { + if (!(flags & LPROPS_INDEX)) + c->lst.idx_lebs -= 1; + } else if (flags & LPROPS_INDEX) + c->lst.idx_lebs += 1; + lprops->flags = flags; + } + + if (!(lprops->flags & LPROPS_INDEX)) { + int new_spc; + + new_spc = lprops->free + lprops->dirty; + if (new_spc < c->dead_wm) + c->lst.total_dead += new_spc; + else + c->lst.total_dark += calc_dark(c, new_spc); + + c->lst.total_used += c->leb_size - new_spc; + } + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs += 1; + + change_category(c, lprops); + + c->idx_gc_cnt += idx_gc_cnt; + + spin_unlock(&c->space_lock); + + return lprops; +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. + */ +void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + + mutex_unlock(&c->lp_mutex); +} + +/** + * ubifs_get_lp_stats - get lprops statistics. + * @c: UBIFS file-system description object + * @st: return statistics + */ +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) +{ + spin_lock(&c->space_lock); + memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_change_one_lp - change LEB properties. 
+ * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space + * @flags_set: flags to set + * @flags_clean: flags to clean + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes properties of LEB @lnum. It is a helper wrapper over + * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the + * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and + * a negative error code in case of failure. + */ +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_update_one_lp - update LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space to add + * @flags_set: flags to set + * @flags_clean: flags to clean + * + * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to + * current dirty space, not substitutes it. + */ +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_read_one_lp - read LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to read properties for + * @lp: where to store read properties + * + * This helper function reads properties of a LEB @lnum and stores them in @lp. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) +{ + int err = 0; + const struct ubifs_lprops *lpp; + + ubifs_get_lprops(c); + + lpp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lpp)) { + err = PTR_ERR(lpp); + goto out; + } + + memcpy(lp, lpp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fast_find_free - try to find a LEB with free space quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a LEB with free space or %NULL if + * the function is unable to find a LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + heap = &c->lpt_heap[LPROPS_FREE - 1]; + if (heap->cnt == 0) + return NULL; + + lprops = heap->arr[0]; + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_fast_find_empty - try to find an empty LEB quickly. 
+ * @c: the UBIFS file-system description object + * + * This function returns LEB properties for an empty LEB or %NULL if the + * function is unable to find an empty LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->empty_list)) + return NULL; + + lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free == c->leb_size); + return lprops; +} + +/** + * ubifs_fast_find_freeable - try to find a freeable LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable LEB or %NULL if the + * function is unable to find a freeable LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->freeable_list)) + return NULL; + + lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(c->freeable_cnt > 0); + return lprops; +} + +/** + * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable index LEB or %NULL if the + * function is unable to find a freeable index LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->frdi_idx_list)) + return NULL; + + lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + return lprops; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_cats - check category heaps and lists. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
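+ *
+ * The empty, freeable and frdi_idx lists are checked against the invariants
+ * of their categories, the freeable and idx_gc counts are verified, and every
+ * entry of the category heaps is checked for a missing or misplaced pointer
+ * and for the %LPROPS_TAKEN flag.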
+ */ +int dbg_check_cats(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct list_head *pos; + int i, cat; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return 0; + + list_for_each_entry(lprops, &c->empty_list, list) { + if (lprops->free != c->leb_size) { + ubifs_err("non-empty LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + i = 0; + list_for_each_entry(lprops, &c->freeable_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + i += 1; + } + if (i != c->freeable_cnt) { + ubifs_err("freeable list count %d expected %d", i, + c->freeable_cnt); + return -EINVAL; + } + + i = 0; + list_for_each(pos, &c->idx_gc) + i += 1; + if (i != c->idx_gc_cnt) { + ubifs_err("idx_gc list count %d expected %d", i, + c->idx_gc_cnt); + return -EINVAL; + } + + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (!(lprops->flags & LPROPS_INDEX)) { + ubifs_err("non-index LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (!lprops) { + ubifs_err("null ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->hpos != i) { + ubifs_err("bad ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB in LPT heap cat %d", cat); + return -EINVAL; + } + } + } + + return 0; +} + +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos) +{ + int i = 0, j, err = 0; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return; + + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + struct ubifs_lprops *lp; + + if (i != add_pos) + if ((lprops->flags & LPROPS_CAT_MASK) != cat) { + err = 1; + goto out; + } + if (lprops->hpos != i) { + err = 2; + goto out; + } + lp = ubifs_lpt_lookup(c, lprops->lnum); + if (IS_ERR(lp)) { + err = 3; + goto out; + } + if (lprops != lp) { + dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); + err = 4; + goto out; + } + for (j = 0; j < i; j++) { + lp = heap->arr[j]; + if (lp == lprops) { + err = 5; + goto out; + } + if (lp->lnum == lprops->lnum) { + err = 6; + goto out; + } + } + } +out: + if (err) { + dbg_msg("failed 
cat %d hpos %d err %d", cat, i, err); + dbg_dump_stack(); + dbg_dump_heap(c, heap, cat); + } +} + +/** + * struct scan_check_data - data provided to scan callback function. + * @lst: LEB properties statistics + * @err: error code + */ +struct scan_check_data { + struct ubifs_lp_stats lst; + int err; +}; + +/** + * scan_check_cb - scan callback. + * @c: the UBIFS file-system description object + * @lp: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_check_cb(struct ubifs_info *c, + const struct ubifs_lprops *lp, int in_tree, + struct scan_check_data *data) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_lp_stats *lst = &data->lst; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; + + cat = lp->flags & LPROPS_CAT_MASK; + if (cat != LPROPS_UNCAT) { + cat = ubifs_categorize_lprops(c, lp); + if (cat != (lp->flags & LPROPS_CAT_MASK)) { + ubifs_err("bad LEB category %d expected %d", + (lp->flags & LPROPS_CAT_MASK), cat); + goto out; + } + } + + /* Check lp is on its category list (if it has one) */ + if (in_tree) { + struct list_head *list = NULL; + + switch (cat) { + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + } + if (list) { + struct ubifs_lprops *lprops; + int found = 0; + + list_for_each_entry(lprops, list, list) { + if (lprops == lp) { + found = 1; + break; + } + } + if (!found) { + ubifs_err("bad LPT list (category %d)", cat); + goto out; + } + } + } + + /* Check lp is on its category heap (if it has one) */ + if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || + lp != heap->arr[lp->hpos]) { + ubifs_err("bad LPT heap (category %d)", cat); + goto out; + } + } + + sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + if (IS_ERR(sleb)) { + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage. + */ + if (lp->free == c->leb_size) { + ubifs_err("scan errors were in empty LEB " + "- continuing checking"); + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + ubifs_err("scan errors were in freeable LEB " + "- continuing checking"); + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + data->err = PTR_ERR(sleb); + return LPT_SCAN_STOP; + } + + is_idx = -1; + list_for_each_entry(snod, &sleb->nodes, list) { + int found, level = 0; + + cond_resched(); + + if (is_idx == -1) + is_idx = (snod->type == UBIFS_IDX_NODE) ? 
1 : 0; + + if (is_idx && snod->type != UBIFS_IDX_NODE) { + ubifs_err("indexing node in data LEB %d:%d", + lnum, snod->offs); + goto out_destroy; + } + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_idx_node *idx = snod->node; + + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + } + + found = ubifs_tnc_has_node(c, &snod->key, level, lnum, + snod->offs, is_idx); + if (found) { + if (found < 0) + goto out_destroy; + used += ALIGN(snod->len, 8); + } + } + + free = c->leb_size - sleb->endpt; + dirty = sleb->endpt - used; + + if (free > c->leb_size || free < 0 || dirty > c->leb_size || + dirty < 0) { + ubifs_err("bad calculated accounting for LEB %d: " + "free %d, dirty %d", lnum, free, dirty); + goto out_destroy; + } + + if (lp->free + lp->dirty == c->leb_size && + free + dirty == c->leb_size) + if ((is_idx && !(lp->flags & LPROPS_INDEX)) || + (!is_idx && free == c->leb_size) || + lp->free == c->leb_size) { + /* + * Empty or freeable LEBs could contain index + * nodes from an uncompleted commit due to an + * unclean unmount. Or they could be empty for + * the same reason. Or it may simply not have been + * unmapped. + */ + free = lp->free; + dirty = lp->dirty; + is_idx = 0; + } + + if (is_idx && lp->free + lp->dirty == free + dirty && + lnum != c->ihead_lnum) { + /* + * After an unclean unmount, an index LEB could have a different + * amount of free space than the value recorded by lprops. That + * is because the in-the-gaps method may use free space or + * create free space (as a side-effect of using ubi_leb_change + * and not writing the whole LEB). The incorrect free space + * value is not a problem because the index is only ever + * allocated empty LEBs, so there will never be an attempt to + * write to the free space at the end of an index LEB - except + * by the in-the-gaps method for which it is not a problem. + */ + free = lp->free; + dirty = lp->dirty; + } + + if (lp->free != free || lp->dirty != dirty) + goto out_print; + + if (is_idx && !(lp->flags & LPROPS_INDEX)) { + if (free == c->leb_size) + /* Free but not unmapped LEB, it's fine */ + is_idx = 0; + else { + ubifs_err("indexing node without indexing " + "flag"); + goto out_print; + } + } + + if (!is_idx && (lp->flags & LPROPS_INDEX)) { + ubifs_err("data node with indexing flag"); + goto out_print; + } + + if (free == c->leb_size) + lst->empty_lebs += 1; + + if (is_idx) + lst->idx_lebs += 1; + + if (!(lp->flags & LPROPS_INDEX)) + lst->total_used += c->leb_size - free - dirty; + lst->total_free += free; + lst->total_dirty += dirty; + + if (!(lp->flags & LPROPS_INDEX)) { + int spc = free + dirty; + + if (spc < c->dead_wm) + lst->total_dead += spc; + else + lst->total_dark += calc_dark(c, spc); + } + + ubifs_scan_destroy(sleb); + + return LPT_SCAN_CONTINUE; + +out_print: + ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " + "should be free %d, dirty %d", + lnum, lp->free, lp->dirty, lp->flags, free, dirty); + dbg_dump_leb(c, lnum); +out_destroy: + ubifs_scan_destroy(sleb); +out: + data->err = -EINVAL; + return LPT_SCAN_STOP; +} + +/** + * dbg_check_lprops - check all LEB properties. + * @c: UBIFS file-system description object + * + * This function checks all LEB properties and makes sure they are all correct. + * It returns zero if everything is fine, %-EINVAL if there is an inconsistency + * and other negative error codes in case of other errors. This function is + * called while the file system is locked (because of commit start), so no + * additional locking is required. 
Note that locking the LPT mutex would cause + * a circular lock dependency with the TNC mutex. + */ +int dbg_check_lprops(struct ubifs_info *c) +{ + int i, err; + struct scan_check_data data; + struct ubifs_lp_stats *lst = &data.lst; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* + * As we are going to scan the media, the write buffers have to be + * synchronized. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + memset(lst, 0, sizeof(struct ubifs_lp_stats)); + + data.err = 0; + err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, + (ubifs_lpt_scan_callback)scan_check_cb, + &data); + if (err && err != -ENOSPC) + goto out; + if (data.err) { + err = data.err; + goto out; + } + + if (lst->empty_lebs != c->lst.empty_lebs || + lst->idx_lebs != c->lst.idx_lebs || + lst->total_free != c->lst.total_free || + lst->total_dirty != c->lst.total_dirty || + lst->total_used != c->lst.total_used) { + ubifs_err("bad overall accounting"); + ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + lst->empty_lebs, lst->idx_lebs, lst->total_free, + lst->total_dirty, lst->total_used); + ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " + "total_free %lld, total_dirty %lld, total_used %lld", + c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, + c->lst.total_dirty, c->lst.total_used); + err = -EINVAL; + goto out; + } + + if (lst->total_dead != c->lst.total_dead || + lst->total_dark != c->lst.total_dark) { + ubifs_err("bad dead/dark space accounting"); + ubifs_err("calculated: total_dead %lld, total_dark %lld", + lst->total_dead, lst->total_dark); + ubifs_err("read from lprops: total_dead %lld, total_dark %lld", + c->lst.total_dead, c->lst.total_dark); + err = -EINVAL; + goto out; + } + + err = dbg_check_cats(c); +out: + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c new file mode 100644 index 00000000000..9ff2463177e --- /dev/null +++ b/fs/ubifs/lpt.c @@ -0,0 +1,2243 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the LEB properties tree (LPT) area. The LPT area + * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and + * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits + * between the log and the orphan area. + * + * The LPT area is like a miniature self-contained file system. It is required + * that it never runs out of space, is fast to access and update, and scales + * logarithmically. The LEB properties tree is implemented as a wandering tree + * much like the TNC, and the LPT area has its own garbage collection. 
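+ *
+ * The LPT is read and written a node at a time: pnodes hold the LEB
+ * properties themselves, nnodes hold the branches that index them, and both
+ * are packed bit-for-bit (see 'pack_bits()' and 'ubifs_unpack_bits()') before
+ * being written out.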
+ *
+ * The LPT has two slightly different forms called the "small model" and the
+ * "big model". The small model is used when the entire LEB properties table
+ * can be written into a single eraseblock. In that case, garbage collection
+ * consists of just writing the whole table, which therefore makes all other
+ * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
+ * selected for garbage collection, which consists of marking the nodes in
+ * that LEB as dirty, and then only the dirty nodes are written out. Also, in
+ * the case of the big model, a table of LEB numbers is saved so that the
+ * entire LPT does not have to be scanned looking for empty eraseblocks when
+ * UBIFS is first mounted.
+ */
+
+#include <linux/crc16.h>
+#include "ubifs.h"
+
+/**
+ * do_calc_lpt_geom - calculate sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * Calculate the sizes of LPT bit fields, nodes, and tree, based on the
+ * properties of the flash and whether LPT is "big" (c->big_lpt).
+ */
+static void do_calc_lpt_geom(struct ubifs_info *c)
+{
+	int i, n, bits, per_leb_wastage, max_pnode_cnt;
+	long long sz, tot_wastage;
+
+	n = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
+	max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
+
+	c->lpt_hght = 1;
+	n = UBIFS_LPT_FANOUT;
+	while (n < max_pnode_cnt) {
+		c->lpt_hght += 1;
+		n <<= UBIFS_LPT_FANOUT_SHIFT;
+	}
+
+	c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+
+	n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
+	c->nnode_cnt = n;
+	for (i = 1; i < c->lpt_hght; i++) {
+		n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
+		c->nnode_cnt += n;
+	}
+
+	c->space_bits = fls(c->leb_size) - 3;
+	c->lpt_lnum_bits = fls(c->lpt_lebs);
+	c->lpt_offs_bits = fls(c->leb_size - 1);
+	c->lpt_spc_bits = fls(c->leb_size);
+
+	n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT);
+	c->pcnt_bits = fls(n - 1);
+
+	c->lnum_bits = fls(c->max_leb_cnt - 1);
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       (c->big_lpt ? c->pcnt_bits : 0) +
+	       (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT;
+	c->pnode_sz = (bits + 7) / 8;
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       (c->big_lpt ? c->pcnt_bits : 0) +
+	       (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT;
+	c->nnode_sz = (bits + 7) / 8;
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       c->lpt_lebs * c->lpt_spc_bits * 2;
+	c->ltab_sz = (bits + 7) / 8;
+
+	bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+	       c->lnum_bits * c->lsave_cnt;
+	c->lsave_sz = (bits + 7) / 8;
+
+	/* Calculate the minimum LPT size */
+	c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+	c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+	c->lpt_sz += c->ltab_sz;
+	c->lpt_sz += c->lsave_sz;
+
+	/* Add wastage */
+	sz = c->lpt_sz;
+	per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz);
+	sz += per_leb_wastage;
+	tot_wastage = per_leb_wastage;
+	while (sz > c->leb_size) {
+		sz += per_leb_wastage;
+		sz -= c->leb_size;
+		tot_wastage += per_leb_wastage;
+	}
+	tot_wastage += ALIGN(sz, c->min_io_size) - sz;
+	c->lpt_sz += tot_wastage;
+}
+
+/**
+ * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
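+ *
+ * The check requires that the configured LPT LEBs can hold at least twice the
+ * calculated LPT size, and that ltab fits into a single LEB.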
+ */ +int ubifs_calc_lpt_geom(struct ubifs_info *c) +{ + int lebs_needed; + uint64_t sz; + + do_calc_lpt_geom(c); + + /* Verify that lpt_lebs is big enough */ + sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ + sz += c->leb_size - 1; + do_div(sz, c->leb_size); + lebs_needed = sz; + if (lebs_needed > c->lpt_lebs) { + ubifs_err("too few LPT LEBs"); + return -EINVAL; + } + + /* Verify that ltab fits in a single LEB (since ltab is a single node */ + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + + c->check_lpt_free = c->big_lpt; + + return 0; +} + +/** + * calc_dflt_lpt_geom - calculate default LPT geometry. + * @c: the UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @big_lpt: whether the LPT area is "big" is returned here + * + * The size of the LPT area depends on parameters that themselves are dependent + * on the size of the LPT area. This function, successively recalculates the LPT + * area geometry until the parameters and resultant geometry are consistent. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, + int *big_lpt) +{ + int i, lebs_needed; + uint64_t sz; + + /* Start by assuming the minimum number of LPT LEBs */ + c->lpt_lebs = UBIFS_MIN_LPT_LEBS; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + + /* And assume we will use the small LPT model */ + c->big_lpt = 0; + + /* + * Calculate the geometry based on assumptions above and then see if it + * makes sense + */ + do_calc_lpt_geom(c); + + /* Small LPT model must have lpt_sz < leb_size */ + if (c->lpt_sz > c->leb_size) { + /* Nope, so try again using big LPT model */ + c->big_lpt = 1; + do_calc_lpt_geom(c); + } + + /* Now check there are enough LPT LEBs */ + for (i = 0; i < 64 ; i++) { + sz = c->lpt_sz * 4; /* Allow 4 times the size */ + sz += c->leb_size - 1; + do_div(sz, c->leb_size); + lebs_needed = sz; + if (lebs_needed > c->lpt_lebs) { + /* Not enough LPT LEBs so try again with more */ + c->lpt_lebs = lebs_needed; + c->main_lebs = *main_lebs - c->lpt_lebs; + if (c->main_lebs <= 0) + return -EINVAL; + do_calc_lpt_geom(c); + continue; + } + if (c->ltab_sz > c->leb_size) { + ubifs_err("LPT ltab too big"); + return -EINVAL; + } + *main_lebs = c->main_lebs; + *big_lpt = c->big_lpt; + return 0; + } + return -EINVAL; +} + +/** + * pack_bits - pack bit fields end-to-end. 
+ * @addr: address at which to pack (passed and next address returned) + * @pos: bit position at which to pack (passed and next position returned) + * @val: value to pack + * @nrbits: number of bits of value to pack (1-32) + */ +static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits) +{ + uint8_t *p = *addr; + int b = *pos; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + ubifs_assert((val >> nrbits) == 0 || nrbits == 32); + if (b) { + *p |= ((uint8_t)val) << b; + nrbits += b; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= (8 - b)); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 32) + *++p = (uint8_t)(val >>= 8); + } + } + } + } else { + *p = (uint8_t)val; + if (nrbits > 8) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 16) { + *++p = (uint8_t)(val >>= 8); + if (nrbits > 24) + *++p = (uint8_t)(val >>= 8); + } + } + } + b = nrbits & 7; + if (b == 0) + p++; + *addr = p; + *pos = b; +} + +/** + * ubifs_unpack_bits - unpack bit fields. + * @addr: address at which to unpack (passed and next address returned) + * @pos: bit position at which to unpack (passed and next position returned) + * @nrbits: number of bits of value to unpack (1-32) + * + * This functions returns the value unpacked. + */ +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) +{ + const int k = 32 - nrbits; + uint8_t *p = *addr; + int b = *pos; + uint32_t val; + + ubifs_assert(nrbits > 0); + ubifs_assert(nrbits <= 32); + ubifs_assert(*pos >= 0); + ubifs_assert(*pos < 8); + if (b) { + val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + val <<= (8 - b); + val |= *p >> b; + nrbits += b; + } else + val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + val <<= k; + val >>= k; + b = nrbits & 7; + p += nrbits / 8; + *addr = p; + *pos = b; + ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); + return val; +} + +/** + * ubifs_pack_pnode - pack all the bit fields of a pnode. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @pnode: pnode to pack + */ +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, pnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + pack_bits(&addr, &pos, pnode->lprops[i].free >> 3, + c->space_bits); + pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3, + c->space_bits); + if (pnode->lprops[i].flags & LPROPS_INDEX) + pack_bits(&addr, &pos, 1, 1); + else + pack_bits(&addr, &pos, 0, 1); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->pnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_nnode - pack all the bit fields of a nnode. 
+ * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @nnode: nnode to pack + */ +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS); + if (c->big_lpt) + pack_bits(&addr, &pos, nnode->num, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + + if (lnum == 0) + lnum = c->lpt_last + 1; + pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits); + pack_bits(&addr, &pos, nnode->nbranch[i].offs, + c->lpt_offs_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->nnode_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_ltab - pack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @ltab: LPT's own lprops table to pack + */ +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lpt_lebs; i++) { + pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits); + pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits); + } + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->ltab_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_pack_lsave - pack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer into which to pack + * @lsave: LPT's save table to pack + */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0; + uint16_t crc; + + pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS); + for (i = 0; i < c->lsave_cnt; i++) + pack_bits(&addr, &pos, lsave[i], c->lnum_bits); + crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + c->lsave_sz - UBIFS_LPT_CRC_BYTES); + addr = buf; + pos = 0; + pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS); +} + +/** + * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number to which to add dirty space + * @dirty: amount of dirty space to add + */ +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + if (!dirty || !lnum) + return; + dbg_lp("LEB %d add %d to %d", + lnum, dirty, c->ltab[lnum - c->lpt_first].dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * set_ltab - set LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space + */ +static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d %d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty = dirty; +} + +/** + * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties. 
+ * @c: UBIFS file-system description object + * @nnode: nnode for which to add dirt + */ +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *np = nnode->parent; + + if (np) + ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum, + c->nnode_sz); + else { + ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz); + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + } +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * calc_nnode_num - calculate nnode number. + * @row: the row in the tree (root is zero) + * @col: the column in the row (leftmost is zero) + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number for the nnode at @row + * and @col. + */ +static int calc_nnode_num(int row, int col) +{ + int num, bits; + + num = 1; + while (row--) { + bits = (col & (UBIFS_LPT_FANOUT - 1)); + col >>= UBIFS_LPT_FANOUT_SHIFT; + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= bits; + } + return num; +} + +/** + * calc_nnode_num_from_parent - calculate nnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The nnode number is a number that uniquely identifies a nnode and can be used + * easily to traverse the tree from the root to that nnode. + * + * This function calculates and returns the nnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_nnode_num_from_parent(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int num, shft; + + if (!parent) + return 1; + shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT; + num = parent->num ^ (1 << shft); + num |= (UBIFS_LPT_FANOUT + iip) << shft; + return num; +} + +/** + * calc_pnode_num_from_parent - calculate pnode number. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * The pnode number is a number that uniquely identifies a pnode and can be used + * easily to traverse the tree from the root to that pnode. + * + * This function calculates and returns the pnode number based on the parent's + * nnode number and the index in parent. + */ +static int calc_pnode_num_from_parent(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; + + for (i = 0; i < n; i++) { + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= pnum & (UBIFS_LPT_FANOUT - 1); + pnum >>= UBIFS_LPT_FANOUT_SHIFT; + } + num <<= UBIFS_LPT_FANOUT_SHIFT; + num |= iip; + return num; +} + +/** + * ubifs_create_dflt_lpt - create default LPT. + * @c: UBIFS file-system description object + * @main_lebs: number of main area LEBs is passed and returned here + * @lpt_first: LEB number of first LPT LEB + * @lpt_lebs: number of LEBs for LPT is passed and returned here + * @big_lpt: use big LPT model is passed and returned here + * + * This function returns %0 on success and a negative error code on failure. 
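+ *
+ * The default LPT is created by packing the pnodes, then the nnodes level by
+ * level up to the root, then (for the big model) the lsave table, and finally
+ * the LPT's own lprops table (ltab), into the LPT LEBs.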
+ */ +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt) +{ + int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row; + int blnum, boffs, bsz, bcnt; + struct ubifs_pnode *pnode = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = NULL, *p; + struct ubifs_lpt_lprops *ltab = NULL; + int *lsave = NULL; + + err = calc_dflt_lpt_geom(c, main_lebs, big_lpt); + if (err) + return err; + *lpt_lebs = c->lpt_lebs; + + /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */ + c->lpt_first = lpt_first; + /* Needed by 'set_ltab()' */ + c->lpt_last = lpt_first + c->lpt_lebs - 1; + /* Needed by 'ubifs_pack_lsave()' */ + c->main_first = c->leb_cnt - *main_lebs; + + lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL); + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL); + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL); + buf = vmalloc(c->leb_size); + ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!pnode || !nnode || !buf || !ltab || !lsave) { + err = -ENOMEM; + goto out; + } + + ubifs_assert(!c->ltab); + c->ltab = ltab; /* Needed by set_ltab */ + + /* Initialize LPT's own lprops */ + for (i = 0; i < c->lpt_lebs; i++) { + ltab[i].free = c->leb_size; + ltab[i].dirty = 0; + ltab[i].tgc = 0; + ltab[i].cmt = 0; + } + + lnum = lpt_first; + p = buf; + /* Number of leaf nodes (pnodes) */ + cnt = c->pnode_cnt; + + /* + * The first pnode contains the LEB properties for the LEBs that contain + * the root inode node and the root index node of the index tree. + */ + node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8); + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[0].free = c->leb_size - iopos; + pnode->lprops[0].dirty = iopos - node_sz; + pnode->lprops[0].flags = LPROPS_INDEX; + + node_sz = UBIFS_INO_NODE_SZ; + iopos = ALIGN(node_sz, c->min_io_size); + pnode->lprops[1].free = c->leb_size - iopos; + pnode->lprops[1].dirty = iopos - node_sz; + + for (i = 2; i < UBIFS_LPT_FANOUT; i++) + pnode->lprops[i].free = c->leb_size; + + /* Add first pnode */ + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len = c->pnode_sz; + pnode->num += 1; + + /* Reset pnode values for remaining pnodes */ + pnode->lprops[0].free = c->leb_size; + pnode->lprops[0].dirty = 0; + pnode->lprops[0].flags = 0; + + pnode->lprops[1].free = c->leb_size; + pnode->lprops[1].dirty = 0; + + /* + * To calculate the internal node branches, we keep information about + * the level below. + */ + blnum = lnum; /* LEB number of level below */ + boffs = 0; /* Offset of level below */ + bcnt = cnt; /* Number of nodes in level below */ + bsz = c->pnode_sz; /* Size of nodes in level below */ + + /* Add all remaining pnodes */ + for (i = 1; i < cnt; i++) { + if (len + c->pnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + ubifs_pack_pnode(c, p, pnode); + p += c->pnode_sz; + len += c->pnode_sz; + /* + * pnodes are simply numbered left to right starting at zero, + * which means the pnode number can be used easily to traverse + * down the tree to the corresponding pnode. 
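+		 * Each UBIFS_LPT_FANOUT_SHIFT-bit group of the number, taken
+		 * from the most significant end, selects which branch to
+		 * follow at the next level down.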
+ */ + pnode->num += 1; + } + + row = 0; + for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT) + row += 1; + /* Add all nnodes, one level at a time */ + while (1) { + /* Number of internal nodes (nnodes) at next level */ + cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + if (len + c->nnode_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, + alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) { + c->lpt_lnum = lnum; + c->lpt_offs = len; + } + /* Set branches to the level below */ + for (j = 0; j < UBIFS_LPT_FANOUT; j++) { + if (bcnt) { + if (boffs + bsz > c->leb_size) { + blnum += 1; + boffs = 0; + } + nnode->nbranch[j].lnum = blnum; + nnode->nbranch[j].offs = boffs; + boffs += bsz; + bcnt--; + } else { + nnode->nbranch[j].lnum = 0; + nnode->nbranch[j].offs = 0; + } + } + nnode->num = calc_nnode_num(row, i); + ubifs_pack_nnode(c, p, nnode); + p += c->nnode_sz; + len += c->nnode_sz; + } + /* Only 1 nnode at this level, so it is the root */ + if (cnt == 1) + break; + /* Update the information about the level below */ + bcnt = cnt; + bsz = c->nnode_sz; + row -= 1; + } + + if (*big_lpt) { + /* Need to add LPT's save table */ + if (len + c->lsave_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, + UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->lsave_lnum = lnum; + c->lsave_offs = len; + + for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++) + lsave[i] = c->main_first + i; + for (; i < c->lsave_cnt; i++) + lsave[i] = c->main_first; + + ubifs_pack_lsave(c, p, lsave); + p += c->lsave_sz; + len += c->lsave_sz; + } + + /* Need to add LPT's own LEB properties table */ + if (len + c->ltab_sz > c->leb_size) { + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + p = buf; + len = 0; + } + + c->ltab_lnum = lnum; + c->ltab_offs = len; + + /* Update ltab before packing it */ + len += c->ltab_sz; + alen = ALIGN(len, c->min_io_size); + set_ltab(c, lnum, c->leb_size - alen, alen - len); + + ubifs_pack_ltab(c, p, ltab); + p += c->ltab_sz; + + /* Write remaining buffer */ + memset(p, 0xff, alen - len); + err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); + if (err) + goto out; + + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(len, c->min_io_size); + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, 
c->lsave_offs); +out: + c->ltab = NULL; + kfree(lsave); + vfree(ltab); + vfree(buf); + kfree(nnode); + kfree(pnode); + return err; +} + +/** + * update_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * When a pnode is loaded into memory, the LEB properties it contains are added, + * by this function, to the LEB category lists and heaps. + */ +static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK; + int lnum = pnode->lprops[i].lnum; + + if (!lnum) + return; + ubifs_add_to_cat(c, &pnode->lprops[i], cat); + } +} + +/** + * replace_cats - add LEB properties of a pnode to LEB category lists and heaps. + * @c: UBIFS file-system description object + * @old_pnode: pnode copied + * @new_pnode: pnode copy + * + * During commit it is sometimes necessary to copy a pnode + * (see dirty_cow_pnode). When that happens, references in + * category lists and heaps must be replaced. This function does that. + */ +static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode, + struct ubifs_pnode *new_pnode) +{ + int i; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (!new_pnode->lprops[i].lnum) + return; + ubifs_replace_cat(c, &old_pnode->lprops[i], + &new_pnode->lprops[i]); + } +} + +/** + * check_lpt_crc - check LPT node crc is correct. + * @c: UBIFS file-system description object + * @buf: buffer containing node + * @len: length of node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_crc(void *buf, int len) +{ + int pos = 0; + uint8_t *addr = buf; + uint16_t crc, calc_crc; + + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) { + ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, + calc_crc); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * check_lpt_type - check LPT node type is correct. + * @c: UBIFS file-system description object + * @addr: address of type bit field is passed and returned updated here + * @pos: position of type bit field is passed and returned updated here + * @type: expected type + * + * This function returns %0 on success and a negative error code on failure. + */ +static int check_lpt_type(uint8_t **addr, int *pos, int type) +{ + int node_type; + + node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS); + if (node_type != type) { + ubifs_err("invalid type (%d) in LPT node type %d", node_type, + type); + dbg_dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * unpack_pnode - unpack a pnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed pnode to unpack + * @pnode: pnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. 
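+ *
+ * Free and dirty space are stored in the pnode divided by 8, so they are
+ * shifted back up by 3 bits when unpacked, and each LEB's category is then
+ * recalculated with 'ubifs_categorize_lprops()'.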
+ */ +static int unpack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE); + if (err) + return err; + if (c->big_lpt) + pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->free <<= 3; + lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits); + lprops->dirty <<= 3; + + if (ubifs_unpack_bits(&addr, &pos, 1)) + lprops->flags = LPROPS_INDEX; + else + lprops->flags = 0; + lprops->flags |= ubifs_categorize_lprops(c, lprops); + } + err = check_lpt_crc(buf, c->pnode_sz); + return err; +} + +/** + * unpack_nnode - unpack a nnode. + * @c: UBIFS file-system description object + * @buf: buffer containing packed nnode to unpack + * @nnode: nnode structure to fill + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE); + if (err) + return err; + if (c->big_lpt) + nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum; + + lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) + + c->lpt_first; + if (lnum == c->lpt_last + 1) + lnum = 0; + nnode->nbranch[i].lnum = lnum; + nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos, + c->lpt_offs_bits); + } + err = check_lpt_crc(buf, c->nnode_sz); + return err; +} + +/** + * unpack_ltab - unpack the LPT's own lprops table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_ltab(struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB); + if (err) + return err; + for (i = 0; i < c->lpt_lebs; i++) { + int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits); + + if (free < 0 || free > c->leb_size || dirty < 0 || + dirty > c->leb_size || free + dirty > c->leb_size) + return -EINVAL; + + c->ltab[i].free = free; + c->ltab[i].dirty = dirty; + c->ltab[i].tgc = 0; + c->ltab[i].cmt = 0; + } + err = check_lpt_crc(buf, c->ltab_sz); + return err; +} + +/** + * unpack_lsave - unpack the LPT's save table. + * @c: UBIFS file-system description object + * @buf: buffer from which to unpack + * + * This function returns %0 on success and a negative error code on failure. + */ +static int unpack_lsave(struct ubifs_info *c, void *buf) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int i, pos = 0, err; + + err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE); + if (err) + return err; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits); + + if (lnum < c->main_first || lnum >= c->leb_cnt) + return -EINVAL; + c->lsave[i] = lnum; + } + err = check_lpt_crc(buf, c->lsave_sz); + return err; +} + +/** + * validate_nnode - validate a nnode. 
+ * @c: UBIFS file-system description object + * @nnode: nnode to validate + * @parent: parent nnode (or NULL for the root nnode) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, + struct ubifs_nnode *parent, int iip) +{ + int i, lvl, max_offs; + + if (c->big_lpt) { + int num = calc_nnode_num_from_parent(c, parent, iip); + + if (nnode->num != num) + return -EINVAL; + } + lvl = parent ? parent->level - 1 : c->lpt_hght; + if (lvl < 1) + return -EINVAL; + if (lvl == 1) + max_offs = c->leb_size - c->pnode_sz; + else + max_offs = c->leb_size - c->nnode_sz; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int lnum = nnode->nbranch[i].lnum; + int offs = nnode->nbranch[i].offs; + + if (lnum == 0) { + if (offs != 0) + return -EINVAL; + continue; + } + if (lnum < c->lpt_first || lnum > c->lpt_last) + return -EINVAL; + if (offs < 0 || offs > max_offs) + return -EINVAL; + } + return 0; +} + +/** + * validate_pnode - validate a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to validate + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + if (c->big_lpt) { + int num = calc_pnode_num_from_parent(c, parent, iip); + + if (pnode->num != num) + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + int free = pnode->lprops[i].free; + int dirty = pnode->lprops[i].dirty; + + if (free < 0 || free > c->leb_size || free % c->min_io_size || + (free & 7)) + return -EINVAL; + if (dirty < 0 || dirty > c->leb_size || (dirty & 7)) + return -EINVAL; + if (dirty + free > c->leb_size) + return -EINVAL; + } + return 0; +} + +/** + * set_pnode_lnum - set LEB numbers on a pnode. + * @c: UBIFS file-system description object + * @pnode: pnode to update + * + * This function calculates the LEB numbers for the LEB properties it contains + * based on the pnode number. + */ +static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + int i, lnum; + + lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (lnum >= c->leb_cnt) + return; + pnode->lprops[i].lnum = lnum++; + } +} + +/** + * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch = NULL; + struct ubifs_nnode *nnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + if (parent) { + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + } else { + lnum = c->lpt_lnum; + offs = c->lpt_offs; + } + nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + if (lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. 
+ */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); + if (err) + goto out; + err = unpack_nnode(c, buf, nnode); + if (err) + goto out; + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + if (parent) { + branch->nnode = nnode; + nnode->level = parent->level - 1; + } else { + c->nroot = nnode; + nnode->level = c->lpt_hght; + } + nnode->parent = parent; + nnode->iip = iip; + return 0; + +out: + ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + kfree(nnode); + return err; +} + +/** + * read_pnode - read a pnode from flash and link it to the tree in memory. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode = NULL; + void *buf = c->lpt_nod_buf; + int err, lnum, offs; + + branch = &parent->nbranch[iip]; + lnum = branch->lnum; + offs = branch->offs; + pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (!pnode) { + err = -ENOMEM; + goto out; + } + if (lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz); + if (err) + goto out; + err = unpack_pnode(c, buf, pnode); + if (err) + goto out; + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + goto out; + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + branch->pnode = pnode; + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + c->pnodes_have += 1; + return 0; + +out: + ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); + dbg_dump_pnode(c, pnode, parent, iip); + dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + kfree(pnode); + return err; +} + +/** + * read_ltab - read LPT's own lprops table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int read_ltab(struct ubifs_info *c) +{ + int err; + void *buf; + + buf = vmalloc(c->ltab_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); + if (err) + goto out; + err = unpack_ltab(c, buf); +out: + vfree(buf); + return err; +} + +/** + * read_lsave - read LPT's save table. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int read_lsave(struct ubifs_info *c) +{ + int err, i; + void *buf; + + buf = vmalloc(c->lsave_sz); + if (!buf) + return -ENOMEM; + err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz); + if (err) + goto out; + err = unpack_lsave(c, buf); + if (err) + goto out; + for (i = 0; i < c->lsave_cnt; i++) { + int lnum = c->lsave[i]; + + /* + * Due to automatic resizing, the values in the lsave table + * could be beyond the volume size - just ignore them. + */ + if (lnum >= c->leb_cnt) + continue; + ubifs_lpt_lookup(c, lnum); + } +out: + vfree(buf); + return err; +} + +/** + * ubifs_get_nnode - get a nnode. + * @c: UBIFS file-system description object + * @parent: parent nnode (or NULL for the root) + * @iip: index in parent + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) + return nnode; + err = ubifs_read_nnode(c, parent, iip); + if (err) + return ERR_PTR(err); + return branch->nnode; +} + +/** + * ubifs_get_pnode - get a pnode. + * @c: UBIFS file-system description object + * @parent: parent nnode + * @iip: index in parent + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) + return pnode; + err = read_pnode(c, parent, iip); + if (err) + return ERR_PTR(err); + update_cats(c, branch->pnode); + return branch->pnode; +} + +/** + * ubifs_lpt_lookup - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. + */ +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_PTR(PTR_ERR(pnode)); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + return &pnode->lprops[iip]; +} + +/** + * dirty_cow_nnode - ensure a nnode is not being committed. + * @c: UBIFS file-system description object + * @nnode: nnode to check + * + * Returns dirtied nnode on success or negative error code on failure. 
+ */ +static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode) +{ + struct ubifs_nnode *n; + int i; + + if (!test_bit(COW_CNODE, &nnode->flags)) { + /* nnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + } + return nnode; + } + + /* nnode is being committed, so copy it */ + n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS); + if (unlikely(!n)) + return ERR_PTR(-ENOMEM); + + memcpy(n, nnode, sizeof(struct ubifs_nnode)); + n->cnext = NULL; + __set_bit(DIRTY_CNODE, &n->flags); + __clear_bit(COW_CNODE, &n->flags); + + /* The children now have new parent */ + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_nbranch *branch = &n->nbranch[i]; + + if (branch->cnode) + branch->cnode->parent = n; + } + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags)); + __set_bit(OBSOLETE_CNODE, &nnode->flags); + + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + if (nnode->parent) + nnode->parent->nbranch[n->iip].nnode = n; + else + c->nroot = n; + return n; +} + +/** + * dirty_cow_pnode - ensure a pnode is not being committed. + * @c: UBIFS file-system description object + * @pnode: pnode to check + * + * Returns dirtied pnode on success or negative error code on failure. + */ +static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_pnode *p; + + if (!test_bit(COW_CNODE, &pnode->flags)) { + /* pnode is not being committed */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + } + return pnode; + } + + /* pnode is being committed, so copy it */ + p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS); + if (unlikely(!p)) + return ERR_PTR(-ENOMEM); + + memcpy(p, pnode, sizeof(struct ubifs_pnode)); + p->cnext = NULL; + __set_bit(DIRTY_CNODE, &p->flags); + __clear_bit(COW_CNODE, &p->flags); + replace_cats(c, pnode, p); + + ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags)); + __set_bit(OBSOLETE_CNODE, &pnode->flags); + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + pnode->parent->nbranch[p->iip].pnode = p; + return p; +} + +/** + * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT. + * @c: UBIFS file-system description object + * @lnum: LEB number to lookup + * + * This function returns a pointer to the LEB properties on success or a + * negative error code on failure. 
+ */ +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) +{ + int err, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + i = lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + nnode = dirty_cow_nnode(c, nnode); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = ubifs_get_pnode(c, nnode, iip); + if (IS_ERR(pnode)) + return ERR_PTR(PTR_ERR(pnode)); + pnode = dirty_cow_pnode(c, pnode); + if (IS_ERR(pnode)) + return ERR_PTR(PTR_ERR(pnode)); + iip = (i & (UBIFS_LPT_FANOUT - 1)); + dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, + pnode->lprops[iip].free, pnode->lprops[iip].dirty, + pnode->lprops[iip].flags); + ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags)); + return &pnode->lprops[iip]; +} + +/** + * lpt_init_rd - initialize the LPT for reading. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_init_rd(struct ubifs_info *c) +{ + int err, i; + + c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab) + return -ENOMEM; + + i = max_t(int, c->nnode_sz, c->pnode_sz); + c->lpt_nod_buf = kmalloc(i, GFP_KERNEL); + if (!c->lpt_nod_buf) + return -ENOMEM; + + for (i = 0; i < LPROPS_HEAP_CNT; i++) { + c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, + GFP_KERNEL); + if (!c->lpt_heap[i].arr) + return -ENOMEM; + c->lpt_heap[i].cnt = 0; + c->lpt_heap[i].max_cnt = LPT_HEAP_SZ; + } + + c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL); + if (!c->dirty_idx.arr) + return -ENOMEM; + c->dirty_idx.cnt = 0; + c->dirty_idx.max_cnt = LPT_HEAP_SZ; + + err = read_ltab(c); + if (err) + return err; + + dbg_lp("space_bits %d", c->space_bits); + dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits); + dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits); + dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits); + dbg_lp("pcnt_bits %d", c->pcnt_bits); + dbg_lp("lnum_bits %d", c->lnum_bits); + dbg_lp("pnode_sz %d", c->pnode_sz); + dbg_lp("nnode_sz %d", c->nnode_sz); + dbg_lp("ltab_sz %d", c->ltab_sz); + dbg_lp("lsave_sz %d", c->lsave_sz); + dbg_lp("lsave_cnt %d", c->lsave_cnt); + dbg_lp("lpt_hght %d", c->lpt_hght); + dbg_lp("big_lpt %d", c->big_lpt); + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + + return 0; +} + +/** + * lpt_init_wr - initialize the LPT for writing. + * @c: UBIFS file-system description object + * + * 'lpt_init_rd()' must have been called already. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int lpt_init_wr(struct ubifs_info *c) +{ + int err, i; + + c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + if (!c->ltab_cmt) + return -ENOMEM; + + c->lpt_buf = vmalloc(c->leb_size); + if (!c->lpt_buf) + return -ENOMEM; + + if (c->big_lpt) { + c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS); + if (!c->lsave) + return -ENOMEM; + err = read_lsave(c); + if (err) + return err; + } + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].free == c->leb_size) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + } + + return 0; +} + +/** + * ubifs_lpt_init - initialize the LPT. + * @c: UBIFS file-system description object + * @rd: whether to initialize lpt for reading + * @wr: whether to initialize lpt for writing + * + * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true + * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is + * true. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) +{ + int err; + + if (rd) { + err = lpt_init_rd(c); + if (err) + return err; + } + + if (wr) { + err = lpt_init_wr(c); + if (err) + return err; + } + + return 0; +} + +/** + * struct lpt_scan_node - somewhere to put nodes while we scan LPT. + * @nnode: where to keep a nnode + * @pnode: where to keep a pnode + * @cnode: where to keep a cnode + * @in_tree: is the node in the tree in memory + * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in + * the tree + * @ptr.pnode: ditto for pnode + * @ptr.cnode: ditto for cnode + */ +struct lpt_scan_node { + union { + struct ubifs_nnode nnode; + struct ubifs_pnode pnode; + struct ubifs_cnode cnode; + }; + int in_tree; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + } ptr; +}; + +/** + * scan_get_nnode - for the scan, get a nnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the nnode + * @parent: parent of the nnode + * @iip: index in parent of the nnode + * + * This function returns a pointer to the nnode on success or a negative error + * code on failure. + */ +static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_nnode *nnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + nnode = branch->nnode; + if (nnode) { + path->in_tree = 1; + path->ptr.nnode = nnode; + return nnode; + } + nnode = &path->nnode; + path->in_tree = 0; + path->ptr.nnode = nnode; + memset(nnode, 0, sizeof(struct ubifs_nnode)); + if (branch->lnum == 0) { + /* + * This nnode was not written which just means that the LEB + * properties in the subtree below it describe empty LEBs. We + * make the nnode as though we had read it, which in fact means + * doing almost nothing. 
+ */ + if (c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + } else { + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->nnode_sz); + if (err) + return ERR_PTR(err); + err = unpack_nnode(c, buf, nnode); + if (err) + return ERR_PTR(err); + } + err = validate_nnode(c, nnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + nnode->num = calc_nnode_num_from_parent(c, parent, iip); + nnode->level = parent->level - 1; + nnode->parent = parent; + nnode->iip = iip; + return nnode; +} + +/** + * scan_get_pnode - for the scan, get a pnode from either the tree or flash. + * @c: the UBIFS file-system description object + * @path: where to put the pnode + * @parent: parent of the pnode + * @iip: index in parent of the pnode + * + * This function returns a pointer to the pnode on success or a negative error + * code on failure. + */ +static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, + struct lpt_scan_node *path, + struct ubifs_nnode *parent, int iip) +{ + struct ubifs_nbranch *branch; + struct ubifs_pnode *pnode; + void *buf = c->lpt_nod_buf; + int err; + + branch = &parent->nbranch[iip]; + pnode = branch->pnode; + if (pnode) { + path->in_tree = 1; + path->ptr.pnode = pnode; + return pnode; + } + pnode = &path->pnode; + path->in_tree = 0; + path->ptr.pnode = pnode; + memset(pnode, 0, sizeof(struct ubifs_pnode)); + if (branch->lnum == 0) { + /* + * This pnode was not written which just means that the LEB + * properties in it describe empty LEBs. We make the pnode as + * though we had read it. + */ + int i; + + if (c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops * const lprops = &pnode->lprops[i]; + + lprops->free = c->leb_size; + lprops->flags = ubifs_categorize_lprops(c, lprops); + } + } else { + ubifs_assert(branch->lnum >= c->lpt_first && + branch->lnum <= c->lpt_last); + ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); + err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, + c->pnode_sz); + if (err) + return ERR_PTR(err); + err = unpack_pnode(c, buf, pnode); + if (err) + return ERR_PTR(err); + } + err = validate_pnode(c, pnode, parent, iip); + if (err) + return ERR_PTR(err); + if (!c->big_lpt) + pnode->num = calc_pnode_num_from_parent(c, parent, iip); + pnode->parent = parent; + pnode->iip = iip; + set_pnode_lnum(c, pnode); + return pnode; +} + +/** + * ubifs_lpt_scan_nolock - scan the LPT. + * @c: the UBIFS file-system description object + * @start_lnum: LEB number from which to start scanning + * @end_lnum: LEB number at which to stop scanning + * @scan_cb: callback function called for each lprops + * @data: data to be passed to the callback function + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data) +{ + int err = 0, i, h, iip, shft; + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct lpt_scan_node *path; + + if (start_lnum == -1) { + start_lnum = end_lnum + 1; + if (start_lnum >= c->leb_cnt) + start_lnum = c->main_first; + } + + ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt); + ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt); + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + + path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1), + GFP_NOFS); + if (!path) + return -ENOMEM; + + path[0].ptr.nnode = c->nroot; + path[0].in_tree = 1; +again: + /* Descend to the pnode containing start_lnum */ + nnode = c->nroot; + i = start_lnum - c->main_first; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = (i & (UBIFS_LPT_FANOUT - 1)); + + /* Loop for each lprops */ + while (1) { + struct ubifs_lprops *lprops = &pnode->lprops[iip]; + int ret, lnum = lprops->lnum; + + ret = scan_cb(c, lprops, path[h].in_tree, data); + if (ret < 0) { + err = ret; + goto out; + } + if (ret & LPT_SCAN_ADD) { + /* Add all the nodes in path to the tree in memory */ + for (h = 1; h < c->lpt_hght; h++) { + const size_t sz = sizeof(struct ubifs_nnode); + struct ubifs_nnode *parent; + + if (path[h].in_tree) + continue; + nnode = kmalloc(sz, GFP_NOFS); + if (!nnode) { + err = -ENOMEM; + goto out; + } + memcpy(nnode, &path[h].nnode, sz); + parent = nnode->parent; + parent->nbranch[nnode->iip].nnode = nnode; + path[h].ptr.nnode = nnode; + path[h].in_tree = 1; + path[h + 1].cnode.parent = nnode; + } + if (path[h].in_tree) + ubifs_ensure_cat(c, lprops); + else { + const size_t sz = sizeof(struct ubifs_pnode); + struct ubifs_nnode *parent; + + pnode = kmalloc(sz, GFP_NOFS); + if (!pnode) { + err = -ENOMEM; + goto out; + } + memcpy(pnode, &path[h].pnode, sz); + parent = pnode->parent; + parent->nbranch[pnode->iip].pnode = pnode; + path[h].ptr.pnode = pnode; + path[h].in_tree = 1; + update_cats(c, pnode); + c->pnodes_have += 1; + } + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *) + c->nroot, 0, 0); + if (err) + goto out; + err = dbg_check_cats(c); + if (err) + goto out; + } + if (ret & LPT_SCAN_STOP) { + err = 0; + break; + } + /* Get the next lprops */ + if (lnum == end_lnum) { + /* + * We got to the end without finding what we were + * looking for + */ + err = -ENOSPC; + goto out; + } + if (lnum + 1 >= c->leb_cnt) { + /* Wrap-around to the beginning */ + start_lnum = c->main_first; + goto again; + } + if (iip + 1 < UBIFS_LPT_FANOUT) { + /* Next lprops is in the same pnode */ + iip += 1; + continue; + } + /* We need to get the next pnode. 
Go up until we can go right */ + iip = pnode->iip; + while (1) { + h -= 1; + ubifs_assert(h >= 0); + nnode = path[h].ptr.nnode; + if (iip + 1 < UBIFS_LPT_FANOUT) + break; + iip = nnode->iip; + } + /* Go right */ + iip += 1; + /* Descend to the pnode */ + h += 1; + for (; h < c->lpt_hght; h++) { + nnode = scan_get_nnode(c, path + h, nnode, iip); + if (IS_ERR(nnode)) { + err = PTR_ERR(nnode); + goto out; + } + iip = 0; + } + pnode = scan_get_pnode(c, path + h, nnode, iip); + if (IS_ERR(pnode)) { + err = PTR_ERR(pnode); + goto out; + } + iip = 0; + } +out: + kfree(path); + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_chk_pnode - check a pnode. + * @c: the UBIFS file-system description object + * @pnode: pnode to check + * @col: pnode column + * + * This function returns %0 on success and a negative error code on failure. + */ +static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + int col) +{ + int i; + + if (pnode->num != col) { + dbg_err("pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); + return -EINVAL; + } + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp, *lprops = &pnode->lprops[i]; + int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i + + c->main_first; + int found, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + struct list_head *list = NULL; + + if (lnum >= c->leb_cnt) + continue; + if (lprops->lnum != lnum) { + dbg_err("bad LEB number %d expected %d", + lprops->lnum, lnum); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + if (cat != LPROPS_UNCAT) { + dbg_err("LEB %d taken but not uncat %d", + lprops->lnum, cat); + return -EINVAL; + } + continue; + } + if (lprops->flags & LPROPS_INDEX) { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY_IDX: + case LPROPS_FRDI_IDX: + break; + default: + dbg_err("LEB %d index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } else { + switch (cat) { + case LPROPS_UNCAT: + case LPROPS_DIRTY: + case LPROPS_FREE: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + break; + default: + dbg_err("LEB %d not index but cat %d", + lprops->lnum, cat); + return -EINVAL; + } + } + switch (cat) { + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + } + found = 0; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (lprops->hpos < heap->cnt && + heap->arr[lprops->hpos] == lprops) + found = 1; + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_for_each_entry(lp, list, list) + if (lprops == lp) { + found = 1; + break; + } + break; + } + if (!found) { + dbg_err("LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); + return -EINVAL; + } + switch (cat) { + case LPROPS_EMPTY: + if (lprops->free != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + if (lprops->free + lprops->dirty != c->leb_size) { + dbg_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); + return -EINVAL; + } + } + } + return 0; +} + +/** + * dbg_check_lpt_nodes - check nnodes and pnodes. 
+ * @c: the UBIFS file-system description object + * @cnode: next cnode (nnode or pnode) to check + * @row: row of cnode (root is zero) + * @col: column of cnode (leftmost is zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, + int row, int col) +{ + struct ubifs_nnode *nnode, *nn; + struct ubifs_cnode *cn; + int num, iip = 0, err; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + while (cnode) { + ubifs_assert(row >= 0); + nnode = cnode->parent; + if (cnode->level) { + /* cnode is a nnode */ + num = calc_nnode_num(row, col); + if (cnode->num != num) { + dbg_err("nnode num %d expected %d " + "parent num %d iip %d", cnode->num, num, + (nnode ? nnode->num : 0), cnode->iip); + return -EINVAL; + } + nn = (struct ubifs_nnode *)cnode; + while (iip < UBIFS_LPT_FANOUT) { + cn = nn->nbranch[iip].cnode; + if (cn) { + /* Go down */ + row += 1; + col <<= UBIFS_LPT_FANOUT_SHIFT; + col += iip; + iip = 0; + cnode = cn; + break; + } + /* Go right */ + iip += 1; + } + if (iip < UBIFS_LPT_FANOUT) + continue; + } else { + struct ubifs_pnode *pnode; + + /* cnode is a pnode */ + pnode = (struct ubifs_pnode *)cnode; + err = dbg_chk_pnode(c, pnode, col); + if (err) + return err; + } + /* Go up and to the right */ + row -= 1; + col >>= UBIFS_LPT_FANOUT_SHIFT; + iip = cnode->iip + 1; + cnode = (struct ubifs_cnode *)nnode; + } + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c new file mode 100644 index 00000000000..5f0b83e20af --- /dev/null +++ b/fs/ubifs/lpt_commit.c @@ -0,0 +1,1648 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements commit-related functionality of the LEB properties + * subsystem. + */ + +#include +#include "ubifs.h" + +/** + * first_dirty_cnode - find first dirty cnode. + * @c: UBIFS file-system description object + * @nnode: nnode at which to start + * + * This function returns the first dirty cnode or %NULL if there is not one. + */ +static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode) +{ + ubifs_assert(nnode); + while (1) { + int i, cont = 0; + + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_cnode *cnode; + + cnode = nnode->nbranch[i].cnode; + if (cnode && + test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; + nnode = (struct ubifs_nnode *)cnode; + cont = 1; + break; + } + } + if (!cont) + return (struct ubifs_cnode *)nnode; + } +} + +/** + * next_dirty_cnode - find next dirty cnode. + * @cnode: cnode from which to begin searching + * + * This function returns the next dirty cnode or %NULL if there is not one. 
+ */ +static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode) +{ + struct ubifs_nnode *nnode; + int i; + + ubifs_assert(cnode); + nnode = cnode->parent; + if (!nnode) + return NULL; + for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) { + cnode = nnode->nbranch[i].cnode; + if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) { + if (cnode->level == 0) + return cnode; /* cnode is a pnode */ + /* cnode is a nnode */ + return first_dirty_cnode((struct ubifs_nnode *)cnode); + } + } + return (struct ubifs_cnode *)nnode; +} + +/** + * get_cnodes_to_commit - create list of dirty cnodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of cnodes to commit. + */ +static int get_cnodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + int cnt = 0; + + if (!c->nroot) + return 0; + + if (!test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + c->lpt_cnext = first_dirty_cnode(c->nroot); + cnode = c->lpt_cnext; + if (!cnode) + return 0; + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); + __set_bit(COW_ZNODE, &cnode->flags); + cnext = next_dirty_cnode(cnode); + if (!cnext) { + cnode->cnext = c->lpt_cnext; + break; + } + cnode->cnext = cnext; + cnode = cnext; + cnt += 1; + } + dbg_cmt("committing %d cnodes", cnt); + dbg_lp("committing %d cnodes", cnt); + ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt); + return cnt; +} + +/** + * upd_ltab - update LPT LEB properties. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @free: amount of free space + * @dirty: amount of dirty space to add + */ +static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty) +{ + dbg_lp("LEB %d free %d dirty %d to %d +%d", + lnum, c->ltab[lnum - c->lpt_first].free, + c->ltab[lnum - c->lpt_first].dirty, free, dirty); + ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); + c->ltab[lnum - c->lpt_first].free = free; + c->ltab[lnum - c->lpt_first].dirty += dirty; +} + +/** + * alloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function finds the next empty LEB in the ltab starting from @lnum. If a + * an empty LEB is found it is returned in @lnum and the function returns %0. + * Otherwise the function returns -ENOSPC. Note however, that LPT is designed + * never to run out of space. + */ +static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + + for (i = 0; i < n; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (c->ltab[i].free == c->leb_size) { + c->ltab[i].cmt = 1; + *lnum = i + c->lpt_first; + return 0; + } + } + dbg_err("last LEB %d", *lnum); + dump_stack(); + return -ENOSPC; +} + +/** + * layout_cnodes - layout cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int layout_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, alen, done_lsave, done_ltab, err; + struct ubifs_cnode *cnode; + + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + } + + do { + if (cnode->level) { + len = c->nnode_sz; + c->dirty_nn_cnt -= 1; + } else { + len = c->pnode_sz; + c->dirty_pn_cnt -= 1; + } + while (offs + len > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + continue; + } + if (!done_ltab) { + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + continue; + } + break; + } + if (cnode->parent) { + cnode->parent->nbranch[cnode->iip].lnum = lnum; + cnode->parent->nbranch[cnode->iip].offs = offs; + } else { + c->lpt_lnum = lnum; + c->lpt_offs = offs; + } + offs += len; + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_lsave = 1; + c->lsave_lnum = lnum; + c->lsave_offs = offs; + offs += c->lsave_sz; + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + err = alloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + } + done_ltab = 1; + c->ltab_lnum = lnum; + c->ltab_offs = offs; + offs += c->ltab_sz; + } + + alen = ALIGN(offs, c->min_io_size); + upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + return 0; +} + +/** + * realloc_lpt_leb - allocate an LPT LEB that is empty. + * @c: UBIFS file-system description object + * @lnum: LEB number is passed and returned here + * + * This function duplicates exactly the results of the function alloc_lpt_leb. + * It is used during end commit to reallocate the same LEB numbers that were + * allocated by alloc_lpt_leb during start commit. + * + * This function finds the next LEB that was allocated by the alloc_lpt_leb + * function starting from @lnum. If a LEB is found it is returned in @lnum and + * the function returns %0. Otherwise the function returns -ENOSPC. + * Note however, that LPT is designed never to run out of space. 
+ */ +static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) +{ + int i, n; + + n = *lnum - c->lpt_first + 1; + for (i = n; i < c->lpt_lebs; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + + for (i = 0; i < n; i++) + if (c->ltab[i].cmt) { + c->ltab[i].cmt = 0; + *lnum = i + c->lpt_first; + return 0; + } + dbg_err("last LEB %d", *lnum); + dump_stack(); + return -ENOSPC; +} + +/** + * write_cnodes - write cnodes for commit. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_cnodes(struct ubifs_info *c) +{ + int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave; + struct ubifs_cnode *cnode; + void *buf = c->lpt_buf; + + cnode = c->lpt_cnext; + if (!cnode) + return 0; + lnum = c->nhead_lnum; + offs = c->nhead_offs; + from = offs; + /* Ensure empty LEB is unmapped */ + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + /* Try to place lsave and ltab nicely */ + done_lsave = !c->big_lpt; + done_ltab = 0; + if (!done_lsave && offs + c->lsave_sz <= c->leb_size) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + } + + if (offs + c->ltab_sz <= c->leb_size) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + } + + /* Loop for each cnode */ + do { + if (cnode->level) + len = c->nnode_sz; + else + len = c->pnode_sz; + while (offs + len > c->leb_size) { + wlen = offs - from; + if (wlen) { + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, + alen, UBI_SHORTTERM); + if (err) + return err; + } + err = realloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + from = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + /* Try to place lsave and ltab nicely */ + if (!done_lsave) { + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + continue; + } + if (!done_ltab) { + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + continue; + } + break; + } + if (cnode->level) + ubifs_pack_nnode(c, buf + offs, + (struct ubifs_nnode *)cnode); + else + ubifs_pack_pnode(c, buf + offs, + (struct ubifs_pnode *)cnode); + /* + * The reason for the barriers is the same as in case of TNC. + * See comment in 'write_index()'. 'dirty_cow_nnode()' and + * 'dirty_cow_pnode()' are the functions for which this is + * important. 
+ */ + clear_bit(DIRTY_CNODE, &cnode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &cnode->flags); + smp_mb__after_clear_bit(); + offs += len; + cnode = cnode->cnext; + } while (cnode && cnode != c->lpt_cnext); + + /* Make sure to place LPT's save table */ + if (!done_lsave) { + if (offs + c->lsave_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + err = realloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_lsave = 1; + ubifs_pack_lsave(c, buf + offs, c->lsave); + offs += c->lsave_sz; + } + + /* Make sure to place LPT's own lprops table */ + if (!done_ltab) { + if (offs + c->ltab_sz > c->leb_size) { + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, + UBI_SHORTTERM); + if (err) + return err; + err = realloc_lpt_leb(c, &lnum); + if (err) + return err; + offs = 0; + ubifs_assert(lnum >= c->lpt_first && + lnum <= c->lpt_last); + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + done_ltab = 1; + ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); + offs += c->ltab_sz; + } + + /* Write remaining data in buffer */ + wlen = offs - from; + alen = ALIGN(wlen, c->min_io_size); + memset(buf + offs, 0xff, alen - wlen); + err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); + if (err) + return err; + c->nhead_lnum = lnum; + c->nhead_offs = ALIGN(offs, c->min_io_size); + + dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs); + dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs); + dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + return 0; +} + +/** + * next_pnode - find next pnode. + * @c: UBIFS file-system description object + * @pnode: pnode + * + * This function returns the next pnode or %NULL if there are no more pnodes. + */ +static struct ubifs_pnode *next_pnode(struct ubifs_info *c, + struct ubifs_pnode *pnode) +{ + struct ubifs_nnode *nnode; + int iip; + + /* Try to go right */ + nnode = pnode->parent; + iip = pnode->iip + 1; + if (iip < UBIFS_LPT_FANOUT) { + /* We assume here that LEB zero is never an LPT LEB */ + if (nnode->nbranch[iip].lnum) + return ubifs_get_pnode(c, nnode, iip); + else + return NULL; + } + + /* Go up while can't go right */ + do { + iip = nnode->iip + 1; + nnode = nnode->parent; + if (!nnode) + return NULL; + /* We assume here that LEB zero is never an LPT LEB */ + } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); + + /* Go right */ + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return (void *)nnode; + + /* Go down to level 1 */ + while (nnode->level > 1) { + nnode = ubifs_get_nnode(c, nnode, 0); + if (IS_ERR(nnode)) + return (void *)nnode; + } + + return ubifs_get_pnode(c, nnode, 0); +} + +/** + * pnode_lookup - lookup a pnode in the LPT. + * @c: UBIFS file-system description object + * @i: pnode number (0 to main_lebs - 1) + * + * This function returns a pointer to the pnode on success or a negative + * error code on failure. 
+ */ +static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) +{ + int err, h, iip, shft; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + i <<= UBIFS_LPT_FANOUT_SHIFT; + nnode = c->nroot; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * do_make_pnode_dirty - mark a pnode dirty. + * @c: UBIFS file-system description object + * @pnode: pnode to mark dirty + */ +static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + struct ubifs_nnode *nnode; + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + /* Mark parent and ancestors dirty too */ + nnode = pnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } +} + +/** + * make_tree_dirty - mark the entire LEB properties tree dirty. + * @c: UBIFS file-system description object + * + * This function is used by the "small" LPT model to cause the entire LEB + * properties tree to be written. The "small" LPT model does not use LPT + * garbage collection because it is more efficient to write the entire tree + * (because it is small). + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_tree_dirty(struct ubifs_info *c) +{ + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, 0); + while (pnode) { + do_make_pnode_dirty(c, pnode); + pnode = next_pnode(c, pnode); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + } + return 0; +} + +/** + * need_write_all - determine if the LPT area is running out of free space. + * @c: UBIFS file-system description object + * + * This function returns %1 if the LPT area is running out of free space and %0 + * if it is not. + */ +static int need_write_all(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + free += c->leb_size; + } + /* Less than twice the size left */ + if (free <= c->lpt_sz * 2) + return 1; + return 0; +} + +/** + * lpt_tgc_start - start trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called during start commit to mark LPT LEBs for trivial GC. 
+ */ +static void lpt_tgc_start(struct ubifs_info *c) +{ + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + continue; + if (c->ltab[i].dirty > 0 && + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) { + c->ltab[i].tgc = 1; + c->ltab[i].free = c->leb_size; + c->ltab[i].dirty = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + } +} + +/** + * lpt_tgc_end - end trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called after the commit is completed (master node has been + * written) and unmaps LPT LEBs that were marked for trivial GC. + */ +static int lpt_tgc_end(struct ubifs_info *c) +{ + int i, err; + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].tgc) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + c->ltab[i].tgc = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + return 0; +} + +/** + * populate_lsave - fill the lsave array with important LEB numbers. + * @c: the UBIFS file-system description object + * + * This function is only called for the "big" model. It records a small number + * of LEB numbers of important LEBs. Important LEBs are ones that are (from + * most important to least important): empty, freeable, freeable index, dirty + * index, dirty or free. Upon mount, we read this list of LEB numbers and bring + * their pnodes into memory. That will stop us from having to scan the LPT + * straight away. For the "small" model we assume that scanning the LPT is no + * big deal. + */ +static void populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i, cnt = 0; + + ubifs_assert(c->big_lpt); + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + list_for_each_entry(lprops, &c->empty_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->freeable_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + c->lsave[cnt++] = lprops->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + c->lsave[cnt++] = heap->arr[i]->lnum; + if (cnt >= c->lsave_cnt) + return; + } + /* Fill it up completely */ + while (cnt < c->lsave_cnt) + c->lsave[cnt++] = c->main_first; +} + +/** + * nnode_lookup - lookup a nnode in the LPT. + * @c: UBIFS file-system description object + * @i: nnode number + * + * This function returns a pointer to the nnode on success or a negative + * error code on failure. 
+ */ +static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i) +{ + int err, iip; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + while (1) { + iip = i & (UBIFS_LPT_FANOUT - 1); + i >>= UBIFS_LPT_FANOUT_SHIFT; + if (!i) + break; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return nnode; + } + return nnode; +} + +/** + * make_nnode_dirty - find a nnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: nnode number of nnode to make dirty + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_nnode *nnode; + + nnode = nnode_lookup(c, node_num); + if (IS_ERR(nnode)) + return PTR_ERR(nnode); + if (nnode->parent) { + struct ubifs_nbranch *branch; + + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; /* nnode is obsolete */ + } else if (c->lpt_lnum != lnum || c->lpt_offs != offs) + return 0; /* nnode is obsolete */ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + /* Mark parent and ancestors dirty too */ + nnode = nnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } + return 0; +} + +/** + * make_pnode_dirty - find a pnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: pnode number of pnode to make dirty + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + pnode = pnode_lookup(c, node_num); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; + do_make_pnode_dirty(c, pnode); + return 0; +} + +/** + * make_ltab_dirty - make ltab node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where ltab was written + * @offs: offset where ltab was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). 
Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 0; /* This ltab node is obsolete */ + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + return 0; +} + +/** + * make_lsave_dirty - make lsave node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where lsave was written + * @offs: offset where lsave was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 0; /* This lsave node is obsolete */ + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + return 0; +} + +/** + * make_node_dirty - make node dirty. + * @c: UBIFS file-system description object + * @node_type: LPT node type + * @node_num: node number + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, + int lnum, int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return make_nnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_PNODE: + return make_pnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_LTAB: + return make_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return make_lsave_dirty(c, lnum, offs); + } + return -EINVAL; +} + +/** + * get_lpt_node_len - return the length of a node based on its type. + * @c: UBIFS file-system description object + * @node_type: LPT node type + */ +static int get_lpt_node_len(struct ubifs_info *c, int node_type) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return c->nnode_sz; + case UBIFS_LPT_PNODE: + return c->pnode_sz; + case UBIFS_LPT_LTAB: + return c->ltab_sz; + case UBIFS_LPT_LSAVE: + return c->lsave_sz; + } + return 0; +} + +/** + * get_pad_len - return the length of padding in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + */ +static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) +{ + int offs, pad_len; + + if (c->min_io_size == 1) + return 0; + offs = c->leb_size - len; + pad_len = ALIGN(offs, c->min_io_size) - offs; + return pad_len; +} + +/** + * get_lpt_node_type - return type (and node number) of a node in a buffer. 
+ * @c: UBIFS file-system description object + * @buf: buffer + * @node_num: node number is returned here + */ +static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + return node_type; +} + +/** + * is_a_node - determine if a buffer contains a node. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + * + * This function returns %1 if the buffer contains a node or %0 if it does not. + */ +static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type, node_len; + uint16_t crc, calc_crc; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + if (node_type == UBIFS_LPT_NOT_A_NODE) + return 0; + node_len = get_lpt_node_len(c, node_type); + if (!node_len || node_len > len) + return 0; + pos = 0; + addr = buf; + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + node_len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) + return 0; + return 1; +} + + +/** + * lpt_gc_lnum - garbage collect a LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to garbage collect + * + * LPT garbage collection is used only for the "big" LPT model + * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes + * in the LEB being garbage-collected as dirty. The dirty nodes are written + * next commit, after which the LEB is free to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int lpt_gc_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->lpt_buf; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + continue; + } + return 0; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + offs = c->leb_size - len; + ubifs_assert(node_len != 0); + mutex_lock(&c->lp_mutex); + err = make_node_dirty(c, node_type, node_num, lnum, offs); + mutex_unlock(&c->lp_mutex); + if (err) + return err; + buf += node_len; + len -= node_len; + } + return 0; +} + +/** + * lpt_gc - LPT garbage collection. + * @c: UBIFS file-system description object + * + * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. + * Returns %0 on success and a negative error code on failure. + */ +static int lpt_gc(struct ubifs_info *c) +{ + int i, lnum = -1, dirty = 0; + + mutex_lock(&c->lp_mutex); + for (i = 0; i < c->lpt_lebs; i++) { + ubifs_assert(!c->ltab[i].tgc); + if (i + c->lpt_first == c->nhead_lnum || + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + continue; + if (c->ltab[i].dirty > dirty) { + dirty = c->ltab[i].dirty; + lnum = i + c->lpt_first; + } + } + mutex_unlock(&c->lp_mutex); + if (lnum == -1) + return -ENOSPC; + return lpt_gc_lnum(c, lnum); +} + +/** + * ubifs_lpt_start_commit - UBIFS commit starts. + * @c: the UBIFS file-system description object + * + * This function has to be called when UBIFS starts the commit operation. 
+ * This function "freezes" all currently dirty LEB properties and does not + * change them anymore. Further changes are saved and tracked separately + * because they are not part of this commit. This function returns zero in case + * of success and a negative error code in case of failure. + */ +int ubifs_lpt_start_commit(struct ubifs_info *c) +{ + int err, cnt; + + dbg_lp(""); + + mutex_lock(&c->lp_mutex); + err = dbg_check_ltab(c); + if (err) + goto out; + + if (c->check_lpt_free) { + /* + * We ensure there is enough free space in + * ubifs_lpt_post_commit() by marking nodes dirty. That + * information is lost when we unmount, so we also need + * to check free space once after mounting also. + */ + c->check_lpt_free = 0; + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } + } + + lpt_tgc_start(c); + + if (!c->dirty_pn_cnt) { + dbg_cmt("no cnodes to commit"); + err = 0; + goto out; + } + + if (!c->big_lpt && need_write_all(c)) { + /* If needed, write everything */ + err = make_tree_dirty(c); + if (err) + goto out; + lpt_tgc_start(c); + } + + if (c->big_lpt) + populate_lsave(c); + + cnt = get_cnodes_to_commit(c); + ubifs_assert(cnt != 0); + + err = layout_cnodes(c); + if (err) + goto out; + + /* Copy the LPT's own lprops for end commit to write */ + memcpy(c->ltab_cmt, c->ltab, + sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs); + c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY); + +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * free_obsolete_cnodes - free obsolete cnodes for commit end. + * @c: UBIFS file-system description object + */ +static void free_obsolete_cnodes(struct ubifs_info *c) +{ + struct ubifs_cnode *cnode, *cnext; + + cnext = c->lpt_cnext; + if (!cnext) + return; + do { + cnode = cnext; + cnext = cnode->cnext; + if (test_bit(OBSOLETE_CNODE, &cnode->flags)) + kfree(cnode); + else + cnode->cnext = NULL; + } while (cnext != c->lpt_cnext); + c->lpt_cnext = NULL; +} + +/** + * ubifs_lpt_end_commit - finish the commit operation. + * @c: the UBIFS file-system description object + * + * This function has to be called when the commit operation finishes. It + * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to + * the media. Returns zero in case of success and a negative error code in case + * of failure. + */ +int ubifs_lpt_end_commit(struct ubifs_info *c) +{ + int err; + + dbg_lp(""); + + if (!c->lpt_cnext) + return 0; + + err = write_cnodes(c); + if (err) + return err; + + mutex_lock(&c->lp_mutex); + free_obsolete_cnodes(c); + mutex_unlock(&c->lp_mutex); + + return 0; +} + +/** + * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC. + * @c: UBIFS file-system description object + * + * LPT trivial GC is completed after a commit. Also LPT GC is done after a + * commit for the "big" LPT model. + */ +int ubifs_lpt_post_commit(struct ubifs_info *c) +{ + int err; + + mutex_lock(&c->lp_mutex); + err = lpt_tgc_end(c); + if (err) + goto out; + if (c->big_lpt) + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * first_nnode - find the first nnode in memory. + * @c: UBIFS file-system description object + * @hght: height of tree where nnode found is returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. 
+ */ +static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght) +{ + struct ubifs_nnode *nnode; + int h, i, found; + + nnode = c->nroot; + *hght = 0; + if (!nnode) + return NULL; + for (h = 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * next_nnode - find the next nnode in memory. + * @c: UBIFS file-system description object + * @nnode: nnode from which to start. + * @hght: height of tree where nnode is, is passed and returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *next_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode, int *hght) +{ + struct ubifs_nnode *parent; + int iip, h, i, found; + + parent = nnode->parent; + if (!parent) + return NULL; + if (nnode->iip == UBIFS_LPT_FANOUT - 1) { + *hght -= 1; + return parent; + } + for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + nnode = parent->nbranch[iip].nnode; + if (nnode) + break; + } + if (!nnode) { + *hght -= 1; + return parent; + } + for (h = *hght + 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * ubifs_lpt_free - free resources owned by the LPT. + * @c: UBIFS file-system description object + * @wr_only: free only resources used for writing + */ +void ubifs_lpt_free(struct ubifs_info *c, int wr_only) +{ + struct ubifs_nnode *nnode; + int i, hght; + + /* Free write-only things first */ + + free_obsolete_cnodes(c); /* Leftover from a failed commit */ + + vfree(c->ltab_cmt); + c->ltab_cmt = NULL; + vfree(c->lpt_buf); + c->lpt_buf = NULL; + kfree(c->lsave); + c->lsave = NULL; + + if (wr_only) + return; + + /* Now free the rest */ + + nnode = first_nnode(c, &hght); + while (nnode) { + for (i = 0; i < UBIFS_LPT_FANOUT; i++) + kfree(nnode->nbranch[i].nnode); + nnode = next_nnode(c, nnode, &hght); + } + for (i = 0; i < LPROPS_HEAP_CNT; i++) + kfree(c->lpt_heap[i].arr); + kfree(c->dirty_idx.arr); + kfree(c->nroot); + vfree(c->ltab); + kfree(c->lpt_nod_buf); +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. + * @buf: buffer + * @len: buffer length + */ +static int dbg_is_all_ff(uint8_t *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + return 0; + return 1; +} + +/** + * dbg_is_nnode_dirty - determine if a nnode is dirty. 
+ * @c: the UBIFS file-system description object + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + */ +static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_nnode *nnode; + int hght; + + /* Entire tree is in memory so first_nnode / next_nnode are ok */ + nnode = first_nnode(c, &hght); + for (; nnode; nnode = next_nnode(c, nnode, &hght)) { + struct ubifs_nbranch *branch; + + cond_resched(); + if (nnode->parent) { + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } else { + if (c->lpt_lnum != lnum || c->lpt_offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } + } + return 1; +} + +/** + * dbg_is_pnode_dirty - determine if a pnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + */ +static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + int i, cnt; + + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + cond_resched(); + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &pnode->flags)) + return 1; + return 0; + } + return 1; +} + +/** + * dbg_is_ltab_dirty - determine if a ltab node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where ltab node was written + * @offs: offset where ltab node was written + */ +static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 1; + return (c->lpt_drty_flgs & LTAB_DIRTY) != 0; +} + +/** + * dbg_is_lsave_dirty - determine if a lsave node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where lsave node was written + * @offs: offset where lsave node was written + */ +static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 1; + return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0; +} + +/** + * dbg_is_node_dirty - determine if a node is dirty. + * @c: the UBIFS file-system description object + * @node_type: node type + * @lnum: LEB number where node was written + * @offs: offset where node was written + */ +static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, + int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return dbg_is_nnode_dirty(c, lnum, offs); + case UBIFS_LPT_PNODE: + return dbg_is_pnode_dirty(c, lnum, offs); + case UBIFS_LPT_LTAB: + return dbg_is_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return dbg_is_lsave_dirty(c, lnum, offs); + } + return 1; +} + +/** + * dbg_check_ltab_lnum - check the ltab for a LPT LEB number. + * @c: the UBIFS file-system description object + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; + int ret; + void *buf = c->dbg_buf; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int i, pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + dirty += pad_len; + continue; + } + if (!dbg_is_all_ff(buf, len)) { + dbg_msg("invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); + err = -EINVAL; + } + i = lnum - c->lpt_first; + if (len != c->ltab[i].free) { + dbg_msg("invalid free space in LEB %d " + "(free %d, expected %d)", + lnum, len, c->ltab[i].free); + err = -EINVAL; + } + if (dirty != c->ltab[i].dirty) { + dbg_msg("invalid dirty space in LEB %d " + "(dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); + err = -EINVAL; + } + return err; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); + if (ret == 1) + dirty += node_len; + buf += node_len; + len -= node_len; + } +} + +/** + * dbg_check_ltab - check the free and dirty space in the ltab. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_ltab(struct ubifs_info *c) +{ + int lnum, err, i, cnt; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* Bring the entire tree into memory */ + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + cond_resched(); + } + + /* Check nodes */ + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0); + if (err) + return err; + + /* Check each LEB */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + err = dbg_check_ltab_lnum(c, lnum); + if (err) { + dbg_err("failed at LEB %d", lnum); + return err; + } + } + + dbg_lp("succeeded"); + return 0; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c new file mode 100644 index 00000000000..71d5493bf56 --- /dev/null +++ b/fs/ubifs/master.c @@ -0,0 +1,387 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements reading and writing the master node */ + +#include "ubifs.h" + +/** + * scan_for_master - search the valid master node. + * @c: UBIFS file-system description object + * + * This function scans the master node LEBs and search for the latest master + * node. 
Returns zero in case of success and a negative error code in case of + * failure. + */ +static int scan_for_master(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, offs = 0, nodes_cnt; + + lnum = UBIFS_MST_LNUM; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + nodes_cnt = sleb->nodes_cnt; + if (nodes_cnt > 0) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + if (snod->type != UBIFS_MST_NODE) + goto out; + memcpy(c->mst_node, snod->node, snod->len); + offs = snod->offs; + } + ubifs_scan_destroy(sleb); + + lnum += 1; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + if (sleb->nodes_cnt != nodes_cnt) + goto out; + if (!sleb->nodes_cnt) + goto out; + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); + if (snod->type != UBIFS_MST_NODE) + goto out; + if (snod->offs != offs) + goto out; + if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, + (void *)snod->node + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out; + c->mst_offs = offs; + ubifs_scan_destroy(sleb); + return 0; + +out: + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * validate_master - validate master node. + * @c: UBIFS file-system description object + * + * This function validates data which was read from master node. Returns zero + * if the data is all right and %-EINVAL if not. + */ +static int validate_master(const struct ubifs_info *c) +{ + long long main_sz; + int err; + + if (c->max_sqnum >= SQNUM_WATERMARK) { + err = 1; + goto out; + } + + if (c->cmt_no >= c->max_sqnum) { + err = 2; + goto out; + } + + if (c->highest_inum >= INUM_WATERMARK) { + err = 3; + goto out; + } + + if (c->lhead_lnum < UBIFS_LOG_LNUM || + c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs || + c->lhead_offs < 0 || c->lhead_offs >= c->leb_size || + c->lhead_offs & (c->min_io_size - 1)) { + err = 4; + goto out; + } + + if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first || + c->zroot.offs >= c->leb_size || c->zroot.offs & 7) { + err = 5; + goto out; + } + + if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len || + c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) { + err = 6; + goto out; + } + + if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) { + err = 7; + goto out; + } + + if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first || + c->ihead_offs % c->min_io_size || c->ihead_offs < 0 || + c->ihead_offs > c->leb_size || c->ihead_offs & 7) { + err = 8; + goto out; + } + + main_sz = (long long)c->main_lebs * c->leb_size; + if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { + err = 9; + goto out; + } + + if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last || + c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) { + err = 10; + goto out; + } + + if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last || + c->nhead_offs < 0 || c->nhead_offs % c->min_io_size || + c->nhead_offs > c->leb_size) { + err = 11; + goto out; + } + + if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last || + c->ltab_offs < 0 || + c->ltab_offs + c->ltab_sz > c->leb_size) { + err = 12; + goto out; + } + + if (c->big_lpt && (c->lsave_lnum < c->lpt_first || + c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 || + c->lsave_offs + c->lsave_sz > c->leb_size)) { + err = 13; + goto out; + } + + if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) { + err = 14; + goto out; + } + + if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > 
c->main_lebs - 2) { + err = 15; + goto out; + } + + if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) { + err = 16; + goto out; + } + + if (c->lst.total_free < 0 || c->lst.total_free > main_sz || + c->lst.total_free & 7) { + err = 17; + goto out; + } + + if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) { + err = 18; + goto out; + } + + if (c->lst.total_used < 0 || (c->lst.total_used & 7)) { + err = 19; + goto out; + } + + if (c->lst.total_free + c->lst.total_dirty + + c->lst.total_used > main_sz) { + err = 20; + goto out; + } + + if (c->lst.total_dead + c->lst.total_dark + + c->lst.total_used + c->old_idx_sz > main_sz) { + err = 21; + goto out; + } + + if (c->lst.total_dead < 0 || + c->lst.total_dead > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dead & 7) { + err = 22; + goto out; + } + + if (c->lst.total_dark < 0 || + c->lst.total_dark > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dark & 7) { + err = 23; + goto out; + } + + return 0; + +out: + ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); + dbg_dump_node(c, c->mst_node); + return -EINVAL; +} + +/** + * ubifs_read_master - read master node. + * @c: UBIFS file-system description object + * + * This function finds and reads the master node during file-system mount. If + * the flash is empty, it creates default master node as well. Returns zero in + * case of success and a negative error code in case of failure. + */ +int ubifs_read_master(struct ubifs_info *c) +{ + int err, old_leb_cnt; + + c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!c->mst_node) + return -ENOMEM; + + err = scan_for_master(c); + if (err) { + err = ubifs_recover_master_node(c); + if (err) + /* + * Note, we do not free 'c->mst_node' here because the + * unmount routine will take care of this. 
+ */ + return err; + } + + /* Make sure that the recovery flag is clear */ + c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY); + + c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum); + c->highest_inum = le64_to_cpu(c->mst_node->highest_inum); + c->cmt_no = le64_to_cpu(c->mst_node->cmt_no); + c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum); + c->zroot.offs = le32_to_cpu(c->mst_node->root_offs); + c->zroot.len = le32_to_cpu(c->mst_node->root_len); + c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum); + c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); + c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); + c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); + c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); + c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); + c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); + c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs); + c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum); + c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs); + c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum); + c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs); + c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum); + c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs); + c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs); + old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt); + c->lst.total_free = le64_to_cpu(c->mst_node->total_free); + c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty); + c->lst.total_used = le64_to_cpu(c->mst_node->total_used); + c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); + c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); + + c->calc_idx_sz = c->old_idx_sz; + + if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) + c->no_orphs = 1; + + if (old_leb_cnt != c->leb_cnt) { + /* The file system has been resized */ + int growth = c->leb_cnt - old_leb_cnt; + + if (c->leb_cnt < old_leb_cnt || + c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("bad leb_cnt on master node"); + dbg_dump_node(c, c->mst_node); + return -EINVAL; + } + + dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs", + old_leb_cnt, c->leb_cnt); + c->lst.empty_lebs += growth; + c->lst.total_free += growth * (long long)c->leb_size; + c->lst.total_dark += growth * (long long)c->dark_wm; + + /* + * Reflect changes back onto the master node. N.B. the master + * node gets written immediately whenever mounting (or + * remounting) in read-write mode, so we do not need to write it + * here. + */ + c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt); + c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs); + c->mst_node->total_free = cpu_to_le64(c->lst.total_free); + c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark); + } + + err = validate_master(c); + if (err) + return err; + + err = dbg_old_index_check_init(c, &c->zroot); + + return err; +} + +/** + * ubifs_write_master - write master node. + * @c: UBIFS file-system description object + * + * This function writes the master node. The caller has to take the + * @c->mst_mutex lock before calling this function. Returns zero in case of + * success and a negative error code in case of failure. The master node is + * written twice to enable recovery. 
+ */ +int ubifs_write_master(struct ubifs_info *c) +{ + int err, lnum, offs, len; + + if (c->ro_media) + return -EINVAL; + + lnum = UBIFS_MST_LNUM; + offs = c->mst_offs + c->mst_node_alsz; + len = UBIFS_MST_NODE_SZ; + + if (offs + UBIFS_MST_NODE_SZ > c->leb_size) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + offs = 0; + } + + c->mst_offs = offs; + c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); + + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + if (err) + return err; + + lnum += 1; + + if (offs == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + + return err; +} diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h new file mode 100644 index 00000000000..4beccfc256d --- /dev/null +++ b/fs/ubifs/misc.h @@ -0,0 +1,342 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains miscellaneous helper functions. + */ + +#ifndef __UBIFS_MISC_H__ +#define __UBIFS_MISC_H__ + +/** + * ubifs_zn_dirty - check if znode is dirty. + * @znode: znode to check + * + * This helper function returns %1 if @znode is dirty and %0 otherwise. + */ +static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) +{ + return !!test_bit(DIRTY_ZNODE, &znode->flags); +} + +/** + * ubifs_wake_up_bgt - wake up background thread. + * @c: UBIFS file-system description object + */ +static inline void ubifs_wake_up_bgt(struct ubifs_info *c) +{ + if (c->bgt && !c->need_bgt) { + c->need_bgt = 1; + wake_up_process(c->bgt); + } +} + +/** + * ubifs_tnc_find_child - find next child in znode. + * @znode: znode to search at + * @start: the zbranch index to start at + * + * This helper function looks for znode child starting at index @start. Returns + * the child or %NULL if no children were found. + */ +static inline struct ubifs_znode * +ubifs_tnc_find_child(struct ubifs_znode *znode, int start) +{ + while (start < znode->child_cnt) { + if (znode->zbranch[start].znode) + return znode->zbranch[start].znode; + start += 1; + } + + return NULL; +} + +/** + * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. + * @inode: the VFS 'struct inode' pointer + */ +static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) +{ + return container_of(inode, struct ubifs_inode, vfs_inode); +} + +/** + * ubifs_ro_mode - switch UBIFS to read read-only mode. 
+ * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +static inline void ubifs_ro_mode(struct ubifs_info *c, int err) +{ + if (!c->ro_media) { + c->ro_media = 1; + ubifs_warn("switched to read-only mode, error %d", err); + dbg_dump_stack(); + } +} + +/** + * ubifs_compr_present - check if compressor was compiled in. + * @compr_type: compressor type to check + * + * This function returns %1 of compressor of type @compr_type is present, and + * %0 if not. + */ +static inline int ubifs_compr_present(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return !!ubifs_compressors[compr_type]->capi_name; +} + +/** + * ubifs_compr_name - get compressor name string by its type. + * @compr_type: compressor type + * + * This function returns compressor type string. + */ +static inline const char *ubifs_compr_name(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return ubifs_compressors[compr_type]->name; +} + +/** + * ubifs_wbuf_sync - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume + * that the write-buffer is already locked. + */ +static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) +{ + int err; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_leb_unmap - unmap an LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to unmap + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_unmap(c->ubi, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_write - write to a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @offs: offset within LEB to write to + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, + const void *buf, int offs, int len, int dtype) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + if (err) { + ubifs_err("writing %d bytes at %d:%d, error %d", + len, lnum, offs, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_change - atomic LEB change. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, + const void *buf, int len, int dtype) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + if (err) { + ubifs_err("changing %d bytes in LEB %d, error %d", + len, lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_encode_dev - encode device node IDs. + * @dev: UBIFS device node information + * @rdev: device IDs to encode + * + * This is a helper function which encodes major/minor numbers of a device node + * into UBIFS device node description. 
We use standard Linux "new" and "huge"
+ * encodings.
+ */
+static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
+{
+	if (new_valid_dev(rdev)) {
+		dev->new = cpu_to_le32(new_encode_dev(rdev));
+		return sizeof(dev->new);
+	} else {
+		dev->huge = cpu_to_le64(huge_encode_dev(rdev));
+		return sizeof(dev->huge);
+	}
+}
+
+/**
+ * ubifs_add_dirt - add dirty space to LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to add dirty space for
+ * @dirty: dirty space to add
+ *
+ * This is a helper function which increases the amount of dirty LEB space.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+	return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0);
+}
+
+/**
+ * ubifs_return_leb - return LEB to lprops.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to return
+ *
+ * This helper function cleans the "taken" flag of a logical eraseblock in the
+ * lprops. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static inline int ubifs_return_leb(struct ubifs_info *c, int lnum)
+{
+	return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+				   LPROPS_TAKEN, 0);
+}
+
+/**
+ * ubifs_idx_node_sz - return index node size.
+ * @c: the UBIFS file-system description object
+ * @child_cnt: number of children of this index node
+ */
+static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
+{
+	return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
+}
+
+/**
+ * ubifs_idx_branch - return pointer to an index branch.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ * @bnum: branch number
+ */
+static inline
+struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c,
+				      const struct ubifs_idx_node *idx,
+				      int bnum)
+{
+	return (struct ubifs_branch *)((void *)idx->branches +
+				       (UBIFS_BRANCH_SZ + c->key_len) * bnum);
+}
+
+/**
+ * ubifs_idx_key - return pointer to an index key.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ */
+static inline void *ubifs_idx_key(const struct ubifs_info *c,
+				  const struct ubifs_idx_node *idx)
+{
+	return (void *)((struct ubifs_branch *)idx->branches)->key;
+}
+
+/**
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space applications tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexing nodes as well. This introduces additional
+ * overhead, and UBIFS has to report slightly less free space to meet the
+ * above expectation.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, doubled because we always allow enough
+ * space to write the index twice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
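+ *
+ * In other words, the helper below reports 'free / divisor * factor' bytes,
+ * where 'divisor' is UBIFS_MAX_DATA_NODE_SZ plus twice c->max_idx_node_sz
+ * (one worst-case data node plus its index node counted twice) and 'factor'
+ * is UBIFS_MAX_DATA_NODE_SZ minus UBIFS_DATA_NODE_SZ (the data payload of
+ * that data node).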
+ */ +static inline long long ubifs_reported_space(const struct ubifs_info *c, + uint64_t free) +{ + int divisor, factor; + + divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1); + factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; + do_div(free, divisor); + + return free * factor; +} + +/** + * ubifs_current_time - round current time to time granularity. + * @inode: inode + */ +static inline struct timespec ubifs_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +#endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c new file mode 100644 index 00000000000..3afeb9242c6 --- /dev/null +++ b/fs/ubifs/orphan.c @@ -0,0 +1,958 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Author: Adrian Hunter + */ + +#include "ubifs.h" + +/* + * An orphan is an inode number whose inode node has been committed to the index + * with a link count of zero. That happens when an open file is deleted + * (unlinked) and then a commit is run. In the normal course of events the inode + * would be deleted when the file is closed. However in the case of an unclean + * unmount, orphans need to be accounted for. After an unclean unmount, the + * orphans' inodes must be deleted which means either scanning the entire index + * looking for them, or keeping a list on flash somewhere. This unit implements + * the latter approach. + * + * The orphan area is a fixed number of LEBs situated between the LPT area and + * the main area. The number of orphan area LEBs is specified when the file + * system is created. The minimum number is 1. The size of the orphan area + * should be so that it can hold the maximum number of orphans that are expected + * to ever exist at one time. + * + * The number of orphans that can fit in a LEB is: + * + * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) + * + * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough. + * + * Orphans are accumulated in a rb-tree. When an inode's link count drops to + * zero, the inode number is added to the rb-tree. It is removed from the tree + * when the inode is deleted. Any new orphans that are in the orphan tree when + * the commit is run, are written to the orphan area in 1 or more orph nodes. + * If the orphan area is full, it is consolidated to make space. There is + * always enough space because validation prevents the user from creating more + * than the maximum number of orphans allowed. + */ + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_orphans(struct ubifs_info *c); +#else +#define dbg_check_orphans(c) 0 +#endif + +/** + * ubifs_add_orphan - add an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. 
This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + orphan->new = 1; + + spin_lock(&c->orphan_lock); + if (c->tot_orphans >= c->max_orphans) { + spin_unlock(&c->orphan_lock); + kfree(orphan); + return -ENFILE; + } + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + dbg_err("orphaned twice"); + spin_unlock(&c->orphan_lock); + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + c->new_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + list_add_tail(&orphan->new_list, &c->orph_new); + spin_unlock(&c->orphan_lock); + dbg_gen("ino %lu", inum); + return 0; +} + +/** + * ubifs_delete_orphan - delete an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Delete an orphan. This function is called when an inode is deleted. + */ +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + if (o->dnext) { + spin_unlock(&c->orphan_lock); + dbg_gen("deleted twice ino %lu", inum); + return; + } + if (o->cnext) { + o->dnext = c->orph_dnext; + c->orph_dnext = o; + spin_unlock(&c->orphan_lock); + dbg_gen("delete later ino %lu", inum); + return; + } + rb_erase(p, &c->orph_tree); + list_del(&o->list); + c->tot_orphans -= 1; + if (o->new) { + list_del(&o->new_list); + c->new_orphans -= 1; + } + spin_unlock(&c->orphan_lock); + kfree(o); + dbg_gen("inum %lu", inum); + return; + } + } + spin_unlock(&c->orphan_lock); + dbg_err("missing orphan ino %lu", inum); + dbg_dump_stack(); +} + +/** + * ubifs_orphan_start_commit - start commit of orphans. + * @c: UBIFS file-system description object + * + * Start commit of orphans. + */ +int ubifs_orphan_start_commit(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, **last; + + spin_lock(&c->orphan_lock); + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_new, new_list) { + ubifs_assert(orphan->new); + orphan->new = 0; + *last = orphan; + last = &orphan->cnext; + } + *last = orphan->cnext; + c->cmt_orphans = c->new_orphans; + c->new_orphans = 0; + dbg_cmt("%d orphans to commit", c->cmt_orphans); + INIT_LIST_HEAD(&c->orph_new); + if (c->tot_orphans == 0) + c->no_orphs = 1; + else + c->no_orphs = 0; + spin_unlock(&c->orphan_lock); + return 0; +} + +/** + * avail_orphs - calculate available space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in the + * available space. 
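+ *
+ * The available space consists of the whole LEBs that follow the current
+ * orphan head LEB, each of which can hold
+ * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) inode numbers, plus
+ * whatever still fits after c->ohead_offs in the head LEB itself (counted
+ * only if at least an orph node header and one inode number fit there).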
+ */ +static int avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail, gap; + + avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + gap = c->leb_size - c->ohead_offs; + if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64)) + avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + return avail; +} + +/** + * tot_avail_orphs - calculate total space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in half + * the total space. That leaves half the space for adding new orphans. + */ +static int tot_avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail; + + avail_lebs = c->orph_lebs; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + return avail / 2; +} + +/** + * do_write_orph_node - write a node + * @c: UBIFS file-system description object + * @len: length of node + * @atomic: write atomically + * + * This function writes a node to the orphan head from the orphan buffer. If + * %atomic is not zero, then the write is done atomically. On success, %0 is + * returned, otherwise a negative error code is returned. + */ +static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) +{ + int err = 0; + + if (atomic) { + ubifs_assert(c->ohead_offs == 0); + ubifs_prepare_node(c, c->orph_buf, len, 1); + len = ALIGN(len, c->min_io_size); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, + UBI_SHORTTERM); + } else { + if (c->ohead_offs == 0) { + /* Ensure LEB has been unmapped */ + err = ubifs_leb_unmap(c, c->ohead_lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, + c->ohead_offs, UBI_SHORTTERM); + } + return err; +} + +/** + * write_orph_node - write an orph node + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function builds an orph node from the cnext list and writes it to the + * orphan head. On success, %0 is returned, otherwise a negative error code + * is returned. + */ +static int write_orph_node(struct ubifs_info *c, int atomic) +{ + struct ubifs_orphan *orphan, *cnext; + struct ubifs_orph_node *orph; + int gap, err, len, cnt, i; + + ubifs_assert(c->cmt_orphans > 0); + gap = c->leb_size - c->ohead_offs; + if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) { + c->ohead_lnum += 1; + c->ohead_offs = 0; + gap = c->leb_size; + if (c->ohead_lnum > c->orph_last) { + /* + * We limit the number of orphans so that this should + * never happen. 
+ */ + ubifs_err("out of space in orphan area"); + return -EINVAL; + } + } + cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + if (cnt > c->cmt_orphans) + cnt = c->cmt_orphans; + len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64); + ubifs_assert(c->orph_buf); + orph = c->orph_buf; + orph->ch.node_type = UBIFS_ORPH_NODE; + spin_lock(&c->orphan_lock); + cnext = c->orph_cnext; + for (i = 0; i < cnt; i++) { + orphan = cnext; + orph->inos[i] = cpu_to_le64(orphan->inum); + cnext = orphan->cnext; + orphan->cnext = NULL; + } + c->orph_cnext = cnext; + c->cmt_orphans -= cnt; + spin_unlock(&c->orphan_lock); + if (c->cmt_orphans) + orph->cmt_no = cpu_to_le64(c->cmt_no + 1); + else + /* Mark the last node of the commit */ + orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); + ubifs_assert(c->ohead_offs + len <= c->leb_size); + ubifs_assert(c->ohead_lnum >= c->orph_first); + ubifs_assert(c->ohead_lnum <= c->orph_last); + err = do_write_orph_node(c, len, atomic); + c->ohead_offs += ALIGN(len, c->min_io_size); + c->ohead_offs = ALIGN(c->ohead_offs, 8); + return err; +} + +/** + * write_orph_nodes - write orph nodes until there are no more to commit + * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function writes orph nodes for all the orphans to commit. On success, + * %0 is returned, otherwise a negative error code is returned. + */ +static int write_orph_nodes(struct ubifs_info *c, int atomic) +{ + int err; + + while (c->cmt_orphans > 0) { + err = write_orph_node(c, atomic); + if (err) + return err; + } + if (atomic) { + int lnum; + + /* Unmap any unused LEBs after consolidation */ + lnum = c->ohead_lnum + 1; + for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + } + return 0; +} + +/** + * consolidate - consolidate the orphan area. + * @c: UBIFS file-system description object + * + * This function enables consolidation by putting all the orphans into the list + * to commit. The list is in the order that the orphans were added, and the + * LEBs are written atomically in order, so at no time can orphans be lost by + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int consolidate(struct ubifs_info *c) +{ + int tot_avail = tot_avail_orphs(c), err = 0; + + spin_lock(&c->orphan_lock); + dbg_cmt("there is space for %d orphans and there are %d", + tot_avail, c->tot_orphans); + if (c->tot_orphans - c->new_orphans <= tot_avail) { + struct ubifs_orphan *orphan, **last; + int cnt = 0; + + /* Change the cnext list to include all non-new orphans */ + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_list, list) { + if (orphan->new) + continue; + *last = orphan; + last = &orphan->cnext; + cnt += 1; + } + *last = orphan->cnext; + ubifs_assert(cnt == c->tot_orphans - c->new_orphans); + c->cmt_orphans = cnt; + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + } else { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err("out of space in orphan area"); + err = -EINVAL; + } + spin_unlock(&c->orphan_lock); + return err; +} + +/** + * commit_orphans - commit orphans. + * @c: UBIFS file-system description object + * + * This function commits orphans to flash. On success, %0 is returned, + * otherwise a negative error code is returned. 
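+ *
+ * If the space left at the orphan head is not enough for the orphans of this
+ * commit, the orphan area is consolidated first and then re-written
+ * atomically starting from the first orphan LEB.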
+ */ +static int commit_orphans(struct ubifs_info *c) +{ + int avail, atomic = 0, err; + + ubifs_assert(c->cmt_orphans > 0); + avail = avail_orphs(c); + if (avail < c->cmt_orphans) { + /* Not enough space to write new orphans, so consolidate */ + err = consolidate(c); + if (err) + return err; + atomic = 1; + } + err = write_orph_nodes(c, atomic); + return err; +} + +/** + * erase_deleted - erase the orphans marked for deletion. + * @c: UBIFS file-system description object + * + * During commit, the orphans being committed cannot be deleted, so they are + * marked for deletion and deleted by this function. Also, the recovery + * adds killed orphans to the deletion list, and therefore they are deleted + * here too. + */ +static void erase_deleted(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, *dnext; + + spin_lock(&c->orphan_lock); + dnext = c->orph_dnext; + while (dnext) { + orphan = dnext; + dnext = orphan->dnext; + ubifs_assert(!orphan->new); + rb_erase(&orphan->rb, &c->orph_tree); + list_del(&orphan->list); + c->tot_orphans -= 1; + dbg_gen("deleting orphan ino %lu", orphan->inum); + kfree(orphan); + } + c->orph_dnext = NULL; + spin_unlock(&c->orphan_lock); +} + +/** + * ubifs_orphan_end_commit - end commit of orphans. + * @c: UBIFS file-system description object + * + * End commit of orphans. + */ +int ubifs_orphan_end_commit(struct ubifs_info *c) +{ + int err; + + if (c->cmt_orphans != 0) { + err = commit_orphans(c); + if (err) + return err; + } + erase_deleted(c); + err = dbg_check_orphans(c); + return err; +} + +/** + * clear_orphans - erase all LEBs used for orphans. + * @c: UBIFS file-system description object + * + * If recovery is not required, then the orphans from the previous session + * are not needed. This function locates the LEBs used to record + * orphans, and un-maps them. + */ +static int clear_orphans(struct ubifs_info *c) +{ + int lnum, err; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + return 0; +} + +/** + * insert_dead_orphan - insert an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * This function is a helper to the 'do_kill_orphans()' function. The orphan + * must be kept until the next commit, so it is added to the rb-tree and the + * deletion list. + */ +static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + /* Already added - no problem */ + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + orphan->dnext = c->orph_dnext; + c->orph_dnext = orphan; + dbg_mnt("ino %lu, new %d, tot %d", + inum, c->new_orphans, c->tot_orphans); + return 0; +} + +/** + * do_kill_orphans - remove orphan inodes from the index. 
+ * @c: UBIFS file-system description object + * @sleb: scanned LEB + * @last_cmt_no: cmt_no of last orph node read is passed and returned here + * @outofdate: whether the LEB is out of date is returned here + * @last_flagged: whether the end orph node is encountered + * + * This function is a helper to the 'kill_orphans()' function. It goes through + * every orphan node in a LEB and for every inode number recorded, removes + * all keys for that inode from the TNC. + */ +static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + unsigned long long *last_cmt_no, int *outofdate, + int *last_flagged) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + unsigned long long cmt_no; + ino_t inum; + int i, n, err, first = 1; + + list_for_each_entry(snod, &sleb->nodes, list) { + if (snod->type != UBIFS_ORPH_NODE) { + ubifs_err("invalid node type %d in orphan area at " + "%d:%d", snod->type, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + + orph = snod->node; + + /* Check commit number */ + cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX; + /* + * The commit number on the master node may be less, because + * of a failed commit. If there are several failed commits in a + * row, the commit number written on orph nodes will continue to + * increase (because the commit number is adjusted here) even + * though the commit number on the master node stays the same + * because the master node has not been re-written. + */ + if (cmt_no > c->cmt_no) + c->cmt_no = cmt_no; + if (cmt_no < *last_cmt_no && *last_flagged) { + /* + * The last orph node had a higher commit number and was + * flagged as the last written for that commit number. + * That makes this orph node, out of date. + */ + if (!first) { + ubifs_err("out of order commit number %llu in " + "orphan node at %d:%d", + cmt_no, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + dbg_rcvry("out of date LEB %d", sleb->lnum); + *outofdate = 1; + return 0; + } + + if (first) + first = 0; + + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + dbg_rcvry("deleting orphaned inode %lu", inum); + err = ubifs_tnc_remove_ino(c, inum); + if (err) + return err; + err = insert_dead_orphan(c, inum); + if (err) + return err; + } + + *last_cmt_no = cmt_no; + if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) { + dbg_rcvry("last orph node for commit %llu at %d:%d", + cmt_no, sleb->lnum, snod->offs); + *last_flagged = 1; + } else + *last_flagged = 0; + } + + return 0; +} + +/** + * kill_orphans - remove all orphan inodes from the index. + * @c: UBIFS file-system description object + * + * If recovery is required, then orphan inodes recorded during the previous + * session (which ended with an unclean unmount) must be deleted from the index. + * This is done by updating the TNC, but since the index is not updated until + * the next commit, the LEBs where the orphan information is recorded are not + * erased until the next commit. + */ +static int kill_orphans(struct ubifs_info *c) +{ + unsigned long long last_cmt_no = 0; + int lnum, err = 0, outofdate = 0, last_flagged = 0; + + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) { + dbg_rcvry("no orphans"); + return 0; + } + /* + * Orph nodes always start at c->orph_first and are written to each + * successive LEB in turn. 
Generally unused LEBs will have been unmapped + * but may contain out of date orph nodes if the unmap didn't go + * through. In addition, the last orph node written for each commit is + * marked (top bit of orph->cmt_no is set to 1). It is possible that + * there are orph nodes from the next commit (i.e. the commit did not + * complete successfully). In that case, no orphans will have been lost + * due to the way that orphans are written, and any orphans added will + * be valid orphans anyway and so can be deleted. + */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + dbg_rcvry("LEB %d", lnum); + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) { + sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + } + err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate, + &last_flagged); + if (err || outofdate) { + ubifs_scan_destroy(sleb); + break; + } + if (sleb->endpt) { + c->ohead_lnum = lnum; + c->ohead_offs = sleb->endpt; + } + ubifs_scan_destroy(sleb); + } + return err; +} + +/** + * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them. + * @c: UBIFS file-system description object + * @unclean: indicates recovery from unclean unmount + * @read_only: indicates read only mount + * + * This function is called when mounting to erase orphans from the previous + * session. If UBIFS was not unmounted cleanly, then the inodes recorded as + * orphans are deleted. + */ +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) +{ + int err = 0; + + c->max_orphans = tot_avail_orphs(c); + + if (!read_only) { + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + } + + if (unclean) + err = kill_orphans(c); + else if (!read_only) + err = clear_orphans(c); + + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +struct check_orphan { + struct rb_node rb; + ino_t inum; +}; + +struct check_info { + unsigned long last_ino; + unsigned long tot_inos; + unsigned long missing; + unsigned long long leaf_cnt; + struct ubifs_ino_node *node; + struct rb_root root; +}; + +static int dbg_find_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + spin_unlock(&c->orphan_lock); + return 1; + } + } + spin_unlock(&c->orphan_lock); + return 0; +} + +static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &root->rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct check_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + kfree(orphan); + return 0; + } + } + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, root); + return 0; +} + +static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *o; + struct rb_node *p; + + p = root->rb_node; + while (p) { + o = rb_entry(p, struct check_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else + return 1; + } + return 0; +} + +static void dbg_free_check_tree(struct rb_root *root) 
+{ + struct rb_node *this = root->rb_node; + struct check_orphan *o; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + o = rb_entry(this, struct check_orphan, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &o->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(o); + } +} + +static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + struct check_info *ci = priv; + ino_t inum; + int err; + + inum = key_inum(c, &zbr->key); + if (inum != ci->last_ino) { + /* Lowest node type is the inode node, so it comes first */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + ubifs_err("found orphan node ino %lu, type %d", inum, + key_type(c, &zbr->key)); + ci->last_ino = inum; + ci->tot_inos += 1; + err = ubifs_tnc_read_node(c, zbr, ci->node); + if (err) { + ubifs_err("node read failed, error %d", err); + return err; + } + if (ci->node->nlink == 0) + /* Must be recorded as an orphan */ + if (!dbg_find_check_orphan(&ci->root, inum) && + !dbg_find_orphan(c, inum)) { + ubifs_err("missing orphan, ino %lu", inum); + ci->missing += 1; + } + } + ci->leaf_cnt += 1; + return 0; +} + +static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + ino_t inum; + int i, n, err; + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + if (snod->type != UBIFS_ORPH_NODE) + continue; + orph = snod->node; + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + err = dbg_ins_check_orphan(&ci->root, inum); + if (err) + return err; + } + } + return 0; +} + +static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) +{ + int lnum, err = 0; + + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) + return 0; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + + err = dbg_read_orphans(ci, sleb); + ubifs_scan_destroy(sleb); + if (err) + break; + } + + return err; +} + +static int dbg_check_orphans(struct ubifs_info *c) +{ + struct check_info ci; + int err; + + if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + return 0; + + ci.last_ino = 0; + ci.tot_inos = 0; + ci.missing = 0; + ci.leaf_cnt = 0; + ci.root = RB_ROOT; + ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ci.node) { + ubifs_err("out of memory"); + return -ENOMEM; + } + + err = dbg_scan_orphans(c, &ci); + if (err) + goto out; + + err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); + if (err) { + ubifs_err("cannot scan TNC, error %d", err); + goto out; + } + + if (ci.missing) { + ubifs_err("%lu missing orphan(s)", ci.missing); + err = -EINVAL; + goto out; + } + + dbg_cmt("last inode number is %lu", ci.last_ino); + dbg_cmt("total number of inodes is %lu", ci.tot_inos); + dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt); + +out: + dbg_free_check_tree(&ci.root); + kfree(ci.node); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c new file mode 100644 index 00000000000..77d26c141cf --- /dev/null +++ b/fs/ubifs/recovery.c @@ -0,0 +1,1519 @@ +/* + * This file is part of UBIFS. 
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements functions needed to recover from unclean un-mounts.
+ * When UBIFS is mounted, it checks a flag on the master node to determine if
+ * an un-mount was completed successfully. If not, the process of mounting
+ * incorporates additional checking and fixing of on-flash data structures.
+ * UBIFS always cleans away all remnants of an unclean un-mount, so that
+ * errors do not accumulate. However UBIFS defers recovery if it is mounted
+ * read-only, and the flash is not modified in that case.
+ */
+
+#include <linux/crc32.h>
+#include "ubifs.h"
+
+/**
+ * is_empty - determine whether a buffer is empty (contains all 0xff).
+ * @buf: buffer to check
+ * @len: length of buffer
+ *
+ * This function returns %1 if the buffer is empty (contains all 0xff),
+ * otherwise %0 is returned.
+ */
+static int is_empty(void *buf, int len)
+{
+	uint8_t *p = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (*p++ != 0xff)
+			return 0;
+	return 1;
+}
+
+/**
+ * get_master_node - get the last valid master node allowing for corruption.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @pbuf: buffer containing the LEB read, is returned here
+ * @mst: master node, if found, is returned here
+ * @cor: corruption, if found, is returned here
+ *
+ * This function allocates a buffer, reads the LEB into it, and finds and
+ * returns the last valid master node allowing for one area of corruption.
+ * The corrupt area, if there is one, must be consistent with the assumption
+ * that it is the result of an unclean unmount while the master node was being
+ * written. Under those circumstances, it is valid to use the previously written
+ * master node.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */ +static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, + struct ubifs_mst_node **mst, void **cor) +{ + const int sz = c->mst_node_alsz; + int err, offs, len; + void *sbuf, *buf; + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + if (err && err != -EBADMSG) + goto out_free; + + /* Find the first position that is definitely not a node */ + offs = 0; + buf = sbuf; + len = c->leb_size; + while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) { + struct ubifs_ch *ch = buf; + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + break; + offs += sz; + buf += sz; + len -= sz; + } + /* See if there was a valid master node before that */ + if (offs) { + int ret; + + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE && offs) { + /* Could have been corruption so check one place back */ + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE) + /* + * We accept only one area of corruption because + * we are assuming that it was caused while + * trying to write a master node. + */ + goto out_err; + } + if (ret == SCANNED_A_NODE) { + struct ubifs_ch *ch = buf; + + if (ch->node_type != UBIFS_MST_NODE) + goto out_err; + dbg_rcvry("found a master node at %d:%d", lnum, offs); + *mst = buf; + offs += sz; + buf += sz; + len -= sz; + } + } + /* Check for corruption */ + if (offs < c->leb_size) { + if (!is_empty(buf, min_t(int, len, sz))) { + *cor = buf; + dbg_rcvry("found corruption at %d:%d", lnum, offs); + } + offs += sz; + buf += sz; + len -= sz; + } + /* Check remaining empty space */ + if (offs < c->leb_size) + if (!is_empty(buf, len)) + goto out_err; + *pbuf = sbuf; + return 0; + +out_err: + err = -EINVAL; +out_free: + vfree(sbuf); + *mst = NULL; + *cor = NULL; + return err; +} + +/** + * write_rcvrd_mst_node - write recovered master node. + * @c: UBIFS file-system description object + * @mst: master node + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_rcvrd_mst_node(struct ubifs_info *c, + struct ubifs_mst_node *mst) +{ + int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; + uint32_t save_flags; + + dbg_rcvry("recovery"); + + save_flags = mst->flags; + mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY); + + ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); + err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + if (err) + goto out; + err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); + if (err) + goto out; +out: + mst->flags = save_flags; + return err; +} + +/** + * ubifs_recover_master_node - recover the master node. + * @c: UBIFS file-system description object + * + * This function recovers the master node from corruption that may occur due to + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_recover_master_node(struct ubifs_info *c) +{ + void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL; + struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst; + const int sz = c->mst_node_alsz; + int err, offs1, offs2; + + dbg_rcvry("recovery"); + + err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1); + if (err) + goto out_free; + + err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2); + if (err) + goto out_free; + + if (mst1) { + offs1 = (void *)mst1 - buf1; + if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) && + (offs1 == 0 && !cor1)) { + /* + * mst1 was written by recovery at offset 0 with no + * corruption. + */ + dbg_rcvry("recovery recovery"); + mst = mst1; + } else if (mst2) { + offs2 = (void *)mst2 - buf2; + if (offs1 == offs2) { + /* Same offset, so must be the same */ + if (memcmp((void *)mst1 + UBIFS_CH_SZ, + (void *)mst2 + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out_err; + mst = mst1; + } else if (offs2 + sz == offs1) { + /* 1st LEB was written, 2nd was not */ + if (cor1) + goto out_err; + mst = mst1; + } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + /* 1st LEB was unmapped and written, 2nd not */ + if (cor1) + goto out_err; + mst = mst1; + } else + goto out_err; + } else { + /* + * 2nd LEB was unmapped and about to be written, so + * there must be only one master node in the first LEB + * and no corruption. + */ + if (offs1 != 0 || cor1) + goto out_err; + mst = mst1; + } + } else { + if (!mst2) + goto out_err; + /* + * 1st LEB was unmapped and about to be written, so there must + * be no room left in 2nd LEB. + */ + offs2 = (void *)mst2 - buf2; + if (offs2 + sz + sz <= c->leb_size) + goto out_err; + mst = mst2; + } + + dbg_rcvry("recovered master node from LEB %d", + (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); + + memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); + + if ((c->vfs_sb->s_flags & MS_RDONLY)) { + /* Read-only mode. Keep a copy for switching to rw mode */ + c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); + if (!c->rcvrd_mst_node) { + err = -ENOMEM; + goto out_free; + } + memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + } else { + /* Write the recovered master node */ + c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; + err = write_rcvrd_mst_node(c, c->mst_node); + if (err) + goto out_free; + } + + vfree(buf2); + vfree(buf1); + + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to recover master node"); + if (mst1) { + dbg_err("dumping first master node"); + dbg_dump_node(c, mst1); + } + if (mst2) { + dbg_err("dumping second master node"); + dbg_dump_node(c, mst2); + } + vfree(buf2); + vfree(buf1); + return err; +} + +/** + * ubifs_write_rcvrd_mst_node - write the recovered master node. + * @c: UBIFS file-system description object + * + * This function writes the master node that was recovered during mounting in + * read-only mode and must now be written because we are remounting rw. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) +{ + int err; + + if (!c->rcvrd_mst_node) + return 0; + c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = write_rcvrd_mst_node(c, c->rcvrd_mst_node); + if (err) + return err; + kfree(c->rcvrd_mst_node); + c->rcvrd_mst_node = NULL; + return 0; +} + +/** + * is_last_write - determine if an offset was in the last write to a LEB. 
+ * @c: UBIFS file-system description object + * @buf: buffer to check + * @offs: offset to check + * + * This function returns %1 if @offs was in the last write to the LEB whose data + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next min_io_size boundary (or a + * bit less than the common header size if min_io_size is one). + */ +static int is_last_write(const struct ubifs_info *c, void *buf, int offs) +{ + int empty_offs; + int check_len; + uint8_t *p; + + if (c->min_io_size == 1) { + check_len = c->leb_size - offs; + p = buf + check_len; + for (; check_len > 0; check_len--) + if (*--p != 0xff) + break; + /* + * 'check_len' is the size of the corruption which cannot be + * more than the size of 1 node if it was caused by an unclean + * unmount. + */ + if (check_len > UBIFS_MAX_NODE_SZ) + return 0; + return 1; + } + + /* + * Round up to the next c->min_io_size boundary i.e. 'offs' is in the + * last wbuf written. After that should be empty space. + */ + empty_offs = ALIGN(offs + 1, c->min_io_size); + check_len = c->leb_size - empty_offs; + p = buf + empty_offs - offs; + + for (; check_len > 0; check_len--) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * clean_buf - clean the data from an LEB sitting in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to clean + * @lnum: LEB number to clean + * @offs: offset from which to clean + * @len: length of buffer + * + * This function pads up to the next min_io_size boundary (if there is one) and + * sets empty space to all 0xff. @buf, @offs and @len are updated to the next + * min_io_size boundary (if there is one). + */ +static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, + int *offs, int *len) +{ + int empty_offs, pad_len; + + lnum = lnum; + dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); + + if (c->min_io_size == 1) { + memset(*buf, 0xff, c->leb_size - *offs); + return; + } + + ubifs_assert(!(*offs & 7)); + empty_offs = ALIGN(*offs, c->min_io_size); + pad_len = empty_offs - *offs; + ubifs_pad(c, *buf, pad_len); + *offs += pad_len; + *buf += pad_len; + *len -= pad_len; + memset(*buf, 0xff, c->leb_size - empty_offs); +} + +/** + * no_more_nodes - determine if there are no more nodes in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @len: length of buffer + * @lnum: LEB number of the LEB from which @buf was read + * @offs: offset from which @buf was read + * + * This function scans @buf for more nodes and returns %0 is a node is found and + * %1 if no more nodes are found. + */ +static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, + int lnum, int offs) +{ + int skip, next_offs = 0; + + if (len > UBIFS_DATA_NODE_SZ) { + struct ubifs_ch *ch = buf; + int dlen = le32_to_cpu(ch->len); + + if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && + dlen <= UBIFS_MAX_DATA_NODE_SZ) + /* The corrupt node looks like a data node */ + next_offs = ALIGN(offs + dlen, 8); + } + + if (c->min_io_size == 1) + skip = 8; + else + skip = ALIGN(offs + 1, c->min_io_size) - offs; + + offs += skip; + buf += skip; + len -= skip; + while (len > 8) { + struct ubifs_ch *ch = buf; + uint32_t magic = le32_to_cpu(ch->magic); + int ret; + + if (magic == UBIFS_NODE_MAGIC) { + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret == SCANNED_A_NODE || ret > 0) { + /* + * There is a small chance this is just data in + * a data node, so check that possibility. e.g. 
+ * this is part of a file that itself contains + * a UBIFS image. + */ + if (next_offs && offs + le32_to_cpu(ch->len) <= + next_offs) + continue; + dbg_rcvry("unexpected node at %d:%d", lnum, + offs); + return 0; + } + } + offs += 8; + buf += 8; + len -= 8; + } + return 1; +} + +/** + * fix_unclean_leb - fix an unclean LEB. + * @c: UBIFS file-system description object + * @sleb: scanned LEB information + * @start: offset where scan started + */ +static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int start) +{ + int lnum = sleb->lnum, endpt = start; + + /* Get the end offset of the last node we are keeping */ + if (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + + snod = list_entry(sleb->nodes.prev, + struct ubifs_scan_node, list); + endpt = snod->offs + snod->len; + } + + if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { + /* Add to recovery list */ + struct ubifs_unclean_leb *ucleb; + + dbg_rcvry("need to fix LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS); + if (!ucleb) + return -ENOMEM; + ucleb->lnum = lnum; + ucleb->endpt = endpt; + list_add_tail(&ucleb->list, &c->unclean_leb_list); + } else { + /* Write the fixed LEB back to flash */ + int err; + + dbg_rcvry("fixing LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + if (endpt == 0) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } else { + int len = ALIGN(endpt, c->min_io_size); + + if (start) { + err = ubi_read(c->ubi, lnum, sleb->buf, 0, + start); + if (err) + return err; + } + /* Pad to min_io_size */ + if (len > endpt) { + int pad_len = len - ALIGN(endpt, 8); + + if (pad_len > 0) { + void *buf = sleb->buf + len - pad_len; + + ubifs_pad(c, buf, pad_len); + } + } + err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, + UBI_UNKNOWN); + if (err) + return err; + } + } + return 0; +} + +/** + * drop_incomplete_group - drop nodes from an incomplete group. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * + * This function returns %1 if nodes are dropped and %0 otherwise. + */ +static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) +{ + int dropped = 0; + + while (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + struct ubifs_ch *ch; + + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + ch = snod->node; + if (ch->group_type != UBIFS_IN_NODE_GROUP) + return dropped; + dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + dropped = 1; + } + return dropped; +} + +/** + * ubifs_recover_leb - scan and recover a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * @grouped: nodes may be grouped for recovery + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int grouped) +{ + int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; + int empty_chkd = 0, start = offs; + struct ubifs_scan_leb *sleb; + void *buf = sbuf + offs; + + dbg_rcvry("%d:%d", lnum, offs); + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + if (sleb->ecc) + need_clean = 1; + + while (len >= 8) { + int ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + /* + * Scan quietly until there is an error from which we cannot + * recover + */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) + break; + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + empty_chkd = 1; + break; + } + + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) + if (is_last_write(c, buf, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + empty_chkd = 1; + break; + } + + if (ret == SCANNED_A_CORRUPT_NODE) + if (no_more_nodes(c, buf, len, lnum, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + empty_chkd = 1; + break; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: + dbg_err("unknown"); + goto corrupted; + } + } + + if (!empty_chkd && !is_empty(buf, len)) { + if (is_last_write(c, buf, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } else { + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + } + + /* Drop nodes from incomplete group */ + if (grouped && drop_incomplete_group(sleb, &offs)) { + buf = sbuf + offs; + len = c->leb_size - offs; + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + + if (offs % c->min_io_size) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + + ubifs_end_scan(c, sleb, lnum, offs); + + if (need_clean) { + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; + } + + return sleb; + +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * get_cs_sqnum - get commit start sequence number. + * @c: UBIFS file-system description object + * @lnum: LEB number of commit start node + * @offs: offset of commit start node + * @cs_sqnum: commit start sequence number is returned here + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, + unsigned long long *cs_sqnum) +{ + struct ubifs_cs_node *cs_node = NULL; + int err, ret; + + dbg_rcvry("at %d:%d", lnum, offs); + cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL); + if (!cs_node) + return -ENOMEM; + if (c->leb_size - offs < UBIFS_CS_NODE_SZ) + goto out_err; + err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + if (err && err != -EBADMSG) + goto out_free; + ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); + if (ret != SCANNED_A_NODE) { + dbg_err("Not a valid node"); + goto out_err; + } + if (cs_node->ch.node_type != UBIFS_CS_NODE) { + dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + goto out_err; + } + if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { + dbg_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); + goto out_err; + } + *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); + dbg_rcvry("commit start sqnum %llu", *cs_sqnum); + kfree(cs_node); + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to get CS sqnum"); + kfree(cs_node); + return err; +} + +/** + * ubifs_recover_log_leb - scan and recover a log LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int next_lnum; + + dbg_rcvry("LEB %d", lnum); + next_lnum = lnum + 1; + if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs) + next_lnum = UBIFS_LOG_LNUM; + if (next_lnum != c->ltail_lnum) { + /* + * We can only recover at the end of the log, so check that the + * next log LEB is empty or out of date. + */ + sleb = ubifs_scan(c, next_lnum, 0, sbuf); + if (IS_ERR(sleb)) + return sleb; + if (sleb->nodes_cnt) { + struct ubifs_scan_node *snod; + unsigned long long cs_sqnum = c->cs_sqnum; + + snod = list_entry(sleb->nodes.next, + struct ubifs_scan_node, list); + if (cs_sqnum == 0) { + int err; + + err = get_cs_sqnum(c, lnum, offs, &cs_sqnum); + if (err) { + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + } + } + if (snod->sqnum > cs_sqnum) { + ubifs_err("unrecoverable log corruption " + "in LEB %d", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(-EUCLEAN); + } + } + ubifs_scan_destroy(sleb); + } + return ubifs_recover_leb(c, lnum, offs, sbuf, 0); +} + +/** + * recover_head - recover a head. + * @c: UBIFS file-system description object + * @lnum: LEB number of head to recover + * @offs: offset of head to recover + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at a head location. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int recover_head(const struct ubifs_info *c, int lnum, int offs, + void *sbuf) +{ + int len, err, need_clean = 0; + + if (c->min_io_size > 1) + len = c->min_io_size; + else + len = 512; + if (offs + len > c->leb_size) + len = c->leb_size - offs; + + if (!len) + return 0; + + /* Read at the head location and check it is empty flash */ + err = ubi_read(c->ubi, lnum, sbuf, offs, len); + if (err) + need_clean = 1; + else { + uint8_t *p = sbuf; + + while (len--) + if (*p++ != 0xff) { + need_clean = 1; + break; + } + } + + if (need_clean) { + dbg_rcvry("cleaning head at %d:%d", lnum, offs); + if (offs == 0) + return ubifs_leb_unmap(c, lnum); + err = ubi_read(c->ubi, lnum, sbuf, 0, offs); + if (err) + return err; + return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + } + + return 0; +} + +/** + * ubifs_recover_inl_heads - recover index and LPT heads. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at the index and + * LPT head locations. + * + * This deals with the recovery of a half-completed journal commit. UBIFS is + * careful never to overwrite the last version of the index or the LPT. Because + * the index and LPT are wandering trees, data from a half-completed commit will + * not be referenced anywhere in UBIFS. The data will be either in LEBs that are + * assumed to be empty and will be unmapped anyway before use, or in the index + * and LPT heads. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +{ + int err; + + ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); + + dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); + err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); + if (err) + return err; + + dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); + err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); + if (err) + return err; + + return 0; +} + +/** + * clean_an_unclean_leb - read and write a LEB to remove corruption. + * @c: UBIFS file-system description object + * @ucleb: unclean LEB information + * @sbuf: LEB-sized buffer to use + * + * This function reads a LEB up to a point pre-determined by the mount recovery, + * checks the nodes, and writes the result back to the flash, thereby cleaning + * off any following corruption, or non-fatal ECC errors. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int clean_an_unclean_leb(const struct ubifs_info *c, + struct ubifs_unclean_leb *ucleb, void *sbuf) +{ + int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; + void *buf = sbuf; + + dbg_rcvry("LEB %d len %d", lnum, len); + + if (len == 0) { + /* Nothing to read, just unmap it */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + return 0; + } + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) + return err; + + while (len >= 8) { + int ret; + + cond_resched(); + + /* Scan quietly until there is an error */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + ubifs_err("unexpected empty space at %d:%d", + lnum, offs); + return -EUCLEAN; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + ubifs_scanned_corruption(c, lnum, offs, buf); + return -EUCLEAN; + } + + /* Pad to min_io_size */ + len = ALIGN(ucleb->endpt, c->min_io_size); + if (len > ucleb->endpt) { + int pad_len = len - ALIGN(ucleb->endpt, 8); + + if (pad_len > 0) { + buf = c->sbuf + len - pad_len; + ubifs_pad(c, buf, pad_len); + } + } + + /* Write back the LEB atomically */ + err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + if (err) + return err; + + dbg_rcvry("cleaned LEB %d", lnum); + + return 0; +} + +/** + * ubifs_clean_lebs - clean LEBs recovered during read-only mount. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function cleans a LEB identified during recovery that needs to be + * written but was not because UBIFS was mounted read-only. This happens when + * remounting to read-write mode. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +{ + dbg_rcvry("recovery"); + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + int err; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + err = clean_an_unclean_leb(c, ucleb, sbuf); + if (err) + return err; + list_del(&ucleb->list); + kfree(ucleb); + } + return 0; +} + +/** + * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. + * @c: UBIFS file-system description object + * + * Out-of-place garbage collection requires always one empty LEB with which to + * start garbage collection. The LEB number is recorded in c->gc_lnum and is + * written to the master node on unmounting. In the case of an unclean unmount + * the value of gc_lnum recorded in the master node is out of date and cannot + * be used. Instead, recovery must allocate an empty LEB for this purpose. + * However, there may not be enough empty space, in which case it must be + * possible to GC the dirtiest LEB into the GC head LEB. + * + * This function also runs the commit which causes the TNC updates from + * size-recovery and orphans to be written to the flash. That is important to + * ensure correct replay order for subsequent mounts. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_rcvry_gc_commit(struct ubifs_info *c) +{ + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + struct ubifs_lprops lp; + int lnum, err; + + c->gc_lnum = -1; + if (wbuf->lnum == -1) { + dbg_rcvry("no GC head LEB"); + goto find_free; + } + /* + * See whether the used space in the dirtiest LEB fits in the GC head + * LEB. + */ + if (wbuf->offs == c->leb_size) { + dbg_rcvry("no room in GC head LEB"); + goto find_free; + } + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); + if (err) { + if (err == -ENOSPC) + dbg_err("could not find a dirty LEB"); + return err; + } + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + lnum = lp.lnum; + if (lp.free + lp.dirty == c->leb_size) { + /* An empty LEB was returned */ + if (lp.free != c->leb_size) { + err = ubifs_change_one_lp(c, lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + c->gc_lnum = lnum; + dbg_rcvry("allocated LEB %d for GC", lnum); + /* Run the commit */ + dbg_rcvry("committing"); + return ubifs_run_commit(c); + } + /* + * There was no empty LEB so the used space in the dirtiest LEB must fit + * in the GC head LEB. + */ + if (lp.free + lp.dirty < wbuf->offs) { + dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + goto find_free; + } + /* + * We run the commit before garbage collection otherwise subsequent + * mounts will see the GC and orphan deletion in a different order. + */ + dbg_rcvry("committing"); + err = ubifs_run_commit(c); + if (err) + return err; + /* + * The data in the dirtiest LEB fits in the GC head LEB, so do the GC + * - use locking to keep 'ubifs_assert()' happy. + */ + dbg_rcvry("GC'ing LEB %d", lnum); + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_garbage_collect_leb(c, &lp); + if (err >= 0) { + int err2 = ubifs_wbuf_sync_nolock(wbuf); + + if (err2) + err = err2; + } + mutex_unlock(&wbuf->io_mutex); + if (err < 0) { + dbg_err("GC failed, error %d", err); + if (err == -EAGAIN) + err = -EINVAL; + return err; + } + if (err != LEB_RETAINED) { + dbg_err("GC returned %d", err); + return -EINVAL; + } + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + dbg_rcvry("allocated LEB %d for GC", lnum); + return 0; + +find_free: + /* + * There is no GC head LEB or the free space in the GC head LEB is too + * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so + * GC is not run. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + dbg_err("could not find an empty LEB"); + return lnum; + } + /* And reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + c->gc_lnum = lnum; + dbg_rcvry("allocated LEB %d for GC", lnum); + /* Run the commit */ + dbg_rcvry("committing"); + return ubifs_run_commit(c); +} + +/** + * struct size_entry - inode size information for recovery. + * @rb: link in the RB-tree of sizes + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + * @inode: inode if pinned in memory awaiting rw mode to fix it + */ +struct size_entry { + struct rb_node rb; + ino_t inum; + loff_t i_size; + loff_t d_size; + int exists; + struct inode *inode; +}; + +/** + * add_ino - add an entry to the size tree. 
+ * @c: UBIFS file-system description object + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + */ +static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, + loff_t d_size, int exists) +{ + struct rb_node **p = &c->size_tree.rb_node, *parent = NULL; + struct size_entry *e; + + while (*p) { + parent = *p; + e = rb_entry(parent, struct size_entry, rb); + if (inum < e->inum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + e = kzalloc(sizeof(struct size_entry), GFP_KERNEL); + if (!e) + return -ENOMEM; + + e->inum = inum; + e->i_size = i_size; + e->d_size = d_size; + e->exists = exists; + + rb_link_node(&e->rb, parent, p); + rb_insert_color(&e->rb, &c->size_tree); + + return 0; +} + +/** + * find_ino - find an entry on the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum) +{ + struct rb_node *p = c->size_tree.rb_node; + struct size_entry *e; + + while (p) { + e = rb_entry(p, struct size_entry, rb); + if (inum < e->inum) + p = p->rb_left; + else if (inum > e->inum) + p = p->rb_right; + else + return e; + } + return NULL; +} + +/** + * remove_ino - remove an entry from the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + */ +static void remove_ino(struct ubifs_info *c, ino_t inum) +{ + struct size_entry *e = find_ino(c, inum); + + if (!e) + return; + rb_erase(&e->rb, &c->size_tree); + kfree(e); +} + +/** + * ubifs_destroy_size_tree - free resources related to the size tree. + * @c: UBIFS file-system description object + */ +void ubifs_destroy_size_tree(struct ubifs_info *c) +{ + struct rb_node *this = c->size_tree.rb_node; + struct size_entry *e; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + e = rb_entry(this, struct size_entry, rb); + if (e->inode) + iput(e->inode); + this = rb_parent(this); + if (this) { + if (this->rb_left == &e->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(e); + } + c->size_tree = RB_ROOT; +} + +/** + * ubifs_recover_size_accum - accumulate inode sizes for recovery. + * @c: UBIFS file-system description object + * @key: node key + * @deletion: node is for a deletion + * @new_size: inode size + * + * This function has two purposes: + * 1) to ensure there are no data nodes that fall outside the inode size + * 2) to ensure there are no data nodes for inodes that do not exist + * To accomplish those purposes, a rb-tree is constructed containing an entry + * for each inode number in the journal that has not been deleted, and recording + * the size from the inode node, the maximum size of any data node (also altered + * by truncations) and a flag indicating a inode number for which no inode node + * was present in the journal. + * + * Note that there is still the possibility that there are data nodes that have + * been committed that are beyond the inode size, however the only way to find + * them would be to scan the entire index. Alternatively, some provision could + * be made to record the size of inodes at the start of commit, which would seem + * very cumbersome for a scenario that is quite unlikely and the only negative + * consequence of which is wasted space. + * + * This functions returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size) +{ + ino_t inum = key_inum(c, key); + struct size_entry *e; + int err; + + switch (key_type(c, key)) { + case UBIFS_INO_KEY: + if (deletion) + remove_ino(c, inum); + else { + e = find_ino(c, inum); + if (e) { + e->i_size = new_size; + e->exists = 1; + } else { + err = add_ino(c, inum, new_size, 0, 1); + if (err) + return err; + } + } + break; + case UBIFS_DATA_KEY: + e = find_ino(c, inum); + if (e) { + if (new_size > e->d_size) + e->d_size = new_size; + } else { + err = add_ino(c, inum, 0, new_size, 0); + if (err) + return err; + } + break; + case UBIFS_TRUN_KEY: + e = find_ino(c, inum); + if (e) + e->d_size = new_size; + break; + } + return 0; +} + +/** + * fix_size_in_place - fix inode size in place on flash. + * @c: UBIFS file-system description object + * @e: inode size information for recovery + */ +static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) +{ + struct ubifs_ino_node *ino = c->sbuf; + unsigned char *p; + union ubifs_key key; + int err, lnum, offs, len; + loff_t i_size; + uint32_t crc; + + /* Locate the inode node LEB number and offset */ + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs); + if (err) + goto out; + /* + * If the size recorded on the inode node is greater than the size that + * was calculated from nodes in the journal then don't change the inode. + */ + i_size = le64_to_cpu(ino->size); + if (i_size >= e->d_size) + return 0; + /* Read the LEB */ + err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + if (err) + goto out; + /* Change the size field and recalculate the CRC */ + ino = c->sbuf + offs; + ino->size = cpu_to_le64(e->d_size); + len = le32_to_cpu(ino->ch.len); + crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8); + ino->ch.crc = cpu_to_le32(crc); + /* Work out where data in the LEB ends and free space begins */ + p = c->sbuf; + len = c->leb_size - 1; + while (p[len] == 0xff) + len -= 1; + len = ALIGN(len + 1, c->min_io_size); + /* Atomically write the fixed LEB back again */ + err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + if (err) + goto out; + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs, + i_size, e->d_size); + return 0; + +out: + ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d", + e->inum, e->i_size, e->d_size, err); + return err; +} + +/** + * ubifs_recover_size - recover inode size. + * @c: UBIFS file-system description object + * + * This function attempts to fix inode size discrepancies identified by the + * 'ubifs_recover_size_accum()' function. + * + * This functions returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_recover_size(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->size_tree); + + while (this) { + struct size_entry *e; + int err; + + e = rb_entry(this, struct size_entry, rb); + if (!e->exists) { + union ubifs_key key; + + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_lookup(c, &key, c->sbuf); + if (err && err != -ENOENT) + return err; + if (err == -ENOENT) { + /* Remove data nodes that have no inode */ + dbg_rcvry("removing ino %lu", e->inum); + err = ubifs_tnc_remove_ino(c, e->inum); + if (err) + return err; + } else { + struct ubifs_ino_node *ino = c->sbuf; + + e->exists = 1; + e->i_size = le64_to_cpu(ino->size); + } + } + if (e->exists && e->i_size < e->d_size) { + if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { + /* Fix the inode size and pin it in memory */ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, e->inum); + if (IS_ERR(inode)) + return PTR_ERR(inode); + if (inode->i_size < e->d_size) { + dbg_rcvry("ino %lu size %lld -> %lld", + e->inum, e->d_size, + inode->i_size); + inode->i_size = e->d_size; + ubifs_inode(inode)->ui_size = e->d_size; + e->inode = inode; + this = rb_next(this); + continue; + } + iput(inode); + } else { + /* Fix the size in place */ + err = fix_size_in_place(c, e); + if (err) + return err; + if (e->inode) + iput(e->inode); + } + } + this = rb_next(this); + rb_erase(&e->rb, &c->size_tree); + kfree(e); + } + return 0; +} diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c new file mode 100644 index 00000000000..7399692af85 --- /dev/null +++ b/fs/ubifs/replay.c @@ -0,0 +1,1075 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains journal replay code. It runs when the file-system is being + * mounted and requires no locking. + * + * The larger is the journal, the longer it takes to scan it, so the longer it + * takes to mount UBIFS. This is why the journal has limited size which may be + * changed depending on the system requirements. But a larger journal gives + * faster I/O speed because it writes the index less frequently. So this is a + * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the + * larger is the journal, the more memory its index may consume. + */ + +#include "ubifs.h" + +/* + * Replay flags. + * + * REPLAY_DELETION: node was deleted + * REPLAY_REF: node is a reference node + */ +enum { + REPLAY_DELETION = 1, + REPLAY_REF = 2, +}; + +/** + * struct replay_entry - replay tree entry. 
+ * @lnum: logical eraseblock number of the node
+ * @offs: node offset
+ * @len: node length
+ * @sqnum: node sequence number
+ * @flags: replay flags
+ * @rb: links the replay tree
+ * @key: node key
+ * @nm: directory entry name
+ * @old_size: truncation old size
+ * @new_size: truncation new size
+ * @free: amount of free space in a bud
+ * @dirty: amount of dirty space in a bud from padding and deletion nodes
+ *
+ * UBIFS journal replay must compare node sequence numbers, which means it must
+ * build a tree of node information to insert into the TNC.
+ */
+struct replay_entry {
+	int lnum;
+	int offs;
+	int len;
+	unsigned long long sqnum;
+	int flags;
+	struct rb_node rb;
+	union ubifs_key key;
+	union {
+		struct qstr nm;
+		struct {
+			loff_t old_size;
+			loff_t new_size;
+		};
+		struct {
+			int free;
+			int dirty;
+		};
+	};
+};
+
+/**
+ * struct bud_entry - entry in the list of buds to replay.
+ * @list: next bud in the list
+ * @bud: bud description object
+ * @free: free bytes in the bud
+ * @sqnum: reference node sequence number
+ */
+struct bud_entry {
+	struct list_head list;
+	struct ubifs_bud *bud;
+	int free;
+	unsigned long long sqnum;
+};
+
+/**
+ * set_bud_lprops - set free and dirty space used by a bud.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of bud
+ */
+static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+{
+	const struct ubifs_lprops *lp;
+	int err = 0, dirty;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	dirty = lp->dirty;
+	if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+		/*
+		 * The LEB was added to the journal with a starting offset of
+		 * zero which means the LEB must have been empty. The LEB
+		 * property values should be lp->free == c->leb_size and
+		 * lp->dirty == 0, but that is not the case. The reason is that
+		 * the LEB was garbage collected. The garbage collector resets
+		 * the free and dirty space without recording it anywhere except
+		 * lprops, so if there is not a commit then lprops does not have
+		 * that information next time the file system is mounted.
+		 *
+		 * We do not need to adjust free space because the scan has told
+		 * us the exact value which is recorded in the replay entry as
+		 * r->free.
+		 *
+		 * However we do need to subtract from the dirty space the
+		 * amount of space that the garbage collector reclaimed, which
+		 * is the whole LEB minus the amount of space that was free.
+		 */
+		dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+			lp->free, lp->dirty);
+		dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+			lp->free, lp->dirty);
+		dirty -= c->leb_size - lp->free;
+		/*
+		 * If the replay order was perfect the dirty space would now be
+		 * zero. The order is not perfect because the journal heads
+		 * race with each other. This is not a problem but it does mean
+		 * that the dirty space may temporarily exceed c->leb_size
+		 * during the replay.
+		 */
+		if (dirty != 0)
+			dbg_msg("LEB %d lp: %d free %d dirty "
+				"replay: %d free %d dirty", r->lnum, lp->free,
+				lp->dirty, r->free, r->dirty);
+	}
+	lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+			     lp->flags | LPROPS_TAKEN, 0);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * trun_remove_range - apply a replay entry for a truncation to the TNC.
+ * @c: UBIFS file-system description object + * @r: replay entry of truncation + */ +static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) +{ + unsigned min_blk, max_blk; + union ubifs_key min_key, max_key; + ino_t ino; + + min_blk = r->new_size / UBIFS_BLOCK_SIZE; + if (r->new_size & (UBIFS_BLOCK_SIZE - 1)) + min_blk += 1; + + max_blk = r->old_size / UBIFS_BLOCK_SIZE; + if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0) + max_blk -= 1; + + ino = key_inum(c, &r->key); + + data_key_init(c, &min_key, ino, min_blk); + data_key_init(c, &max_key, ino, max_blk); + + return ubifs_tnc_remove_range(c, &min_key, &max_key); +} + +/** + * apply_replay_entry - apply a replay entry to the TNC. + * @c: UBIFS file-system description object + * @r: replay entry to apply + * + * Apply a replay entry to the TNC. + */ +static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) +{ + int err, deletion = ((r->flags & REPLAY_DELETION) != 0); + + dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, + r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); + + /* Set c->replay_sqnum to help deal with dangling branches. */ + c->replay_sqnum = r->sqnum; + + if (r->flags & REPLAY_REF) + err = set_bud_lprops(c, r); + else if (is_hash_key(c, &r->key)) { + if (deletion) + err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); + else + err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, + r->len, &r->nm); + } else { + if (deletion) + switch (key_type(c, &r->key)) { + case UBIFS_INO_KEY: + { + ino_t inum = key_inum(c, &r->key); + + err = ubifs_tnc_remove_ino(c, inum); + break; + } + case UBIFS_TRUN_KEY: + err = trun_remove_range(c, r); + break; + default: + err = ubifs_tnc_remove(c, &r->key); + break; + } + else + err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs, + r->len); + if (err) + return err; + + if (c->need_recovery) + err = ubifs_recover_size_accum(c, &r->key, deletion, + r->new_size); + } + + return err; +} + +/** + * destroy_replay_tree - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay tree. + */ +static void destroy_replay_tree(struct ubifs_info *c) +{ + struct rb_node *this = c->replay_tree.rb_node; + struct replay_entry *r; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + r = rb_entry(this, struct replay_entry, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &r->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + kfree(r); + } + c->replay_tree = RB_ROOT; +} + +/** + * apply_replay_tree - apply the replay tree to the TNC. + * @c: UBIFS file-system description object + * + * Apply the replay tree. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int apply_replay_tree(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->replay_tree); + + while (this) { + struct replay_entry *r; + int err; + + cond_resched(); + + r = rb_entry(this, struct replay_entry, rb); + err = apply_replay_entry(c, r); + if (err) + return err; + this = rb_next(this); + } + return 0; +} + +/** + * insert_node - insert a node to the replay tree. 
+ * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * @old_size: truncation old size + * @new_size: truncation new size + * + * This function inserts a scanned non-direntry node to the replay tree. The + * replay tree is an RB-tree containing @struct replay_entry elements which are + * indexed by the sequence number. The replay tree is applied at the very end + * of the replay process. Since the tree is sorted in sequence number order, + * the older modifications are applied first. This function returns zero in + * case of success and a negative error code in case of failure. + */ +static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, unsigned long long sqnum, + int deletion, int *used, loff_t old_size, + loff_t new_size) +{ + struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; + struct replay_entry *r; + + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + while (*p) { + parent = *p; + r = rb_entry(parent, struct replay_entry, rb); + if (sqnum < r->sqnum) { + p = &(*p)->rb_left; + continue; + } else if (sqnum > r->sqnum) { + p = &(*p)->rb_right; + continue; + } + ubifs_err("duplicate sqnum in replay"); + return -EINVAL; + } + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->sqnum = sqnum; + r->flags = (deletion ? REPLAY_DELETION : 0); + r->old_size = old_size; + r->new_size = new_size; + key_copy(c, key, &r->key); + + rb_link_node(&r->rb, parent, p); + rb_insert_color(&r->rb, &c->replay_tree); + return 0; +} + +/** + * insert_dent - insert a directory entry node into the replay tree. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @len: node length + * @key: node key + * @name: directory entry name + * @nlen: directory entry name length + * @sqnum: sequence number + * @deletion: non-zero if this is a deletion + * @used: number of bytes in use in a LEB + * + * This function inserts a scanned directory entry node to the replay tree. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * This function is also used for extended attribute entries because they are + * implemented as directory entry nodes. 
+ */ +static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, + union ubifs_key *key, const char *name, int nlen, + unsigned long long sqnum, int deletion, int *used) +{ + struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; + struct replay_entry *r; + char *nbuf; + + if (key_inum(c, key) >= c->highest_inum) + c->highest_inum = key_inum(c, key); + + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + while (*p) { + parent = *p; + r = rb_entry(parent, struct replay_entry, rb); + if (sqnum < r->sqnum) { + p = &(*p)->rb_left; + continue; + } + if (sqnum > r->sqnum) { + p = &(*p)->rb_right; + continue; + } + ubifs_err("duplicate sqnum in replay"); + return -EINVAL; + } + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + nbuf = kmalloc(nlen + 1, GFP_KERNEL); + if (!nbuf) { + kfree(r); + return -ENOMEM; + } + + if (!deletion) + *used += ALIGN(len, 8); + r->lnum = lnum; + r->offs = offs; + r->len = len; + r->sqnum = sqnum; + r->nm.len = nlen; + memcpy(nbuf, name, nlen); + nbuf[nlen] = '\0'; + r->nm.name = nbuf; + r->flags = (deletion ? REPLAY_DELETION : 0); + key_copy(c, key, &r->key); + + ubifs_assert(!*p); + rb_link_node(&r->rb, parent, p); + rb_insert_color(&r->rb, &c->replay_tree); + return 0; +} + +/** + * ubifs_validate_entry - validate directory or extended attribute entry node. + * @c: UBIFS file-system description object + * @dent: the node to validate + * + * This function validates directory or extended attribute entry node @dent. + * Returns zero if the node is all right and a %-EINVAL if not. + */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent) +{ + int key_type = key_type_flash(c, dent->key); + int nlen = le16_to_cpu(dent->nlen); + + if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || + dent->type >= UBIFS_ITYPES_CNT || + nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || + strnlen(dent->name, nlen) != nlen || + le64_to_cpu(dent->inum) > MAX_INUM) { + ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ? + "directory entry" : "extended attribute entry"); + return -EINVAL; + } + + if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { + ubifs_err("bad key type %d", key_type); + return -EINVAL; + } + + return 0; +} + +/** + * replay_bud - replay a bud logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @free: amount of free space in the bud is returned here + * @dirty: amount of dirty space from padding and deletion nodes is returned + * here + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + int *free, int *dirty) +{ + int err = 0, used = 0; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_bud *bud; + + dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); + if (c->need_recovery) + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); + else + sleb = ubifs_scan(c, lnum, offs, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + /* + * The bud does not have to start from offset zero - the beginning of + * the 'lnum' LEB may contain previously committed data. One of the + * things we have to do in replay is to correctly update lprops with + * newer information about this LEB. 
+ * + * At this point lprops thinks that this LEB has 'c->leb_size - offs' + * bytes of free space because it only contain information about + * committed data. + * + * But we know that real amount of free space is 'c->leb_size - + * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and + * 'sleb->endpt' is used by bud data. We have to correctly calculate + * how much of these data are dirty and update lprops with this + * information. + * + * The dirt in that LEB region is comprised of padding nodes, deletion + * nodes, truncation nodes and nodes which are obsoleted by subsequent + * nodes in this LEB. So instead of calculating clean space, we + * calculate used space ('used' variable). + */ + + list_for_each_entry(snod, &sleb->nodes, list) { + int deletion = 0; + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_INO_NODE: + { + struct ubifs_ino_node *ino = snod->node; + loff_t new_size = le64_to_cpu(ino->size); + + if (le32_to_cpu(ino->nlink) == 0) + deletion = 1; + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DATA_NODE: + { + struct ubifs_data_node *dn = snod->node; + loff_t new_size = le32_to_cpu(dn->size) + + key_block(c, &snod->key) * + UBIFS_BLOCK_SIZE; + + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + struct ubifs_dent_node *dent = snod->node; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + err = insert_dent(c, lnum, snod->offs, snod->len, + &snod->key, dent->name, + le16_to_cpu(dent->nlen), snod->sqnum, + !le64_to_cpu(dent->inum), &used); + break; + } + case UBIFS_TRUN_NODE: + { + struct ubifs_trun_node *trun = snod->node; + loff_t old_size = le64_to_cpu(trun->old_size); + loff_t new_size = le64_to_cpu(trun->new_size); + union ubifs_key key; + + /* Validate truncation node */ + if (old_size < 0 || old_size > c->max_inode_sz || + new_size < 0 || new_size > c->max_inode_sz || + old_size <= new_size) { + ubifs_err("bad truncation node"); + goto out_dump; + } + + /* + * Create a fake truncation key just to use the same + * functions which expect nodes to have keys. + */ + trun_key_init(c, &key, le32_to_cpu(trun->inum)); + err = insert_node(c, lnum, snod->offs, snod->len, + &key, snod->sqnum, 1, &used, + old_size, new_size); + break; + } + default: + ubifs_err("unexpected node type %d in bud LEB %d:%d", + snod->type, lnum, snod->offs); + err = -EINVAL; + goto out_dump; + } + if (err) + goto out; + } + + bud = ubifs_search_bud(c, lnum); + if (!bud) + BUG(); + + ubifs_assert(sleb->endpt - offs >= used); + ubifs_assert(sleb->endpt % c->min_io_size == 0); + + if (sleb->endpt + c->min_io_size <= c->leb_size && + !(c->vfs_sb->s_flags & MS_RDONLY)) + err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, + sleb->endpt, UBI_SHORTTERM); + + *dirty = sleb->endpt - offs - used; + *free = c->leb_size - sleb->endpt; + +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * insert_ref_node - insert a reference node to the replay tree. 
+ * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @sqnum: sequence number + * @free: amount of free space in bud + * @dirty: amount of dirty space from padding and deletion nodes + * + * This function inserts a reference node to the replay tree and returns zero + * in case of success ort a negative error code in case of failure. + */ +static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, + unsigned long long sqnum, int free, int dirty) +{ + struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; + struct replay_entry *r; + + dbg_mnt("add ref LEB %d:%d", lnum, offs); + while (*p) { + parent = *p; + r = rb_entry(parent, struct replay_entry, rb); + if (sqnum < r->sqnum) { + p = &(*p)->rb_left; + continue; + } else if (sqnum > r->sqnum) { + p = &(*p)->rb_right; + continue; + } + ubifs_err("duplicate sqnum in replay tree"); + return -EINVAL; + } + + r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->lnum = lnum; + r->offs = offs; + r->sqnum = sqnum; + r->flags = REPLAY_REF; + r->free = free; + r->dirty = dirty; + + rb_link_node(&r->rb, parent, p); + rb_insert_color(&r->rb, &c->replay_tree); + return 0; +} + +/** + * replay_buds - replay all buds. + * @c: UBIFS file-system description object + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_buds(struct ubifs_info *c) +{ + struct bud_entry *b; + int err, uninitialized_var(free), uninitialized_var(dirty); + + list_for_each_entry(b, &c->replay_buds, list) { + err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, + &free, &dirty); + if (err) + return err; + err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, + free, dirty); + if (err) + return err; + } + + return 0; +} + +/** + * destroy_bud_list - destroy the list of buds to replay. + * @c: UBIFS file-system description object + */ +static void destroy_bud_list(struct ubifs_info *c) +{ + struct bud_entry *b; + + while (!list_empty(&c->replay_buds)) { + b = list_entry(c->replay_buds.next, struct bud_entry, list); + list_del(&b->list); + kfree(b); + } +} + +/** + * add_replay_bud - add a bud to the list of buds to replay. + * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @sqnum: reference node sequence number + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + unsigned long long sqnum) +{ + struct ubifs_bud *bud; + struct bud_entry *b; + + dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL); + if (!bud) + return -ENOMEM; + + b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); + if (!b) { + kfree(bud); + return -ENOMEM; + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + ubifs_add_bud(c, bud); + + b->bud = bud; + b->sqnum = sqnum; + list_add_tail(&b->list, &c->replay_buds); + + return 0; +} + +/** + * validate_ref - validate a reference node. + * @c: UBIFS file-system description object + * @ref: the reference node to validate + * @ref_lnum: LEB number of the reference node + * @ref_offs: reference node offset + * + * This function returns %1 if a bud reference already exists for the LEB. 
%0 is
+ * returned if the reference node is new, otherwise %-EINVAL is returned if
+ * validation failed.
+ */
+static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
+{
+ struct ubifs_bud *bud;
+ int lnum = le32_to_cpu(ref->lnum);
+ unsigned int offs = le32_to_cpu(ref->offs);
+ unsigned int jhead = le32_to_cpu(ref->jhead);
+
+ /*
+ * ref->offs may point to the end of LEB when the journal head points
+ * to the end of LEB and we write reference node for it during commit.
+ * That is why the check below rejects 'offs > c->leb_size' rather than
+ * 'offs >= c->leb_size'.
+ */
+ if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
+ lnum < c->main_first || offs > c->leb_size ||
+ offs & (c->min_io_size - 1))
+ return -EINVAL;
+
+ /* Make sure we have not already looked at this bud */
+ bud = ubifs_search_bud(c, lnum);
+ if (bud) {
+ if (bud->jhead == jhead && bud->start <= offs)
+ return 1;
+ ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * replay_log_leb - replay a log logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: log logical eraseblock to replay
+ * @offs: offset to start replaying from
+ * @sbuf: scan buffer
+ *
+ * This function replays a log LEB and returns zero in case of success, %1 if
+ * this is the last LEB in the log, and a negative error code in case of
+ * failure.
+ */
+static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
+{
+ int err;
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ const struct ubifs_cs_node *node;
+
+ dbg_mnt("replay log LEB %d:%d", lnum, offs);
+ sleb = ubifs_scan(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb)) {
+ if (c->need_recovery)
+ sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ }
+
+ if (sleb->nodes_cnt == 0) {
+ err = 1;
+ goto out;
+ }
+
+ node = sleb->buf;
+
+ snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+ if (c->cs_sqnum == 0) {
+ /*
+ * This is the first log LEB we are looking at, make sure that
+ * the first node is a commit start node. Also record its
+ * sequence number so that UBIFS can determine where the log
+ * ends, because all nodes which were added to the log after it
+ * have higher sequence numbers.
+ */
+ if (snod->type != UBIFS_CS_NODE) {
+ dbg_err("first log node at LEB %d:%d is not CS node",
+ lnum, offs);
+ goto out_dump;
+ }
+ if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
+ dbg_err("first CS node at LEB %d:%d has wrong "
+ "commit number %llu expected %llu",
+ lnum, offs,
+ (unsigned long long)le64_to_cpu(node->cmt_no),
+ c->cmt_no);
+ goto out_dump;
+ }
+
+ c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
+ dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
+ }
+
+ if (snod->sqnum < c->cs_sqnum) {
+ /*
+ * This means that we have reached the end of the log and are now
+ * looking at older log data, which was already committed but the
+ * eraseblock was not erased (UBIFS only unmaps it). So we
+ * basically have to exit with the "end of log" code.
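+ *
+ * Returning %1 here propagates up to 'ubifs_replay_journal()', which
+ * treats it as "end of log" and stops walking further log LEBs.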
+ */ + err = 1; + goto out; + } + + /* Make sure the first node sits at offset zero of the LEB */ + if (snod->offs != 0) { + dbg_err("first node is not at zero offset"); + goto out_dump; + } + + list_for_each_entry(snod, &sleb->nodes, list) { + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum < c->cs_sqnum) { + dbg_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_REF_NODE: { + const struct ubifs_ref_node *ref = snod->node; + + err = validate_ref(c, ref); + if (err == 1) + break; /* Already have this bud */ + if (err) + goto out_dump; + + err = add_replay_bud(c, le32_to_cpu(ref->lnum), + le32_to_cpu(ref->offs), + le32_to_cpu(ref->jhead), + snod->sqnum); + if (err) + goto out; + + break; + } + case UBIFS_CS_NODE: + /* Make sure it sits at the beginning of LEB */ + if (snod->offs != 0) { + ubifs_err("unexpected node in log"); + goto out_dump; + } + break; + default: + ubifs_err("unexpected node in log"); + goto out_dump; + } + } + + if (sleb->endpt || c->lhead_offs >= c->leb_size) { + c->lhead_lnum = lnum; + c->lhead_offs = sleb->endpt; + } + + err = !sleb->endpt; +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("log error detected while replying the log at LEB %d:%d", + lnum, offs + snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * take_ihead - update the status of the index head in lprops to 'taken'. + * @c: UBIFS file-system description object + * + * This function returns the amount of free space in the index head LEB or a + * negative error code. + */ +static int take_ihead(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, free; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + free = lp->free; + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + err = free; +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_replay_journal - replay journal. + * @c: UBIFS file-system description object + * + * This function scans the journal, replays and cleans it up. It makes sure all + * memory data structures related to uncommitted journal are built (dirty TNC + * tree, tree of buds, modified lprops, etc). + */ +int ubifs_replay_journal(struct ubifs_info *c) +{ + int err, i, lnum, offs, free; + void *sbuf = NULL; + + BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); + + /* Update the status of the index head in lprops to 'taken' */ + free = take_ihead(c); + if (free < 0) + return free; /* Error code */ + + if (c->ihead_offs != c->leb_size - free) { + ubifs_err("bad index head LEB %d:%d", c->ihead_lnum, + c->ihead_offs); + return -EINVAL; + } + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + dbg_mnt("start replaying the journal"); + + c->replaying = 1; + + lnum = c->ltail_lnum = c->lhead_lnum; + offs = c->lhead_offs; + + for (i = 0; i < c->log_lebs; i++, lnum++) { + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) { + /* + * The log is logically circular, we reached the last + * LEB, switch to the first one. 
+ */ + lnum = UBIFS_LOG_LNUM; + offs = 0; + } + err = replay_log_leb(c, lnum, offs, sbuf); + if (err == 1) + /* We hit the end of the log */ + break; + if (err) + goto out; + offs = 0; + } + + err = replay_buds(c); + if (err) + goto out; + + err = apply_replay_tree(c); + if (err) + goto out; + + ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); + dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " + "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, + c->highest_inum); +out: + destroy_replay_tree(c); + destroy_bud_list(c); + vfree(sbuf); + c->replaying = 0; + return err; +} diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c new file mode 100644 index 00000000000..2bf753b3888 --- /dev/null +++ b/fs/ubifs/sb.c @@ -0,0 +1,629 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS superblock. The superblock is stored at the first + * LEB of the volume and is never changed by UBIFS. Only user-space tools may + * change it. The superblock node mostly contains geometry information. + */ + +#include "ubifs.h" +#include + +/* + * Default journal size in logical eraseblocks as a percent of total + * flash size. + */ +#define DEFAULT_JNL_PERCENT 5 + +/* Default maximum journal size in bytes */ +#define DEFAULT_MAX_JNL (32*1024*1024) + +/* Default indexing tree fanout */ +#define DEFAULT_FANOUT 8 + +/* Default number of data journal heads */ +#define DEFAULT_JHEADS_CNT 1 + +/* Default positions of different LEBs in the main area */ +#define DEFAULT_IDX_LEB 0 +#define DEFAULT_DATA_LEB 1 +#define DEFAULT_GC_LEB 2 + +/* Default number of LEB numbers in LPT's save table */ +#define DEFAULT_LSAVE_CNT 256 + +/* Default reserved pool size as a percent of maximum free space */ +#define DEFAULT_RP_PERCENT 5 + +/* The default maximum size of reserved pool in bytes */ +#define DEFAULT_MAX_RP_SIZE (5*1024*1024) + +/* Default time granularity in nanoseconds */ +#define DEFAULT_TIME_GRAN 1000000000 + +/** + * create_default_filesystem - format empty UBI volume. + * @c: UBIFS file-system description object + * + * This function creates default empty file-system. Returns zero in case of + * success and a negative error code in case of failure. 
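+ *
+ * The default file-system consists of the superblock node, two copies of
+ * the master node, a default LPT, an empty root index node, the root
+ * directory inode and a commit start node in the log, which is everything
+ * an empty UBIFS volume needs in order to be mountable.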
+ */ +static int create_default_filesystem(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + struct ubifs_mst_node *mst; + struct ubifs_idx_node *idx; + struct ubifs_branch *br; + struct ubifs_ino_node *ino; + struct ubifs_cs_node *cs; + union ubifs_key key; + int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; + int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; + int min_leb_cnt = UBIFS_MIN_LEB_CNT; + uint64_t tmp64, main_bytes; + + /* Some functions called from here depend on the @c->key_len filed */ + c->key_len = UBIFS_SK_LEN; + + /* + * First of all, we have to calculate default file-system geometry - + * log size, journal size, etc. + */ + if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT) + /* We can first multiply then divide and have no overflow */ + jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100; + else + jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT; + + if (jnl_lebs < UBIFS_MIN_JNL_LEBS) + jnl_lebs = UBIFS_MIN_JNL_LEBS; + if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL) + jnl_lebs = DEFAULT_MAX_JNL / c->leb_size; + + /* + * The log should be large enough to fit reference nodes for all bud + * LEBs. Because buds do not have to start from the beginning of LEBs + * (half of the LEB may contain committed data), the log should + * generally be larger, make it twice as large. + */ + tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1; + log_lebs = tmp / c->leb_size; + /* Plus one LEB reserved for commit */ + log_lebs += 1; + if (c->leb_cnt - min_leb_cnt > 8) { + /* And some extra space to allow writes while committing */ + log_lebs += 1; + min_leb_cnt += 1; + } + + max_buds = jnl_lebs - log_lebs; + if (max_buds < UBIFS_MIN_BUD_LEBS) + max_buds = UBIFS_MIN_BUD_LEBS; + + /* + * Orphan nodes are stored in a separate area. One node can store a lot + * of orphan inode numbers, but when new orphan comes we just add a new + * orphan node. At some point the nodes are consolidated into one + * orphan node. + */ + orph_lebs = UBIFS_MIN_ORPH_LEBS; +#ifdef CONFIG_UBIFS_FS_DEBUG + if (c->leb_cnt - min_leb_cnt > 1) + /* + * For debugging purposes it is better to have at least 2 + * orphan LEBs, because the orphan subsystem would need to do + * consolidations and would be stressed more. 
+ */ + orph_lebs += 1; +#endif + + main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; + main_lebs -= orph_lebs; + + lpt_first = UBIFS_LOG_LNUM + log_lebs; + c->lsave_cnt = DEFAULT_LSAVE_CNT; + c->max_leb_cnt = c->leb_cnt; + err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, + &big_lpt); + if (err) + return err; + + dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first, + lpt_first + lpt_lebs - 1); + + main_first = c->leb_cnt - main_lebs; + + /* Create default superblock */ + tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + sup = kzalloc(tmp, GFP_KERNEL); + if (!sup) + return -ENOMEM; + + tmp64 = (uint64_t)max_buds * c->leb_size; + if (big_lpt) + sup_flags |= UBIFS_FLG_BIGLPT; + + sup->ch.node_type = UBIFS_SB_NODE; + sup->key_hash = UBIFS_KEY_HASH_R5; + sup->flags = cpu_to_le32(sup_flags); + sup->min_io_size = cpu_to_le32(c->min_io_size); + sup->leb_size = cpu_to_le32(c->leb_size); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt); + sup->max_bud_bytes = cpu_to_le64(tmp64); + sup->log_lebs = cpu_to_le32(log_lebs); + sup->lpt_lebs = cpu_to_le32(lpt_lebs); + sup->orph_lebs = cpu_to_le32(orph_lebs); + sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT); + sup->fanout = cpu_to_le32(DEFAULT_FANOUT); + sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); + sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + + generate_random_uuid(sup->uuid); + + main_bytes = (uint64_t)main_lebs * c->leb_size; + tmp64 = main_bytes * DEFAULT_RP_PERCENT; + do_div(tmp64, 100); + if (tmp64 > DEFAULT_MAX_RP_SIZE) + tmp64 = DEFAULT_MAX_RP_SIZE; + sup->rp_size = cpu_to_le64(tmp64); + + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + kfree(sup); + if (err) + return err; + + dbg_gen("default superblock created at LEB 0:0"); + + /* Create default master node */ + mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!mst) + return -ENOMEM; + + mst->ch.node_type = UBIFS_MST_NODE; + mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); + mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO); + mst->cmt_no = 0; + mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->root_offs = 0; + tmp = ubifs_idx_node_sz(c, 1); + mst->root_len = cpu_to_le32(tmp); + mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB); + mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size)); + mst->index_size = cpu_to_le64(ALIGN(tmp, 8)); + mst->lpt_lnum = cpu_to_le32(c->lpt_lnum); + mst->lpt_offs = cpu_to_le32(c->lpt_offs); + mst->nhead_lnum = cpu_to_le32(c->nhead_lnum); + mst->nhead_offs = cpu_to_le32(c->nhead_offs); + mst->ltab_lnum = cpu_to_le32(c->ltab_lnum); + mst->ltab_offs = cpu_to_le32(c->ltab_offs); + mst->lsave_lnum = cpu_to_le32(c->lsave_lnum); + mst->lsave_offs = cpu_to_le32(c->lsave_offs); + mst->lscan_lnum = cpu_to_le32(main_first); + mst->empty_lebs = cpu_to_le32(main_lebs - 2); + mst->idx_lebs = cpu_to_le32(1); + mst->leb_cnt = cpu_to_le32(c->leb_cnt); + + /* Calculate lprops statistics */ + tmp64 = main_bytes; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + mst->total_free = cpu_to_le64(tmp64); + + tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) - + UBIFS_INO_NODE_SZ; + tmp64 += ino_waste; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8); + mst->total_dirty 
= cpu_to_le64(tmp64); + + /* The indexing LEB does not contribute to dark space */ + tmp64 = (c->main_lebs - 1) * c->dark_wm; + mst->total_dark = cpu_to_le64(tmp64); + + mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); + + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, + UBI_UNKNOWN); + if (err) { + kfree(mst); + return err; + } + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, + UBI_UNKNOWN); + kfree(mst); + if (err) + return err; + + dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); + + /* Create the root indexing node */ + tmp = ubifs_idx_node_sz(c, 1); + idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + if (!idx) + return -ENOMEM; + + c->key_fmt = UBIFS_SIMPLE_KEY_FMT; + c->key_hash = key_r5_hash; + + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(1); + ino_key_init(c, &key, UBIFS_ROOT_INO); + br = ubifs_idx_branch(c, idx, 0); + key_write_idx(c, &key, &br->key); + br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); + br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, + UBI_UNKNOWN); + kfree(idx); + if (err) + return err; + + dbg_gen("default root indexing node created LEB %d:0", + main_first + DEFAULT_IDX_LEB); + + /* Create default root inode */ + tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + ino = kzalloc(tmp, GFP_KERNEL); + if (!ino) + return -ENOMEM; + + ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); + ino->ch.node_type = UBIFS_INO_NODE; + ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); + ino->nlink = cpu_to_le32(2); + tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + ino->atime_sec = tmp; + ino->ctime_sec = tmp; + ino->mtime_sec = tmp; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; + ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); + + /* Set compression enabled by default */ + ino->flags = cpu_to_le32(UBIFS_COMPR_FL); + + err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, + main_first + DEFAULT_DATA_LEB, 0, + UBI_UNKNOWN); + kfree(ino); + if (err) + return err; + + dbg_gen("root inode created at LEB %d:0", + main_first + DEFAULT_DATA_LEB); + + /* + * The first node in the log has to be the commit start node. This is + * always the case during normal file-system operation. Write a fake + * commit start node to the log. + */ + tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); + cs = kzalloc(tmp, GFP_KERNEL); + if (!cs) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, + 0, UBI_UNKNOWN); + kfree(cs); + + ubifs_msg("default file-system created"); + return 0; +} + +/** + * validate_sb - validate superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node + * + * This function validates superblock node @sup. Since most of data was read + * from the superblock and stored in @c, the function validates fields in @c + * instead. Returns zero in case of success and %-EINVAL in case of validation + * failure. + */ +static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + long long max_bytes; + int err = 1, min_leb_cnt; + + if (!c->key_hash) { + err = 2; + goto failed; + } + + if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) { + err = 3; + goto failed; + } + + if (le32_to_cpu(sup->min_io_size) != c->min_io_size) { + ubifs_err("min. 
I/O unit mismatch: %d in superblock, %d real", + le32_to_cpu(sup->min_io_size), c->min_io_size); + goto failed; + } + + if (le32_to_cpu(sup->leb_size) != c->leb_size) { + ubifs_err("LEB size mismatch: %d in superblock, %d real", + le32_to_cpu(sup->leb_size), c->leb_size); + goto failed; + } + + if (c->log_lebs < UBIFS_MIN_LOG_LEBS || + c->lpt_lebs < UBIFS_MIN_LPT_LEBS || + c->orph_lebs < UBIFS_MIN_ORPH_LEBS || + c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 4; + goto failed; + } + + /* + * Calculate minimum allowed amount of main area LEBs. This is very + * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we + * have just read from the superblock. + */ + min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs; + min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; + + if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { + ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " + "%d minimum required", c->leb_cnt, c->vi.size, + min_leb_cnt); + goto failed; + } + + if (c->max_leb_cnt < c->leb_cnt) { + ubifs_err("max. LEB count %d less than LEB count %d", + c->max_leb_cnt, c->leb_cnt); + goto failed; + } + + if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 7; + goto failed; + } + + if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || + c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { + err = 8; + goto failed; + } + + if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 || + c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) { + err = 9; + goto failed; + } + + if (c->fanout < UBIFS_MIN_FANOUT || + ubifs_idx_node_sz(c, c->fanout) > c->leb_size) { + err = 10; + goto failed; + } + + if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT && + c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - + c->log_lebs - c->lpt_lebs - c->orph_lebs)) { + err = 11; + goto failed; + } + + if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs + + c->orph_lebs + c->main_lebs != c->leb_cnt) { + err = 12; + goto failed; + } + + if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + err = 13; + goto failed; + } + + max_bytes = c->main_lebs * (long long)c->leb_size; + if (c->rp_size < 0 || max_bytes < c->rp_size) { + err = 14; + goto failed; + } + + if (le32_to_cpu(sup->time_gran) > 1000000000 || + le32_to_cpu(sup->time_gran) < 1) { + err = 15; + goto failed; + } + + return 0; + +failed: + ubifs_err("bad superblock, error %d", err); + dbg_dump_node(c, sup); + return -EINVAL; +} + +/** + * ubifs_read_sb_node - read superblock node. + * @c: UBIFS file-system description object + * + * This function returns a pointer to the superblock node or a negative error + * code. + */ +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + int err; + + sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS); + if (!sup) + return ERR_PTR(-ENOMEM); + + err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ, + UBIFS_SB_LNUM, 0); + if (err) { + kfree(sup); + return ERR_PTR(err); + } + + return sup; +} + +/** + * ubifs_write_sb_node - write superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node read with 'ubifs_read_sb_node()' + * + * This function returns %0 on success and a negative error code on failure. 
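+ *
+ * The superblock LEB is updated with 'ubifs_leb_change()', which replaces
+ * the whole LEB atomically, so an interrupted update cannot leave a partly
+ * written superblock behind. Within UBIFS this path is used, for example,
+ * by the automatic resize code in 'ubifs_read_superblock()'.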
+ */ +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + + ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); +} + +/** + * ubifs_read_superblock - read superblock. + * @c: UBIFS file-system description object + * + * This function finds, reads and checks the superblock. If an empty UBI volume + * is being mounted, this function creates default superblock. Returns zero in + * case of success, and a negative error code in case of failure. + */ +int ubifs_read_superblock(struct ubifs_info *c) +{ + int err, sup_flags; + struct ubifs_sb_node *sup; + + if (c->empty) { + err = create_default_filesystem(c); + if (err) + return err; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* + * The software supports all previous versions but not future versions, + * due to the unavailability of time-travelling equipment. + */ + c->fmt_version = le32_to_cpu(sup->fmt_version); + if (c->fmt_version > UBIFS_FORMAT_VERSION) { + ubifs_err("on-flash format version is %d, but software only " + "supports up to version %d", c->fmt_version, + UBIFS_FORMAT_VERSION); + err = -EINVAL; + goto out; + } + + if (c->fmt_version < 3) { + ubifs_err("on-flash format version %d is not supported", + c->fmt_version); + err = -EINVAL; + goto out; + } + + switch (sup->key_hash) { + case UBIFS_KEY_HASH_R5: + c->key_hash = key_r5_hash; + c->key_hash_type = UBIFS_KEY_HASH_R5; + break; + + case UBIFS_KEY_HASH_TEST: + c->key_hash = key_test_hash; + c->key_hash_type = UBIFS_KEY_HASH_TEST; + break; + }; + + c->key_fmt = sup->key_fmt; + + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + c->key_len = UBIFS_SK_LEN; + break; + default: + ubifs_err("unsupported key format"); + err = -EINVAL; + goto out; + } + + c->leb_cnt = le32_to_cpu(sup->leb_cnt); + c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt); + c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes); + c->log_lebs = le32_to_cpu(sup->log_lebs); + c->lpt_lebs = le32_to_cpu(sup->lpt_lebs); + c->orph_lebs = le32_to_cpu(sup->orph_lebs); + c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; + c->fanout = le32_to_cpu(sup->fanout); + c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); + c->default_compr = le16_to_cpu(sup->default_compr); + c->rp_size = le64_to_cpu(sup->rp_size); + c->rp_uid = le32_to_cpu(sup->rp_uid); + c->rp_gid = le32_to_cpu(sup->rp_gid); + sup_flags = le32_to_cpu(sup->flags); + + c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); + + memcpy(&c->uuid, &sup->uuid, 16); + + c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + + /* Automatically increase file system size to the maximum size */ + c->old_leb_cnt = c->leb_cnt; + if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { + c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); + if (c->vfs_sb->s_flags & MS_RDONLY) + dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + else { + dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + c->old_leb_cnt = c->leb_cnt; + } + } + + c->log_bytes = (long long)c->log_lebs * c->leb_size; + c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1; + c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs; + c->lpt_last = c->lpt_first + c->lpt_lebs - 1; + c->orph_first = c->lpt_last + 1; + c->orph_last = c->orph_first + c->orph_lebs - 1; + c->main_lebs = 
c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
+ c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
+ c->main_first = c->leb_cnt - c->main_lebs;
+ c->report_rp_size = ubifs_reported_space(c, c->rp_size);
+
+ err = validate_sb(c, sup);
+out:
+ kfree(sup);
+ return err;
+}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
new file mode 100644
index 00000000000..acf5c5fffc6
--- /dev/null
+++ b/fs/ubifs/scan.c
@@ -0,0 +1,362 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the scan, which is a general-purpose function for
+ * determining what nodes are in an eraseblock. The scan is used to replay
+ * the journal, to do garbage collection, for the TNC in-the-gaps method,
+ * and by debugging functions.
+ */
+
+#include "ubifs.h"
+
+/**
+ * scan_padding_bytes - scan for padding bytes.
+ * @buf: buffer to scan
+ * @len: length of buffer
+ *
+ * This function returns the number of padding bytes on success and
+ * %SCANNED_GARBAGE on failure.
+ */
+static int scan_padding_bytes(void *buf, int len)
+{
+ int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len);
+ uint8_t *p = buf;
+
+ dbg_scan("not a node");
+
+ while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE)
+ pad_len += 1;
+
+ if (!pad_len || (pad_len & 7))
+ return SCANNED_GARBAGE;
+
+ dbg_scan("%d padding bytes", pad_len);
+
+ return pad_len;
+}
+
+/**
+ * ubifs_scan_a_node - scan for a node or padding.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to scan
+ * @len: length of buffer
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ *
+ * This function returns a scanning code to indicate what was scanned.
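+ * The possible return codes are:
+ *
+ * o the number of bytes to skip, if padding bytes or a valid padding node
+ *   were found;
+ * o %SCANNED_A_NODE, if a valid node was found;
+ * o %SCANNED_EMPTY_SPACE, if the magic word is all 0xFF bytes, i.e. empty
+ *   space was hit;
+ * o %SCANNED_GARBAGE, if the buffer contents do not look like a node or
+ *   padding;
+ * o %SCANNED_A_CORRUPT_NODE, if 'ubifs_check_node()' failed for the node;
+ * o %SCANNED_A_BAD_PAD_NODE, if a padding node with an invalid length was
+ *   found.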
+ */ +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet) +{ + struct ubifs_ch *ch = buf; + uint32_t magic; + + magic = le32_to_cpu(ch->magic); + + if (magic == 0xFFFFFFFF) { + dbg_scan("hit empty space"); + return SCANNED_EMPTY_SPACE; + } + + if (magic != UBIFS_NODE_MAGIC) + return scan_padding_bytes(buf, len); + + if (len < UBIFS_CH_SZ) + return SCANNED_GARBAGE; + + dbg_scan("scanning %s", dbg_ntype(ch->node_type)); + + if (ubifs_check_node(c, buf, lnum, offs, quiet)) + return SCANNED_A_CORRUPT_NODE; + + if (ch->node_type == UBIFS_PAD_NODE) { + struct ubifs_pad_node *pad = buf; + int pad_len = le32_to_cpu(pad->pad_len); + int node_len = le32_to_cpu(ch->len); + + /* Validate the padding node */ + if (pad_len < 0 || + offs + node_len + pad_len > c->leb_size) { + if (!quiet) { + ubifs_err("bad pad node at LEB %d:%d", + lnum, offs); + dbg_dump_node(c, pad); + } + return SCANNED_A_BAD_PAD_NODE; + } + + /* Make the node pads to 8-byte boundary */ + if ((node_len + pad_len) & 7) { + if (!quiet) { + dbg_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); + } + return SCANNED_A_BAD_PAD_NODE; + } + + dbg_scan("%d bytes padded, offset now %d", + pad_len, ALIGN(offs + node_len + pad_len, 8)); + + return node_len + pad_len; + } + + return SCANNED_A_NODE; +} + +/** + * ubifs_start_scan - create LEB scanning information at start of scan. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function returns %0 on success and a negative error code on failure. + */ +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int err; + + dbg_scan("scan LEB %d:%d", lnum, offs); + + sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS); + if (!sleb) + return ERR_PTR(-ENOMEM); + + sleb->lnum = lnum; + INIT_LIST_HEAD(&sleb->nodes); + sleb->buf = sbuf; + + err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + if (err && err != -EBADMSG) { + ubifs_err("cannot read %d bytes from LEB %d:%d," + " error %d", c->leb_size - offs, lnum, offs, err); + kfree(sleb); + return ERR_PTR(err); + } + + if (err == -EBADMSG) + sleb->ecc = 1; + + return sleb; +} + +/** + * ubifs_end_scan - update LEB scanning information at end of scan. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * + * This function returns %0 on success and a negative error code on failure. + */ +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs) +{ + lnum = lnum; + dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); + ubifs_assert(offs % c->min_io_size == 0); + + sleb->endpt = ALIGN(offs, c->min_io_size); +} + +/** + * ubifs_add_snod - add a scanned node to LEB scanning information. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @buf: buffer containing node + * @offs: offset of node on flash + * + * This function returns %0 on success and a negative error code on failure. 
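+ *
+ * Note that the node itself is not copied: the new scan node keeps a
+ * pointer into @buf, so the scan buffer has to stay valid for as long as
+ * the scanning results are in use.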
+ */ +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs) +{ + struct ubifs_ch *ch = buf; + struct ubifs_ino_node *ino = buf; + struct ubifs_scan_node *snod; + + snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); + if (!snod) + return -ENOMEM; + + snod->sqnum = le64_to_cpu(ch->sqnum); + snod->type = ch->node_type; + snod->offs = offs; + snod->len = le32_to_cpu(ch->len); + snod->node = buf; + + switch (ch->node_type) { + case UBIFS_INO_NODE: + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + case UBIFS_DATA_NODE: + case UBIFS_TRUN_NODE: + /* + * The key is in the same place in all keyed + * nodes. + */ + key_read(c, &ino->key, &snod->key); + break; + } + list_add_tail(&snod->list, &sleb->nodes); + sleb->nodes_cnt += 1; + return 0; +} + +/** + * ubifs_scanned_corruption - print information after UBIFS scanned corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number of corruption + * @offs: offset of corruption + * @buf: buffer containing corruption + */ +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf) +{ + int len; + + ubifs_err("corrupted data at LEB %d:%d", lnum, offs); + if (dbg_failure_mode) + return; + len = c->leb_size - offs; + if (len > 4096) + len = 4096; + dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); +} + +/** + * ubifs_scan - scan a logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function scans LEB number @lnum and returns complete information about + * its contents. Returns an error code in case of failure. 
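+ *
+ * The returned object has to be freed with 'ubifs_scan_destroy()'. A
+ * typical caller (see e.g. 'replay_bud()') does roughly the following:
+ *
+ *	sleb = ubifs_scan(c, lnum, offs, sbuf);
+ *	if (IS_ERR(sleb))
+ *		return PTR_ERR(sleb);
+ *	list_for_each_entry(snod, &sleb->nodes, list) {
+ *		... process the scanned node ...
+ *	}
+ *	ubifs_scan_destroy(sleb);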
+ */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + void *buf = sbuf + offs; + int err, len = c->leb_size - offs; + struct ubifs_scan_leb *sleb; + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + while (len >= 8) { + struct ubifs_ch *ch = buf; + int node_len, ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) + /* Empty space is checked later */ + break; + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_NODE: + break; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: + dbg_err("unknown"); + goto corrupted; + } + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + } + + if (offs % c->min_io_size) + goto corrupted; + + ubifs_end_scan(c, sleb, lnum, offs); + + for (; len > 4; offs += 4, buf = buf + 4, len -= 4) + if (*(uint32_t *)buf != 0xffffffff) + break; + for (; len; offs++, buf++, len--) + if (*(uint8_t *)buf != 0xff) { + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + + return sleb; + +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * ubifs_scan_destroy - destroy LEB scanning information. + * @sleb: scanning information to free + */ +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *node; + struct list_head *head; + + head = &sleb->nodes; + while (!list_empty(head)) { + node = list_entry(head->next, struct ubifs_scan_node, list); + list_del(&node->list); + kfree(node); + } + kfree(sleb); +} diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c new file mode 100644 index 00000000000..f248533841a --- /dev/null +++ b/fs/ubifs/shrinker.c @@ -0,0 +1,322 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS shrinker which evicts clean znodes from the TNC + * tree when Linux VM needs more RAM. + * + * We do not implement any LRU lists to find oldest znodes to free because it + * would add additional overhead to the file system fast paths. So the shrinker + * just walks the TNC tree when searching for znodes to free. 
+ * + * If the root of a TNC sub-tree is clean and old enough, then the children are + * also clean and old enough. So the shrinker walks the TNC in level order and + * dumps entire sub-trees. + * + * The age of znodes is just the time-stamp when they were last looked at. + * The current shrinker first tries to evict old znodes, then young ones. + * + * Since the shrinker is global, it has to protect against races with FS + * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. + */ + +#include "ubifs.h" + +/* List of all UBIFS file-system instances */ +LIST_HEAD(ubifs_infos); + +/* + * We number each shrinker run and record the number on the ubifs_info structure + * so that we can easily work out which ubifs_info structures have already been + * done by the current run. + */ +static unsigned int shrinker_run_no; + +/* Protects 'ubifs_infos' list */ +DEFINE_SPINLOCK(ubifs_infos_lock); + +/* Global clean znode counter (for all mounted UBIFS instances) */ +atomic_long_t ubifs_clean_zn_cnt; + +/** + * shrink_tnc - shrink TNC tree. + * @c: UBIFS file-system description object + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function traverses TNC tree and frees clean znodes. It does not free + * clean znodes which younger then @age. Returns number of freed znodes. + */ +static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) +{ + int total_freed = 0; + struct ubifs_znode *znode, *zprev; + int time = get_seconds(); + + ubifs_assert(mutex_is_locked(&c->umount_mutex)); + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + + if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) + return 0; + + /* + * Traverse the TNC tree in levelorder manner, so that it is possible + * to destroy large sub-trees. Indeed, if a znode is old, then all its + * children are older or of the same age. + * + * Note, we are holding 'c->tnc_mutex', so we do not have to lock the + * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is + * changed only when the 'c->tnc_mutex' is held. + */ + zprev = NULL; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + while (znode && total_freed < nr && + atomic_long_read(&c->clean_zn_cnt) > 0) { + int freed; + + /* + * If the znode is clean, but it is in the 'c->cnext' list, this + * means that this znode has just been written to flash as a + * part of commit and was marked clean. They will be removed + * from the list at end commit. We cannot change the list, + * because it is not protected by any mutex (design decision to + * make commit really independent and parallel to main I/O). So + * we just skip these znodes. + * + * Note, the 'clean_zn_cnt' counters are not updated until + * after the commit, so the UBIFS shrinker does not report + * the znodes which are in the 'c->cnext' list as freeable. + * + * Also note, if the root of a sub-tree is not in 'c->cnext', + * then the whole sub-tree is not in 'c->cnext' as well, so it + * is safe to dump whole sub-tree. + */ + + if (znode->cnext) { + /* + * Very soon these znodes will be removed from the list + * and become freeable. 
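+ * Reporting this via @contention lets 'ubifs_shrinker()' return %-1
+ * when nothing could be freed, so the VM calls the shrinker again a
+ * bit later, when these znodes may have actually become freeable.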
+ */ + *contention = 1; + } else if (!ubifs_zn_dirty(znode) && + abs(time - znode->time) >= age) { + if (znode->parent) + znode->parent->zbranch[znode->iip].znode = NULL; + else + c->zroot.znode = NULL; + + freed = ubifs_destroy_tnc_subtree(znode); + atomic_long_sub(freed, &ubifs_clean_zn_cnt); + atomic_long_sub(freed, &c->clean_zn_cnt); + ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); + total_freed += freed; + znode = zprev; + } + + if (unlikely(!c->zroot.znode)) + break; + + zprev = znode; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + cond_resched(); + } + + return total_freed; +} + +/** + * shrink_tnc_trees - shrink UBIFS TNC trees. + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function walks the list of mounted UBIFS file-systems and frees clean + * znodes which are older then @age, until at least @nr znodes are freed. + * Returns the number of freed znodes. + */ +static int shrink_tnc_trees(int nr, int age, int *contention) +{ + struct ubifs_info *c; + struct list_head *p; + unsigned int run_no; + int freed = 0; + + spin_lock(&ubifs_infos_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + /* Iterate over all mounted UBIFS file-systems and try to shrink them */ + p = ubifs_infos.next; + while (p != &ubifs_infos) { + c = list_entry(p, struct ubifs_info, infos_list); + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (c->shrinker_run_no == run_no) + break; + if (!mutex_trylock(&c->umount_mutex)) { + /* Some un-mount is in progress, try next FS */ + *contention = 1; + p = p->next; + continue; + } + /* + * We're holding 'c->umount_mutex', so the file-system won't go + * away. + */ + if (!mutex_trylock(&c->tnc_mutex)) { + mutex_unlock(&c->umount_mutex); + *contention = 1; + p = p->next; + continue; + } + spin_unlock(&ubifs_infos_lock); + /* + * OK, now we have TNC locked, the file-system cannot go away - + * it is safe to reap the cache. + */ + c->shrinker_run_no = run_no; + freed += shrink_tnc(c, nr, age, contention); + mutex_unlock(&c->tnc_mutex); + spin_lock(&ubifs_infos_lock); + /* Get the next list element before we move this one */ + p = p->next; + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_del(&c->infos_list); + list_add_tail(&c->infos_list, &ubifs_infos); + mutex_unlock(&c->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&ubifs_infos_lock); + return freed; +} + +/** + * kick_a_thread - kick a background thread to start commit. + * + * This function kicks a background thread to start background commit. Returns + * %-1 if a thread was kicked or there is another reason to assume the memory + * will soon be freed or become freeable. If there are no dirty znodes, returns + * %0. + */ +static int kick_a_thread(void) +{ + int i; + struct ubifs_info *c; + + /* + * Iterate over all mounted UBIFS file-systems and find out if there is + * already an ongoing commit operation there. If no, then iterate for + * the second time and initiate background commit. + */ + spin_lock(&ubifs_infos_lock); + for (i = 0; i < 2; i++) { + list_for_each_entry(c, &ubifs_infos, infos_list) { + long dirty_zn_cnt; + + if (!mutex_trylock(&c->umount_mutex)) { + /* + * Some un-mount is in progress, it will + * certainly free memory, so just return. 
+ */ + spin_unlock(&ubifs_infos_lock); + return -1; + } + + dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); + + if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || + c->ro_media) { + mutex_unlock(&c->umount_mutex); + continue; + } + + if (c->cmt_state != COMMIT_RESTING) { + spin_unlock(&ubifs_infos_lock); + mutex_unlock(&c->umount_mutex); + return -1; + } + + if (i == 1) { + list_del(&c->infos_list); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + ubifs_request_bg_commit(c); + mutex_unlock(&c->umount_mutex); + return -1; + } + mutex_unlock(&c->umount_mutex); + } + } + spin_unlock(&ubifs_infos_lock); + + return 0; +} + +int ubifs_shrinker(int nr, gfp_t gfp_mask) +{ + int freed, contention = 0; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); + + if (nr == 0) + return clean_zn_cnt; + + if (!clean_zn_cnt) { + /* + * No clean znodes, nothing to reap. All we can do in this case + * is to kick background threads to start commit, which will + * probably make clean znodes which, in turn, will be freeable. + * And we return -1 which means will make VM call us again + * later. + */ + dbg_tnc("no clean znodes, kick a thread"); + return kick_a_thread(); + } + + freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough old znodes, try to free young ones"); + freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough young znodes, free all"); + freed += shrink_tnc_trees(nr - freed, 0, &contention); + + if (!freed && contention) { + dbg_tnc("freed nothing, but contention"); + return -1; + } + +out: + dbg_tnc("%d znodes were freed, requested %d", freed, nr); + return freed; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c new file mode 100644 index 00000000000..00eb9c68ad0 --- /dev/null +++ b/fs/ubifs/super.c @@ -0,0 +1,1951 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS initialization and VFS superblock operations. Some + * initialization stuff which is rather large and complex is placed at + * corresponding subsystems, but most of it is here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs.h" + +/* Slab cache for UBIFS inodes */ +struct kmem_cache *ubifs_inode_slab; + +/* UBIFS TNC shrinker description */ +static struct shrinker ubifs_shrinker_info = { + .shrink = ubifs_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +/** + * validate_inode - validate inode. 
+ * @c: UBIFS file-system description object + * @inode: the inode to validate + * + * This is a helper function for 'ubifs_iget()' which validates various fields + * of a newly built inode to make sure they contain sane values and prevent + * possible vulnerabilities. Returns zero if the inode is all right and + * a non-zero error code if not. + */ +static int validate_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + const struct ubifs_inode *ui = ubifs_inode(inode); + + if (inode->i_size > c->max_inode_sz) { + ubifs_err("inode is too large (%lld)", + (long long)inode->i_size); + return 1; + } + + if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + ubifs_err("unknown compression type %d", ui->compr_type); + return 2; + } + + if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) + return 3; + + if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) + return 4; + + if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + return 5; + + if (!ubifs_compr_present(ui->compr_type)) { + ubifs_warn("inode %lu uses '%s' compression, but it was not " + "compiled in", inode->i_ino, + ubifs_compr_name(ui->compr_type)); + } + + err = dbg_check_dir_size(c, inode); + return err; +} + +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) +{ + int err; + union ubifs_key key; + struct ubifs_ino_node *ino; + struct ubifs_info *c = sb->s_fs_info; + struct inode *inode; + struct ubifs_inode *ui; + + dbg_gen("inode %lu", inum); + + inode = iget_locked(sb, inum); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + ui = ubifs_inode(inode); + + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) { + err = -ENOMEM; + goto out; + } + + ino_key_init(c, &key, inode->i_ino); + + err = ubifs_tnc_lookup(c, &key, ino); + if (err) + goto out_ino; + + inode->i_flags |= (S_NOCMTIME | S_NOATIME); + inode->i_nlink = le32_to_cpu(ino->nlink); + inode->i_uid = le32_to_cpu(ino->uid); + inode->i_gid = le32_to_cpu(ino->gid); + inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); + inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); + inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); + inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); + inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); + inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode->i_mode = le32_to_cpu(ino->mode); + inode->i_size = le64_to_cpu(ino->size); + + ui->data_len = le32_to_cpu(ino->data_len); + ui->flags = le32_to_cpu(ino->flags); + ui->compr_type = le16_to_cpu(ino->compr_type); + ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); + ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + ui->xattr_size = le32_to_cpu(ino->xattr_size); + ui->xattr_names = le32_to_cpu(ino->xattr_names); + ui->synced_i_size = ui->ui_size = inode->i_size; + + ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 
1 : 0; + + err = validate_inode(c, inode); + if (err) + goto out_invalid; + + /* Disable readahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + if (ui->xattr) { + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + } else if (ui->data_len != 0) { + err = 10; + goto out_invalid; + } + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + if (ui->data_len != 0) { + err = 11; + goto out_invalid; + } + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { + err = 12; + goto out_invalid; + } + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + break; + case S_IFBLK: + case S_IFCHR: + { + dev_t rdev; + union ubifs_dev_desc *dev; + + ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + + dev = (union ubifs_dev_desc *)ino->data; + if (ui->data_len == sizeof(dev->new)) + rdev = new_decode_dev(le32_to_cpu(dev->new)); + else if (ui->data_len == sizeof(dev->huge)) + rdev = huge_decode_dev(le64_to_cpu(dev->huge)); + else { + err = 13; + goto out_invalid; + } + memcpy(ui->data, ino->data, ui->data_len); + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, 0); + if (ui->data_len != 0) { + err = 14; + goto out_invalid; + } + break; + default: + err = 15; + goto out_invalid; + } + + kfree(ino); + ubifs_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +out_invalid: + ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); + dbg_dump_node(c, ino); + dbg_dump_inode(c, inode); + err = -EINVAL; +out_ino: + kfree(ino); +out: + ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err); + iget_failed(inode); + return ERR_PTR(err); +} + +static struct inode *ubifs_alloc_inode(struct super_block *sb) +{ + struct ubifs_inode *ui; + + ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + if (!ui) + return NULL; + + memset((void *)ui + sizeof(struct inode), 0, + sizeof(struct ubifs_inode) - sizeof(struct inode)); + mutex_init(&ui->ui_mutex); + spin_lock_init(&ui->ui_lock); + return &ui->vfs_inode; +}; + +static void ubifs_destroy_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + kfree(ui->data); + kmem_cache_free(ubifs_inode_slab, inode); +} + +/* + * Note, Linux write-back code calls this without 'i_mutex'. + */ +static int ubifs_write_inode(struct inode *inode, int wait) +{ + int err; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(!ui->xattr); + if (is_bad_inode(inode)) + return 0; + + mutex_lock(&ui->ui_mutex); + /* + * Due to races between write-back forced by budgeting + * (see 'sync_some_inodes()') and pdflush write-back, the inode may + * have already been synchronized, do not do this again. 
This might + * also happen if it was synchronized in an VFS operation, e.g. + * 'ubifs_link()'. + */ + if (!ui->dirty) { + mutex_unlock(&ui->ui_mutex); + return 0; + } + + dbg_gen("inode %lu", inode->i_ino); + err = ubifs_jnl_write_inode(c, inode, 0); + if (err) + ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + + ui->dirty = 0; + mutex_unlock(&ui->ui_mutex); + ubifs_release_dirty_inode_budget(c, ui); + return err; +} + +static void ubifs_delete_inode(struct inode *inode) +{ + int err; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + if (ubifs_inode(inode)->xattr) + /* + * Extended attribute inode deletions are fully handled in + * 'ubifs_removexattr()'. These inodes are special and have + * limited usage, so there is nothing to do here. + */ + goto out; + + dbg_gen("inode %lu", inode->i_ino); + ubifs_assert(!atomic_read(&inode->i_count)); + ubifs_assert(inode->i_nlink == 0); + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) + goto out; + + ubifs_inode(inode)->ui_size = inode->i_size = 0; + err = ubifs_jnl_write_inode(c, inode, 1); + if (err) + /* + * Worst case we have a lost orphan inode wasting space, so a + * simple error message is ok here. + */ + ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); +out: + clear_inode(inode); +} + +static void ubifs_dirty_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + if (!ui->dirty) { + ui->dirty = 1; + dbg_gen("inode %lu", inode->i_ino); + } +} + +static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ubifs_info *c = dentry->d_sb->s_fs_info; + unsigned long long free; + + free = ubifs_budg_get_free_space(c); + dbg_gen("free space %lld bytes (%lld blocks)", + free, free >> UBIFS_BLOCK_SHIFT); + + buf->f_type = UBIFS_SUPER_MAGIC; + buf->f_bsize = UBIFS_BLOCK_SIZE; + buf->f_blocks = c->block_cnt; + buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; + if (free > c->report_rp_size) + buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; + else + buf->f_bavail = 0; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = UBIFS_MAX_NLEN; + + return 0; +} + +static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + + if (c->mount_opts.unmount_mode == 2) + seq_printf(s, ",fast_unmount"); + else if (c->mount_opts.unmount_mode == 1) + seq_printf(s, ",norm_unmount"); + + return 0; +} + +static int ubifs_sync_fs(struct super_block *sb, int wait) +{ + struct ubifs_info *c = sb->s_fs_info; + int i, ret = 0, err; + + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err && !ret) + ret = err; + } + /* + * We ought to call sync for c->ubi but it does not have one. If it had + * it would in turn call mtd->sync, however mtd operations are + * synchronous anyway, so we don't lose any sleep here. + */ + return ret; +} + +/** + * init_constants_early - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This function initialize UBIFS constants which do not need the superblock to + * be read. It also checks that the UBI volume satisfies basic UBIFS + * requirements. Returns zero in case of success and a negative error code in + * case of failure. 
+ */
+static int init_constants_early(struct ubifs_info *c)
+{
+	if (c->vi.corrupted) {
+		ubifs_warn("UBI volume is corrupted - read-only mode");
+		c->ro_media = 1;
+	}
+
+	if (c->di.ro_mode) {
+		ubifs_msg("read-only UBI device");
+		c->ro_media = 1;
+	}
+
+	if (c->vi.vol_type == UBI_STATIC_VOLUME) {
+		ubifs_msg("static UBI volume - read-only mode");
+		c->ro_media = 1;
+	}
+
+	c->leb_cnt = c->vi.size;
+	c->leb_size = c->vi.usable_leb_size;
+	c->half_leb_size = c->leb_size / 2;
+	c->min_io_size = c->di.min_io_size;
+	c->min_io_shift = fls(c->min_io_size) - 1;
+
+	if (c->leb_size < UBIFS_MIN_LEB_SZ) {
+		ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
+			  c->leb_size, UBIFS_MIN_LEB_SZ);
+		return -EINVAL;
+	}
+
+	if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
+		ubifs_err("too few LEBs (%d), min. is %d",
+			  c->leb_cnt, UBIFS_MIN_LEB_CNT);
+		return -EINVAL;
+	}
+
+	if (!is_power_of_2(c->min_io_size)) {
+		ubifs_err("bad min. I/O size %d", c->min_io_size);
+		return -EINVAL;
+	}
+
+	/*
+	 * UBIFS aligns all nodes to an 8-byte boundary, so to make the
+	 * functions in io.c simpler, assume the minimum I/O unit size to be
+	 * 8 bytes if it is less than 8.
+	 */
+	if (c->min_io_size < 8) {
+		c->min_io_size = 8;
+		c->min_io_shift = 3;
+	}
+
+	c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
+	c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);
+
+	/*
+	 * Initialize node length ranges which are mostly needed for node
+	 * length validation.
+	 */
+	c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ;
+	c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ;
+	c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ;
+	c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
+	c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
+	c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;
+
+	c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ;
+	c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
+	c->ranges[UBIFS_ORPH_NODE].min_len =
+				UBIFS_ORPH_NODE_SZ + sizeof(__le64);
+	c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
+	c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
+	c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
+	c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
+	c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
+	c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
+	c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
+	/*
+	 * Minimum indexing node size is amended later when superblock is
+	 * read and the key length is known.
+	 */
+	c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
+	/*
+	 * Maximum indexing node size is amended later when superblock is
+	 * read and the fanout is known.
+	 */
+	c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
+
+	/*
+	 * Initialize dead and dark LEB space watermarks.
+	 *
+	 * Dead space is the space which cannot be used. Its watermark is
+	 * equivalent to min. I/O unit or minimum node size if it is greater
+	 * than min. I/O unit.
+	 *
+	 * Dark space is the space which might be used, or might not, depending
+	 * on which node should be written to the LEB. Its watermark is
+	 * equivalent to maximum UBIFS node size.
+	 */
+	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
+	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
+
+	return 0;
+}
+
+/**
+ * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
+ * @c: UBIFS file-system description object + * @lnum: LEB the write-buffer was synchronized to + * @free: how many free bytes left in this LEB + * @pad: how many bytes were padded + * + * This is a callback function which is called by the I/O unit when the + * write-buffer is synchronized. We need this to correctly maintain space + * accounting in bud logical eraseblocks. This function returns zero in case of + * success and a negative error code in case of failure. + * + * This function actually belongs to the journal, but we keep it here because + * we want to keep it static. + */ +static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) +{ + return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); +} + +/* + * init_constants_late - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the superblock has been read. It also checks various UBIFS parameters and + * makes sure they are all right. Returns zero in case of success and a + * negative error code in case of failure. + */ +static int init_constants_late(struct ubifs_info *c) +{ + int tmp, err; + uint64_t tmp64; + + c->main_bytes = (long long)c->main_lebs * c->leb_size; + c->max_znode_sz = sizeof(struct ubifs_znode) + + c->fanout * sizeof(struct ubifs_zbranch); + + tmp = ubifs_idx_node_sz(c, 1); + c->ranges[UBIFS_IDX_NODE].min_len = tmp; + c->min_idx_node_sz = ALIGN(tmp, 8); + + tmp = ubifs_idx_node_sz(c, c->fanout); + c->ranges[UBIFS_IDX_NODE].max_len = tmp; + c->max_idx_node_sz = ALIGN(tmp, 8); + + /* Make sure LEB size is large enough to fit full commit */ + tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; + tmp = ALIGN(tmp, c->min_io_size); + if (tmp > c->leb_size) { + dbg_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); + return -EINVAL; + } + + /* + * Make sure that the log is large enough to fit reference nodes for + * all buds plus one reserved LEB. + */ + tmp64 = c->max_bud_bytes; + tmp = do_div(tmp64, c->leb_size); + c->max_bud_cnt = tmp64 + !!tmp; + tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); + tmp /= c->leb_size; + tmp += 1; + if (c->log_lebs < tmp) { + dbg_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); + return -EINVAL; + } + + /* + * When budgeting we assume worst-case scenarios when the pages are not + * be compressed and direntries are of the maximum size. + * + * Note, data, which may be stored in inodes is budgeted separately, so + * it is not included into 'c->inode_budget'. + */ + c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->inode_budget = UBIFS_INO_NODE_SZ; + c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; + + /* + * When the amount of flash space used by buds becomes + * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. + * The writers are unblocked when the commit is finished. To avoid + * writers to be blocked UBIFS initiates background commit in advance, + * when number of bud bytes becomes above the limit defined below. + */ + c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; + + /* + * Ensure minimum journal size. All the bytes in the journal heads are + * considered to be used, when calculating the current journal usage. + * Consequently, if the journal is too small, UBIFS will treat it as + * always full. 
+ */ + tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; + if (c->bg_bud_bytes < tmp64) + c->bg_bud_bytes = tmp64; + if (c->max_bud_bytes < tmp64 + c->leb_size) + c->max_bud_bytes = tmp64 + c->leb_size; + + err = ubifs_calc_lpt_geom(c); + if (err) + return err; + + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* + * Calculate total amount of FS blocks. This number is not used + * internally because it does not make much sense for UBIFS, but it is + * necessary to report something for the 'statfs()' call. + * + * Subtract the LEB reserved for GC and the LEB which is reserved for + * deletions. + * + * Review 'ubifs_calc_available()' if changing this calculation. + */ + tmp64 = c->main_lebs - 2; + tmp64 *= (uint64_t)c->leb_size - c->dark_wm; + tmp64 = ubifs_reported_space(c, tmp64); + c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; + + return 0; +} + +/** + * take_gc_lnum - reserve GC LEB. + * @c: UBIFS file-system description object + * + * This function ensures that the LEB reserved for garbage collection is + * unmapped and is marked as "taken" in lprops. We also have to set free space + * to LEB size and dirty space to zero, because lprops may contain out-of-date + * information if the file-system was un-mounted before it has been committed. + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int take_gc_lnum(struct ubifs_info *c) +{ + int err; + + if (c->gc_lnum == -1) { + ubifs_err("no LEB for GC"); + return -EINVAL; + } + + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + + /* And we have to tell lprops that this LEB is taken */ + err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, + LPROPS_TAKEN, 0, 0); + return err; +} + +/** + * alloc_wbufs - allocate write-buffers. + * @c: UBIFS file-system description object + * + * This helper function allocates and initializes UBIFS write-buffers. Returns + * zero in case of success and %-ENOMEM in case of failure. + */ +static int alloc_wbufs(struct ubifs_info *c) +{ + int i, err; + + c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), + GFP_KERNEL); + if (!c->jheads) + return -ENOMEM; + + /* Initialize journal heads */ + for (i = 0; i < c->jhead_cnt; i++) { + INIT_LIST_HEAD(&c->jheads[i].buds_list); + err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); + if (err) + return err; + + c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; + c->jheads[i].wbuf.jhead = i; + } + + c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; + /* + * Garbage Collector head likely contains long-term data and + * does not need to be synchronized by timer. + */ + c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; + c->jheads[GCHD].wbuf.timeout = 0; + + return 0; +} + +/** + * free_wbufs - free write-buffers. + * @c: UBIFS file-system description object + */ +static void free_wbufs(struct ubifs_info *c) +{ + int i; + + if (c->jheads) { + for (i = 0; i < c->jhead_cnt; i++) { + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); + } + kfree(c->jheads); + c->jheads = NULL; + } +} + +/** + * free_orphans - free orphans. 
+ * @c: UBIFS file-system description object + */ +static void free_orphans(struct ubifs_info *c) +{ + struct ubifs_orphan *orph; + + while (c->orph_dnext) { + orph = c->orph_dnext; + c->orph_dnext = orph->dnext; + list_del(&orph->list); + kfree(orph); + } + + while (!list_empty(&c->orph_list)) { + orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); + list_del(&orph->list); + kfree(orph); + dbg_err("orphan list not empty at unmount"); + } + + vfree(c->orph_buf); + c->orph_buf = NULL; +} + +/** + * free_buds - free per-bud objects. + * @c: UBIFS file-system description object + */ +static void free_buds(struct ubifs_info *c) +{ + struct rb_node *this = c->buds.rb_node; + struct ubifs_bud *bud; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + bud = rb_entry(this, struct ubifs_bud, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &bud->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(bud); + } + } +} + +/** + * check_volume_empty - check if the UBI volume is empty. + * @c: UBIFS file-system description object + * + * This function checks if the UBIFS volume is empty by looking if its LEBs are + * mapped or not. The result of checking is stored in the @c->empty variable. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int check_volume_empty(struct ubifs_info *c) +{ + int lnum, err; + + c->empty = 1; + for (lnum = 0; lnum < c->leb_cnt; lnum++) { + err = ubi_is_mapped(c->ubi, lnum); + if (unlikely(err < 0)) + return err; + if (err == 1) { + c->empty = 0; + break; + } + + cond_resched(); + } + + return 0; +} + +/* + * UBIFS mount options. + * + * Opt_fast_unmount: do not run a journal commit before un-mounting + * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_err: just end of array marker + */ +enum { + Opt_fast_unmount, + Opt_norm_unmount, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_fast_unmount, "fast_unmount"}, + {Opt_norm_unmount, "norm_unmount"}, + {Opt_err, NULL}, +}; + +/** + * ubifs_parse_options - parse mount parameters. + * @c: UBIFS file-system description object + * @options: parameters to parse + * @is_remount: non-zero if this is FS re-mount + * + * This function parses UBIFS mount options and returns zero in case success + * and a negative error code in case of failure. + */ +static int ubifs_parse_options(struct ubifs_info *c, char *options, + int is_remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_fast_unmount: + c->mount_opts.unmount_mode = 2; + c->fast_unmount = 1; + break; + case Opt_norm_unmount: + c->mount_opts.unmount_mode = 1; + c->fast_unmount = 0; + break; + default: + ubifs_err("unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + } + + return 0; +} + +/** + * destroy_journal - destroy journal data structures. + * @c: UBIFS file-system description object + * + * This function destroys journal data structures including those that may have + * been created by recovery functions. 
+ */
+static void destroy_journal(struct ubifs_info *c)
+{
+	while (!list_empty(&c->unclean_leb_list)) {
+		struct ubifs_unclean_leb *ucleb;
+
+		ucleb = list_entry(c->unclean_leb_list.next,
+				   struct ubifs_unclean_leb, list);
+		list_del(&ucleb->list);
+		kfree(ucleb);
+	}
+	while (!list_empty(&c->old_buds)) {
+		struct ubifs_bud *bud;
+
+		bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
+		list_del(&bud->list);
+		kfree(bud);
+	}
+	ubifs_destroy_idx_gc(c);
+	ubifs_destroy_size_tree(c);
+	ubifs_tnc_close(c);
+	free_buds(c);
+}
+
+/**
+ * mount_ubifs - mount UBIFS file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function mounts the UBIFS file system. Returns zero in case of success
+ * and a negative error code in case of failure.
+ *
+ * Note, the function does not de-allocate resources if it fails halfway
+ * through, and the caller has to do this instead.
+ */
+static int mount_ubifs(struct ubifs_info *c)
+{
+	struct super_block *sb = c->vfs_sb;
+	int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
+	long long x;
+	size_t sz;
+
+	err = init_constants_early(c);
+	if (err)
+		return err;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+	c->dbg_buf = vmalloc(c->leb_size);
+	if (!c->dbg_buf)
+		return -ENOMEM;
+#endif
+
+	err = check_volume_empty(c);
+	if (err)
+		goto out_free;
+
+	if (c->empty && (mounted_read_only || c->ro_media)) {
+		/*
+		 * This UBI volume is empty, and read-only, or the file system
+		 * is mounted read-only - we cannot format it.
+		 */
+		ubifs_err("can't format empty UBI volume: read-only %s",
+			  c->ro_media ? "UBI volume" : "mount");
+		err = -EROFS;
+		goto out_free;
+	}
+
+	if (c->ro_media && !mounted_read_only) {
+		ubifs_err("cannot mount read-write - read-only media");
+		err = -EROFS;
+		goto out_free;
+	}
+
+	/*
+	 * The requirement for the buffer is that it should fit the indexing
+	 * B-tree height number of integers. We assume the height of the TNC
+	 * tree will never exceed 64.
+	 */
+	err = -ENOMEM;
+	c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);
+	if (!c->bottom_up_buf)
+		goto out_free;
+
+	c->sbuf = vmalloc(c->leb_size);
+	if (!c->sbuf)
+		goto out_free;
+
+	if (!mounted_read_only) {
+		c->ileb_buf = vmalloc(c->leb_size);
+		if (!c->ileb_buf)
+			goto out_free;
+	}
+
+	err = ubifs_read_superblock(c);
+	if (err)
+		goto out_free;
+
+	/*
+	 * Make sure the compressor which is set as the default in the
+	 * superblock was actually compiled in.
+ */ + if (!ubifs_compr_present(c->default_compr)) { + ubifs_warn("'%s' compressor is set by superblock, but not " + "compiled in", ubifs_compr_name(c->default_compr)); + c->default_compr = UBIFS_COMPR_NONE; + } + + dbg_failure_mode_registration(c); + + err = init_constants_late(c); + if (err) + goto out_dereg; + + sz = ALIGN(c->max_idx_node_sz, c->min_io_size); + sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); + c->cbuf = kmalloc(sz, GFP_NOFS); + if (!c->cbuf) { + err = -ENOMEM; + goto out_dereg; + } + + if (!mounted_read_only) { + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + + /* Create background thread */ + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, + c->vi.vol_id); + c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + if (!c->bgt) + c->bgt = ERR_PTR(-EINVAL); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out_wbufs; + } + wake_up_process(c->bgt); + } + + err = ubifs_read_master(c); + if (err) + goto out_master; + + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { + ubifs_msg("recovery needed"); + c->need_recovery = 1; + if (!mounted_read_only) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + } else if (!mounted_read_only) { + /* + * Set the "dirty" flag so that if we reboot uncleanly we + * will notice this immediately on the next mount. + */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !mounted_read_only); + if (err) + goto out_lpt; + + err = dbg_check_idx_size(c, c->old_idx_sz); + if (err) + goto out_lpt; + + err = ubifs_replay_journal(c); + if (err) + goto out_journal; + + err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); + if (err) + goto out_orphans; + + if (!mounted_read_only) { + int lnum; + + /* Check for enough free space */ + if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { + ubifs_err("insufficient available space"); + err = -EINVAL; + goto out_orphans; + } + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out_orphans; + } + + if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + err = ubifs_rcvry_gc_commit(c); + } else + err = take_gc_lnum(c); + if (err) + goto out_orphans; + + err = dbg_check_lprops(c); + if (err) + goto out_orphans; + } else if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + } + + spin_lock(&ubifs_infos_lock); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + if (c->need_recovery) { + if (mounted_read_only) + ubifs_msg("recovery deferred"); + else { + c->need_recovery = 0; + ubifs_msg("recovery completed"); + } + } + + err = dbg_check_filesystem(c); + if (err) + goto out_infos; + + ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + if (mounted_read_only) + ubifs_msg("mounted read-only"); + x = (long long)c->main_lebs * c->leb_size; + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", + x, x >> 10, x >> 20, c->main_lebs); + x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", + x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("default compressor: %s", 
ubifs_compr_name(c->default_compr)); + ubifs_msg("media format %d, latest format %d", + c->fmt_version, UBIFS_FORMAT_VERSION); + + dbg_msg("compiled on: " __DATE__ " at " __TIME__); + dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("LEB size: %d bytes (%d KiB)", + c->leb_size, c->leb_size / 1024); + dbg_msg("data journal heads: %d", + c->jhead_cnt - NONDATA_JHEADS_CNT); + dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" + "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", + c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3], + c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], + c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], + c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); + dbg_msg("fast unmount: %d", c->fast_unmount); + dbg_msg("big_lpt %d", c->big_lpt); + dbg_msg("log LEBs: %d (%d - %d)", + c->log_lebs, UBIFS_LOG_LNUM, c->log_last); + dbg_msg("LPT area LEBs: %d (%d - %d)", + c->lpt_lebs, c->lpt_first, c->lpt_last); + dbg_msg("orphan area LEBs: %d (%d - %d)", + c->orph_lebs, c->orph_first, c->orph_last); + dbg_msg("main area LEBs: %d (%d - %d)", + c->main_lebs, c->main_first, c->leb_cnt - 1); + dbg_msg("index LEBs: %d", c->lst.idx_lebs); + dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", + c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); + dbg_msg("key hash type: %d", c->key_hash_type); + dbg_msg("tree fanout: %d", c->fanout); + dbg_msg("reserved GC LEB: %d", c->gc_lnum); + dbg_msg("first main LEB: %d", c->main_first); + dbg_msg("dead watermark: %d", c->dead_wm); + dbg_msg("dark watermark: %d", c->dark_wm); + x = (long long)c->main_lebs * c->dark_wm; + dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + x, x >> 10, x >> 20); + dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + c->max_bud_bytes, c->max_bud_bytes >> 10, + c->max_bud_bytes >> 20); + dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + c->bg_bud_bytes, c->bg_bud_bytes >> 10, + c->bg_bud_bytes >> 20); + dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); + dbg_msg("max. seq. number: %llu", c->max_sqnum); + dbg_msg("commit number: %llu", c->cmt_no); + + return 0; + +out_infos: + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); +out_orphans: + free_orphans(c); +out_journal: + destroy_journal(c); +out_lpt: + ubifs_lpt_free(c, 0); +out_master: + kfree(c->mst_node); + kfree(c->rcvrd_mst_node); + if (c->bgt) + kthread_stop(c->bgt); +out_wbufs: + free_wbufs(c); +out_cbuf: + kfree(c->cbuf); +out_dereg: + dbg_failure_mode_deregistration(c); +out_free: + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + UBIFS_DBG(vfree(c->dbg_buf)); + return err; +} + +/** + * ubifs_umount - un-mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * Note, this function is called to free allocated resourced when un-mounting, + * as well as free resources when an error occurred while we were half way + * through mounting (error path cleanup function). So it has to make sure the + * resource was actually allocated before freeing it. 
+ */ +static void ubifs_umount(struct ubifs_info *c) +{ + dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); + + if (c->bgt) + kthread_stop(c->bgt); + + destroy_journal(c); + free_wbufs(c); + free_orphans(c); + ubifs_lpt_free(c, 0); + + kfree(c->cbuf); + kfree(c->rcvrd_mst_node); + kfree(c->mst_node); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + UBIFS_DBG(vfree(c->dbg_buf)); + vfree(c->ileb_buf); + dbg_failure_mode_deregistration(c); +} + +/** + * ubifs_remount_rw - re-mount in read-write mode. + * @c: UBIFS file-system description object + * + * UBIFS avoids allocating many unnecessary resources when mounted in read-only + * mode. This function allocates the needed resources and re-mounts UBIFS in + * read-write mode. + */ +static int ubifs_remount_rw(struct ubifs_info *c) +{ + int err, lnum; + + if (c->ro_media) + return -EINVAL; + + mutex_lock(&c->umount_mutex); + c->remounting_rw = 1; + + /* Check for enough free space */ + if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { + ubifs_err("insufficient available space"); + err = -EINVAL; + goto out; + } + + if (c->old_leb_cnt != c->leb_cnt) { + struct ubifs_sb_node *sup; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) { + err = PTR_ERR(sup); + goto out; + } + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + } + + if (c->need_recovery) { + ubifs_msg("completing deferred recovery"); + err = ubifs_write_rcvrd_mst_node(c); + if (err) + goto out; + err = ubifs_recover_size(c); + if (err) + goto out; + err = ubifs_clean_lebs(c, c->sbuf); + if (err) + goto out; + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out; + } + + if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out; + } + + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) { + err = -ENOMEM; + goto out; + } + + err = ubifs_lpt_init(c, 0, 1); + if (err) + goto out; + + err = alloc_wbufs(c); + if (err) + goto out; + + ubifs_create_buds_lists(c); + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + if (!c->bgt) + c->bgt = ERR_PTR(-EINVAL); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + return err; + } + wake_up_process(c->bgt); + + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out; + } + + if (c->need_recovery) + err = ubifs_rcvry_gc_commit(c); + else + err = take_gc_lnum(c); + if (err) + goto out; + + if (c->need_recovery) { + c->need_recovery = 0; + ubifs_msg("deferred recovery completed"); + } + + dbg_gen("re-mounted read-write"); + c->vfs_sb->s_flags &= ~MS_RDONLY; + c->remounting_rw = 0; + mutex_unlock(&c->umount_mutex); + return 0; + +out: + vfree(c->orph_buf); + c->orph_buf = NULL; + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + free_wbufs(c); + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->remounting_rw = 0; + mutex_unlock(&c->umount_mutex); + return err; +} + +/** + * commit_on_unmount - commit the journal when un-mounting. 
+ * @c: UBIFS file-system description object + * + * This function is called during un-mounting and it commits the journal unless + * the "fast unmount" mode is enabled. It also avoids committing the journal if + * it contains too few data. + * + * Sometimes recovery requires the journal to be committed at least once, and + * this function takes care about this. + */ +static void commit_on_unmount(struct ubifs_info *c) +{ + if (!c->fast_unmount) { + long long bud_bytes; + + spin_lock(&c->buds_lock); + bud_bytes = c->bud_bytes; + spin_unlock(&c->buds_lock); + if (bud_bytes > c->leb_size) + ubifs_run_commit(c); + } +} + +/** + * ubifs_remount_ro - re-mount in read-only mode. + * @c: UBIFS file-system description object + * + * We rely on VFS to have stopped writing. Possibly the background thread could + * be running a commit, however kthread_stop will wait in that case. + */ +static void ubifs_remount_ro(struct ubifs_info *c) +{ + int i, err; + + ubifs_assert(!c->need_recovery); + commit_on_unmount(c); + + mutex_lock(&c->umount_mutex); + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + for (i = 0; i < c->jhead_cnt; i++) { + ubifs_wbuf_sync(&c->jheads[i].wbuf); + del_timer_sync(&c->jheads[i].wbuf.timer); + } + + if (!c->ro_media) { + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); + } + + ubifs_destroy_idx_gc(c); + free_wbufs(c); + vfree(c->orph_buf); + c->orph_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + mutex_unlock(&c->umount_mutex); +} + +static void ubifs_put_super(struct super_block *sb) +{ + int i; + struct ubifs_info *c = sb->s_fs_info; + + ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + /* + * The following asserts are only valid if there has not been a failure + * of the media. For example, there will be dirty inodes if we failed + * to write them back because of I/O errors. + */ + ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); + ubifs_assert(c->budg_idx_growth == 0); + ubifs_assert(c->budg_data_growth == 0); + + /* + * The 'c->umount_lock' prevents races between UBIFS memory shrinker + * and file system un-mount. Namely, it prevents the shrinker from + * picking this superblock for shrinking - it will be just skipped if + * the mutex is locked. + */ + mutex_lock(&c->umount_mutex); + if (!(c->vfs_sb->s_flags & MS_RDONLY)) { + /* + * First of all kill the background thread to make sure it does + * not interfere with un-mounting and freeing resources. + */ + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + /* Synchronize write-buffers */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) { + ubifs_wbuf_sync(&c->jheads[i].wbuf); + del_timer_sync(&c->jheads[i].wbuf.timer); + } + + /* + * On fatal errors c->ro_media is set to 1, in which case we do + * not write the master node. + */ + if (!c->ro_media) { + /* + * We are being cleanly unmounted which means the + * orphans were killed - indicate this in the master + * node. Also save the reserved GC LEB number. 
+ */ + int err; + + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + /* + * Recovery will attempt to fix the master area + * next mount, so we just print a message and + * continue to unmount normally. + */ + ubifs_err("failed to write master node, " + "error %d", err); + } + } + + ubifs_umount(c); + bdi_destroy(&c->bdi); + ubi_close_volume(c->ubi); + mutex_unlock(&c->umount_mutex); + kfree(c); +} + +static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + int err; + struct ubifs_info *c = sb->s_fs_info; + + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + + err = ubifs_parse_options(c, data, 1); + if (err) { + ubifs_err("invalid or unknown remount parameter"); + return err; + } + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + err = ubifs_remount_rw(c); + if (err) + return err; + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + ubifs_remount_ro(c); + + return 0; +} + +struct super_operations ubifs_super_operations = { + .alloc_inode = ubifs_alloc_inode, + .destroy_inode = ubifs_destroy_inode, + .put_super = ubifs_put_super, + .write_inode = ubifs_write_inode, + .delete_inode = ubifs_delete_inode, + .statfs = ubifs_statfs, + .dirty_inode = ubifs_dirty_inode, + .remount_fs = ubifs_remount_fs, + .show_options = ubifs_show_options, + .sync_fs = ubifs_sync_fs, +}; + +/** + * open_ubi - parse UBI device name string and open the UBI device. + * @name: UBI volume name + * @mode: UBI volume open mode + * + * There are several ways to specify UBI volumes when mounting UBIFS: + * o ubiX_Y - UBI device number X, volume Y; + * o ubiY - UBI device number 0, volume Y; + * o ubiX:NAME - mount UBI device X, volume with name NAME; + * o ubi:NAME - mount UBI device 0, volume with name NAME. + * + * Alternative '!' separator may be used instead of ':' (because some shells + * like busybox may interpret ':' as an NFS host name separator). This function + * returns ubi volume object in case of success and a negative error code in + * case of failure. 
+ */ +static struct ubi_volume_desc *open_ubi(const char *name, int mode) +{ + int dev, vol; + char *endptr; + + if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') + return ERR_PTR(-EINVAL); + + /* ubi:NAME method */ + if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') + return ubi_open_volume_nm(0, name + 4, mode); + + if (!isdigit(name[3])) + return ERR_PTR(-EINVAL); + + dev = simple_strtoul(name + 3, &endptr, 0); + + /* ubiY method */ + if (*endptr == '\0') + return ubi_open_volume(0, dev, mode); + + /* ubiX_Y method */ + if (*endptr == '_' && isdigit(endptr[1])) { + vol = simple_strtoul(endptr + 1, &endptr, 0); + if (*endptr != '\0') + return ERR_PTR(-EINVAL); + return ubi_open_volume(dev, vol, mode); + } + + /* ubiX:NAME method */ + if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') + return ubi_open_volume_nm(dev, ++endptr, mode); + + return ERR_PTR(-EINVAL); +} + +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubi_volume_desc *ubi = sb->s_fs_info; + struct ubifs_info *c; + struct inode *root; + int err; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (!c) + return -ENOMEM; + + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + + c->highest_inum = UBIFS_FIRST_INO; + get_random_bytes(&c->vfs_gen, sizeof(int)); + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + + /* Re-open the UBI device in read-write mode */ + c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); + if (IS_ERR(c->ubi)) { + err = PTR_ERR(c->ubi); + goto out_free; + } + + /* + * UBIFS provids 'backing_dev_info' in order to disable readahead. For + * UBIFS, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the readahead I/O as well i.e. completely pointless. + * + * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 
+ */ + c->bdi.capabilities = BDI_CAP_MAP_COPY; + c->bdi.unplug_io_fn = default_unplug_io_fn; + err = bdi_init(&c->bdi); + if (err) + goto out_close; + + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_bdi; + + c->vfs_sb = sb; + + sb->s_fs_info = c; + sb->s_magic = UBIFS_SUPER_MAGIC; + sb->s_blocksize = UBIFS_BLOCK_SIZE; + sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; + sb->s_dev = c->vi.cdev; + sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); + if (c->max_inode_sz > MAX_LFS_FILESIZE) + sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; + sb->s_op = &ubifs_super_operations; + + mutex_lock(&c->umount_mutex); + err = mount_ubifs(c); + if (err) { + ubifs_assert(err < 0); + goto out_unlock; + } + + /* Read the root inode */ + root = ubifs_iget(sb, UBIFS_ROOT_INO); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out_umount; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto out_iput; + + mutex_unlock(&c->umount_mutex); + + return 0; + +out_iput: + iput(root); +out_umount: + ubifs_umount(c); +out_unlock: + mutex_unlock(&c->umount_mutex); +out_bdi: + bdi_destroy(&c->bdi); +out_close: + ubi_close_volume(c->ubi); +out_free: + kfree(c); + return err; +} + +static int sb_test(struct super_block *sb, void *data) +{ + dev_t *dev = data; + + return sb->s_dev == *dev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + dev_t *dev = data; + + sb->s_dev = *dev; + return 0; +} + +static int ubifs_get_sb(struct file_system_type *fs_type, int flags, + const char *name, void *data, struct vfsmount *mnt) +{ + struct ubi_volume_desc *ubi; + struct ubi_volume_info vi; + struct super_block *sb; + int err; + + dbg_gen("name %s, flags %#x", name, flags); + + /* + * Get UBI device number and volume ID. Mount it read-only so far + * because this might be a new mount point, and UBI allows only one + * read-write user at a time. + */ + ubi = open_ubi(name, UBI_READONLY); + if (IS_ERR(ubi)) { + ubifs_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); + return PTR_ERR(ubi); + } + ubi_get_volume_info(ubi, &vi); + + dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + + sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + goto out_close; + } + + if (sb->s_root) { + /* A new mount point for already mounted UBIFS */ + dbg_gen("this ubi volume is already mounted"); + if ((flags ^ sb->s_flags) & MS_RDONLY) { + err = -EBUSY; + goto out_deact; + } + } else { + sb->s_flags = flags; + /* + * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is + * replaced by 'c'. + */ + sb->s_fs_info = ubi; + err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto out_deact; + /* We do not support atime */ + sb->s_flags |= MS_ACTIVE | MS_NOATIME; + } + + /* 'fill_super()' opens ubi again so we must close it here */ + ubi_close_volume(ubi); + + return simple_set_mnt(mnt, sb); + +out_deact: + up_write(&sb->s_umount); + deactivate_super(sb); +out_close: + ubi_close_volume(ubi); + return err; +} + +static void ubifs_kill_sb(struct super_block *sb) +{ + struct ubifs_info *c = sb->s_fs_info; + + /* + * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' + * in order to be outside BKL. 
+ */ + if (sb->s_root && !(sb->s_flags & MS_RDONLY)) + commit_on_unmount(c); + /* The un-mount routine is actually done in put_super() */ + generic_shutdown_super(sb); +} + +static struct file_system_type ubifs_fs_type = { + .name = "ubifs", + .owner = THIS_MODULE, + .get_sb = ubifs_get_sb, + .kill_sb = ubifs_kill_sb +}; + +/* + * Inode slab cache constructor. + */ +static void inode_slab_ctor(struct kmem_cache *cachep, void *obj) +{ + struct ubifs_inode *ui = obj; + inode_init_once(&ui->vfs_inode); +} + +static int __init ubifs_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); + + /* Make sure node sizes are 8-byte aligned */ + BUILD_BUG_ON(UBIFS_CH_SZ & 7); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); + BUILD_BUG_ON(MIN_WRITE_SZ & 7); + + /* Check min. node size */ + BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); + + /* Defined node sizes */ + BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); + + /* + * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to + * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 
+ */ + if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" + " at least 4096 bytes", + (unsigned int)PAGE_CACHE_SIZE); + return -EINVAL; + } + + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + return err; + } + + err = -ENOMEM; + ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", + sizeof(struct ubifs_inode), 0, + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, + &inode_slab_ctor); + if (!ubifs_inode_slab) + goto out_reg; + + register_shrinker(&ubifs_shrinker_info); + + err = ubifs_compressors_init(); + if (err) + goto out_compr; + + return 0; + +out_compr: + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); +out_reg: + unregister_filesystem(&ubifs_fs_type); + return err; +} +/* late_initcall to let compressors initialize first */ +late_initcall(ubifs_init); + +static void __exit ubifs_exit(void) +{ + ubifs_assert(list_empty(&ubifs_infos)); + ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + + ubifs_compressors_exit(); + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); + unregister_filesystem(&ubifs_fs_type); +} +module_exit(ubifs_exit); + +MODULE_LICENSE("GPL"); +MODULE_VERSION(__stringify(UBIFS_VERSION)); +MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); +MODULE_DESCRIPTION("UBIFS - UBI File System"); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c new file mode 100644 index 00000000000..e909f4a9644 --- /dev/null +++ b/fs/ubifs/tnc.c @@ -0,0 +1,2956 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements TNC (Tree Node Cache) which caches indexing nodes of + * the UBIFS B-tree. + * + * At the moment the locking rules of the TNC tree are quite simple and + * straightforward. We just have a mutex and lock it when we traverse the + * tree. If a znode is not in memory, we read it from flash while still having + * the mutex locked. + */ + +#include +#include "ubifs.h" + +/* + * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. + * @NAME_LESS: name corresponding to the first argument is less than second + * @NAME_MATCHES: names match + * @NAME_GREATER: name corresponding to the second argument is greater than + * first + * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media + * + * These constants were introduce to improve readability. + */ +enum { + NAME_LESS = 0, + NAME_MATCHES = 1, + NAME_GREATER = 2, + NOT_ON_MEDIA = 3, +}; + +/** + * insert_old_idx - record an index node obsoleted since the last commit start. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + * + * For recovery, there must always be a complete intact version of the index on + * flash at all times. That is called the "old index". It is the index as at the + * time of the last successful commit. Many of the index nodes in the old index + * may be dirty, but they must not be erased until the next successful commit + * (at which point that index becomes the old index). + * + * That means that the garbage collection and the in-the-gaps method of + * committing must be able to determine if an index node is in the old index. + * Most of the old index nodes can be found by looking up the TNC using the + * 'lookup_znode()' function. However, some of the old index nodes may have + * been deleted from the current index or may have been changed so much that + * they cannot be easily found. In those cases, an entry is added to an RB-tree. + * That is what this function does. The RB-tree is ordered by LEB number and + * offset because they uniquely identify the old index node. + */ +static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *old_idx, *o; + struct rb_node **p, *parent = NULL; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) + return -ENOMEM; + old_idx->lnum = lnum; + old_idx->offs = offs; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = &(*p)->rb_left; + else if (lnum > o->lnum) + p = &(*p)->rb_right; + else if (offs < o->offs) + p = &(*p)->rb_left; + else if (offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err("old idx added twice!"); + kfree(old_idx); + return 0; + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); + return 0; +} + +/** + * insert_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode) +{ + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) + return insert_old_idx(c, zbr->lnum, zbr->offs); + } else + if (c->zroot.len) + return insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + return 0; +} + +/** + * ins_clr_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. + */ +static int ins_clr_old_idx_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int err; + + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (err) + return err; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + } + } else + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs); + if (err) + return err; + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + } + return 0; +} + +/** + * destroy_old_idx - destroy the old_idx RB-tree. 
+ * @c: UBIFS file-system description object + * + * During start commit, the old_idx RB-tree is used to avoid overwriting index + * nodes that were in the index last commit but have since been deleted. This + * is necessary for recovery i.e. the old index must be kept intact until the + * new index is successfully written. The old-idx RB-tree is used for the + * in-the-gaps method of writing index nodes and is destroyed every commit. + */ +void destroy_old_idx(struct ubifs_info *c) +{ + struct rb_node *this = c->old_idx.rb_node; + struct ubifs_old_idx *old_idx; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + old_idx = rb_entry(this, struct ubifs_old_idx, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &old_idx->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(old_idx); + } + c->old_idx = RB_ROOT; +} + +/** + * copy_znode - copy a dirty znode. + * @c: UBIFS file-system description object + * @znode: znode to copy + * + * A dirty znode being committed may not be changed, so it is copied. + */ +static struct ubifs_znode *copy_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + zn = kmalloc(c->max_znode_sz, GFP_NOFS); + if (unlikely(!zn)) + return ERR_PTR(-ENOMEM); + + memcpy(zn, znode, c->max_znode_sz); + zn->cnext = NULL; + __set_bit(DIRTY_ZNODE, &zn->flags); + __clear_bit(COW_ZNODE, &zn->flags); + + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + __set_bit(OBSOLETE_ZNODE, &znode->flags); + + if (znode->level != 0) { + int i; + const int n = zn->child_cnt; + + /* The children now have new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *zbr = &zn->zbranch[i]; + + if (zbr->znode) + zbr->znode->parent = zn; + } + } + + atomic_long_inc(&c->dirty_zn_cnt); + return zn; +} + +/** + * add_idx_dirt - add dirt due to a dirty znode. + * @c: UBIFS file-system description object + * @lnum: LEB number of index node + * @dirt: size of index node + * + * This function updates lprops dirty space and the new size of the index. + */ +static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) +{ + c->calc_idx_sz -= ALIGN(dirt, 8); + return ubifs_add_dirt(c, lnum, dirt); +} + +/** + * dirty_cow_znode - ensure a znode is not being committed. + * @c: UBIFS file-system description object + * @zbr: branch of znode to check + * + * Returns dirtied znode on success or negative error code on failure. + */ +static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zn; + int err; + + if (!test_bit(COW_ZNODE, &znode->flags)) { + /* znode is not being committed */ + if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { + atomic_long_inc(&c->dirty_zn_cnt); + atomic_long_dec(&c->clean_zn_cnt); + atomic_long_dec(&ubifs_clean_zn_cnt); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + if (unlikely(err)) + return ERR_PTR(err); + } + return znode; + } + + zn = copy_znode(c, znode); + if (unlikely(IS_ERR(zn))) + return zn; + + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (unlikely(err)) + return ERR_PTR(err); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + } else + err = 0; + + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + if (unlikely(err)) + return ERR_PTR(err); + return zn; +} + +/** + * lnc_add - add a leaf node to the leaf node cache. 
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * Leaf nodes are non-index nodes, e.g. directory entry nodes or data nodes.
+ * The purpose of the leaf node cache is to save re-reading the same leaf node
+ * over and over again. Most things are cached by VFS, however the file system
+ * must cache directory entries for readdir and for resolving hash collisions.
+ * The present implementation of the leaf node cache is extremely simple, and
+ * allows for error returns that are not used but that may be needed if a more
+ * complex implementation is created.
+ *
+ * Note, this function does not add the @node object to LNC directly, but
+ * allocates a copy of the object and adds the copy to LNC. The reason for this
+ * is that @node has been allocated outside of the TNC subsystem and will be
+ * used with @c->tnc_mutex unlocked upon return from the TNC subsystem. But LNC
+ * may be changed at any time, e.g. freed by the shrinker.
+ */
+static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+		   const void *node)
+{
+	int err;
+	void *lnc_node;
+	const struct ubifs_dent_node *dent = node;
+
+	ubifs_assert(!zbr->leaf);
+	ubifs_assert(zbr->len != 0);
+	ubifs_assert(is_hash_key(c, &zbr->key));
+
+	err = ubifs_validate_entry(c, dent);
+	if (err) {
+		dbg_dump_stack();
+		dbg_dump_node(c, dent);
+		return err;
+	}
+
+	lnc_node = kmalloc(zbr->len, GFP_NOFS);
+	if (!lnc_node)
+		/* We don't have to have the cache, so no error */
+		return 0;
+
+	memcpy(lnc_node, node, zbr->len);
+	zbr->leaf = lnc_node;
+	return 0;
+}
+
+/**
+ * lnc_add_directly - add a leaf node to the leaf node cache.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * This function is similar to 'lnc_add()', but it does not create a copy of
+ * @node and instead inserts @node into the LNC directly.
+ */
+static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			    void *node)
+{
+	int err;
+
+	ubifs_assert(!zbr->leaf);
+	ubifs_assert(zbr->len != 0);
+
+	err = ubifs_validate_entry(c, node);
+	if (err) {
+		dbg_dump_stack();
+		dbg_dump_node(c, node);
+		return err;
+	}
+
+	zbr->leaf = node;
+	return 0;
+}
+
+/**
+ * lnc_free - remove a leaf node from the leaf node cache.
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ */
+static void lnc_free(struct ubifs_zbranch *zbr)
+{
+	if (!zbr->leaf)
+		return;
+	kfree(zbr->leaf);
+	zbr->leaf = NULL;
+}
+
+/**
+ * tnc_read_node_nm - read a "hashed" leaf node.
+ * @c: UBIFS file-system description object
+ * @zbr: key and position of the node
+ * @node: node is returned here
+ *
+ * This function reads a "hashed" node defined by @zbr from the leaf node cache
+ * (if it is there) or from the flash media, in which case the node is also
+ * added to LNC. Returns zero in case of success or a negative error code in
+ * case of failure.
+ */
+static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			    void *node)
+{
+	int err;
+
+	ubifs_assert(is_hash_key(c, &zbr->key));
+
+	if (zbr->leaf) {
+		/* Read from the leaf node cache */
+		ubifs_assert(zbr->len != 0);
+		memcpy(node, zbr->leaf, zbr->len);
+		return 0;
+	}
+
+	err = ubifs_tnc_read_node(c, zbr, node);
+	if (err)
+		return err;
+
+	/* Add the node to the leaf node cache */
+	err = lnc_add(c, zbr, node);
+	return err;
+}
+
+/**
+ * try_read_node - read a node if it is a node.
+ * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: LEB number of node to read + * @offs: offset of node to read + * + * This function tries to read a node of known type and length, checks it and + * stores it in @buf. This function returns %1 if a node is present and %0 if + * a node is not present. A negative error code is returned for I/O errors. + * This function performs that same function as ubifs_read_node except that + * it does not require that there is actually a node present and instead + * the return code indicates if a node was read. + */ +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs) +{ + int err, node_len; + struct ubifs_ch *ch = buf; + uint32_t crc, node_crc; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err) { + ubifs_err("cannot read node type %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + return 0; + + if (ch->node_type != type) + return 0; + + node_len = le32_to_cpu(ch->len); + if (node_len != len) + return 0; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) + return 0; + + return 1; +} + +/** + * fallible_read_node - try to read a leaf node. + * @c: UBIFS file-system description object + * @key: key of node to read + * @zbr: position of node + * @node: node returned + * + * This function tries to read a node and returns %1 if the node is read, %0 + * if the node is not present, and a negative error code in the case of error. + */ +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node) +{ + int ret; + + dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + + ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, + zbr->offs); + if (ret == 1) { + union ubifs_key node_key; + struct ubifs_dent_node *dent = node; + + /* All nodes have key in the same place */ + key_read(c, &dent->key, &node_key); + if (keys_cmp(c, key, &node_key) != 0) + ret = 0; + } + if (ret == 0) + dbg_mnt("dangling branch LEB %d:%d len %d, key %s", + zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + return ret; +} + +/** + * matches_name - determine if a direntry or xattr entry matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by + * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case + * of failure, a negative error code is returned. 
+ */ +static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, dent); + if (err) + goto out_free; + + /* Add the node to the leaf node cache */ + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * get_znode - get a TNC znode that may not be loaded yet. + * @c: UBIFS file-system description object + * @znode: parent znode + * @n: znode branch slot number + * + * This function returns the znode or a negative error code. + */ +static struct ubifs_znode *get_znode(struct ubifs_info *c, + struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + + zbr = &znode->zbranch[n]; + if (zbr->znode) + znode = zbr->znode; + else + znode = ubifs_load_znode(c, zbr, znode, n); + return znode; +} + +/** + * tnc_next - find next TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is passed and returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the next TNC entry is found, %-ENOENT if there is + * no next entry, or a negative error code otherwise. + */ +static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + nn += 1; + if (nn < znode->child_cnt) { + *n = nn; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip + 1; + znode = zp; + if (nn < znode->child_cnt) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = 0; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * tnc_prev - find previous TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the previous TNC entry is found, %-ENOENT if + * there is no next entry, or a negative error code otherwise. + */ +static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + if (nn > 0) { + *n = nn - 1; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip - 1; + znode = zp; + if (nn >= 0) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + nn = znode->child_cnt - 1; + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = znode->child_cnt - 1; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * resolve_collision - resolve a collision. 
+ * @c: UBIFS file-system description object + * @key: key of a directory or extended attribute entry + * @zn: znode is returned here + * @n: zbranch number is passed and returned here + * @nm: name of the entry + * + * This function is called for "hashed" keys to make sure that the found key + * really corresponds to the looked up node (directory or extended attribute + * entry). It returns %1 and sets @zn and @n if the collision is resolved. + * %0 is returned if @nm is not found and @zn and @n are set to the previous + * entry, i.e. to the entry after which @nm could follow if it were in TNC. + * This means that @n may be set to %-1 if the leftmost key in @zn is the + * previous one. A negative error code is returned on failures. + */ +static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm) +{ + int err; + + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (unlikely(err < 0)) + return err; + if (err == NAME_MATCHES) + return 1; + + if (err == NAME_GREATER) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + return 0; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* + * We have found the branch after which we would + * like to insert, but inserting in this znode + * may still be wrong. Consider the following 3 + * znodes, in the case where we are resolving a + * collision with Key2. + * + * znode zp + * ---------------------- + * level 1 | Key0 | Key1 | + * ----------------------- + * | | + * znode za | | znode zb + * ------------ ------------ + * level 0 | Key0 | | Key2 | + * ------------ ------------ + * + * The lookup finds Key2 in znode zb. Lets say + * there is no match and the name is greater so + * we look left. When we find Key0, we end up + * here. If we return now, we will insert into + * znode za at slot n = 1. But that is invalid + * according to the parent's keys. Key2 must + * be inserted into znode zb. + * + * Note, this problem is not relevant for the + * case when we go right, because + * 'tnc_insert()' would correct the parent key. + */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + return 0; + } + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_LESS) + return 0; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_GREATER); + } + } else { + int nn = *n; + struct ubifs_znode *znode = *zn; + + /* Look right */ + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + err = matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + return 0; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_LESS); + } + } +} + +/** + * fallible_matches_name - determine if a dent matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This is a "fallible" version of 'matches_name()' function which does not + * panic if the direntry/xentry referred by @zbr does not exist on the media. + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. 
Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr + * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA + * if xentry/direntry referred by @zbr does not exist on the media. A negative + * error code is returned in case of failure. + */ +static int fallible_matches_name(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = fallible_read_node(c, &zbr->key, zbr, dent); + if (err < 0) + goto out_free; + if (err == 0) { + /* The node was not present */ + err = NOT_ON_MEDIA; + goto out_free; + } + ubifs_assert(err == 1); + + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * fallible_resolve_collision - resolve a collision even if nodes are missing. + * @c: UBIFS file-system description object + * @key: key + * @zn: znode is returned here + * @n: branch number is passed and returned here + * @nm: name of directory entry + * @adding: indicates caller is adding a key to the TNC + * + * This is a "fallible" version of the 'resolve_collision()' function which + * does not panic if one of the nodes referred to by TNC does not exist on the + * media. This may happen when replaying the journal if a deleted node was + * Garbage-collected and the commit was not done. A branch that refers to a node + * that is not present is called a dangling branch. The following are the return + * codes for this function: + * o if @nm was found, %1 is returned and @zn and @n are set to the found + * branch; + * o if we are @adding and @nm was not found, %0 is returned; + * o if we are not @adding and @nm was not found, but a dangling branch was + * found, then %1 is returned and @zn and @n are set to the dangling branch; + * o a negative error code is returned in case of failure. + */ +static int fallible_resolve_collision(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm, int adding) +{ + struct ubifs_znode *o_znode = NULL, *znode = *zn; + int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; + + cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (unlikely(cmp < 0)) + return cmp; + if (cmp == NAME_MATCHES) + return 1; + if (cmp == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + /* + * We are unlucky and hit a dangling branch straight away. + * Now we do not really know where to go to find the needed + * branch - to the left or to the right. Well, let's try left. 
+ */ + unsure = 1; + } else if (!adding) + unsure = 1; /* Remove a dangling branch wherever it is */ + + if (cmp == NAME_GREATER || unsure) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + break; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* See comments in 'resolve_collision()' */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + break; + } + err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = *zn; + o_n = *n; + continue; + } + if (!adding) + continue; + if (err == NAME_LESS) + break; + else + unsure = 0; + } + } + + if (cmp == NAME_LESS || unsure) { + /* Look right */ + *zn = znode; + *n = nn; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + err = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + break; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + } + } + } + + /* Never match a dangling branch when adding */ + if (adding || !o_znode) + return 0; + + dbg_mnt("dangling match LEB %d:%d len %d %s", + o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, + o_znode->zbranch[o_n].len, DBGKEY(key)); + *zn = o_znode; + *n = o_n; + return 1; +} + +/** + * matches_position - determine if a zbranch matches a given position. + * @zbr: zbranch of dent + * @lnum: LEB number of dent to match + * @offs: offset of dent to match + * + * This function returns %1 if @lnum:@offs matches, and %0 otherwise. + */ +static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs) +{ + if (zbr->lnum == lnum && zbr->offs == offs) + return 1; + else + return 0; +} + +/** + * resolve_collision_directly - resolve a collision directly. + * @c: UBIFS file-system description object + * @key: key of directory entry + * @zn: znode is passed and returned here + * @n: zbranch number is passed and returned here + * @lnum: LEB number of dent node to match + * @offs: offset of dent node to match + * + * This function is used for "hashed" keys to make sure the found directory or + * extended attribute entry node is what was looked for. It is used when the + * flash address of the right node is known (@lnum:@offs) which makes it much + * easier to resolve collisions (no need to read entries and match full + * names). This function returns %1 and sets @zn and @n if the collision is + * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the + * previous directory entry. Otherwise a negative error code is returned. 
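matches_name() and fallible_matches_name() order colliding entries by a plain memcmp() of the names, with the shorter name ranking first when one is a prefix of the other. A standalone sketch of just that comparison follows; the NAME_* values here are arbitrary placeholders, not the constants used elsewhere in this file:

    #include <string.h>

    enum { NAME_LESS = 0, NAME_MATCHES = 1, NAME_GREATER = 2 };

    /* Order a stored entry name against the name being looked up,
     * the same way matches_name() does. */
    int order_names(const char *stored, size_t stored_len,
                    const char *wanted, size_t wanted_len)
    {
        size_t min_len = stored_len < wanted_len ? stored_len : wanted_len;
        int cmp = memcmp(stored, wanted, min_len);

        if (cmp < 0)
            return NAME_LESS;
        if (cmp > 0)
            return NAME_GREATER;
        if (stored_len == wanted_len)
            return NAME_MATCHES;
        /* Equal prefix: the shorter name sorts first. */
        return stored_len < wanted_len ? NAME_LESS : NAME_GREATER;
    }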
+ */ +static int resolve_collision_directly(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int nn, err; + + znode = *zn; + nn = *n; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + if (matches_position(&znode->zbranch[nn], lnum, offs)) { + *zn = znode; + *n = nn; + return 1; + } + } + + /* Look right */ + znode = *zn; + nn = *n; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + *zn = znode; + *n = nn; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + } +} + +/** + * dirty_cow_bottom_up - dirty a znode and its ancestors. + * @c: UBIFS file-system description object + * @znode: znode to dirty + * + * If we do not have a unique key that resides in a znode, then we cannot + * dirty that znode from the top down (i.e. by using lookup_level0_dirty) + * This function records the path back to the last dirty ancestor, and then + * dirties the znodes on that path. + */ +static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zp; + int *path = c->bottom_up_buf, p = 0; + + ubifs_assert(c->zroot.znode); + ubifs_assert(znode); + if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { + kfree(c->bottom_up_buf); + c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), + GFP_NOFS); + if (!c->bottom_up_buf) + return ERR_PTR(-ENOMEM); + path = c->bottom_up_buf; + } + if (c->zroot.znode->level) { + /* Go up until parent is dirty */ + while (1) { + int n; + + zp = znode->parent; + if (!zp) + break; + n = znode->iip; + ubifs_assert(p < c->zroot.znode->level); + path[p++] = n; + if (!zp->cnext && ubifs_zn_dirty(znode)) + break; + znode = zp; + } + } + + /* Come back down, dirtying as we go */ + while (1) { + struct ubifs_zbranch *zbr; + + zp = znode->parent; + if (zp) { + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < zp->child_cnt); + zbr = &zp->zbranch[path[--p]]; + znode = dirty_cow_znode(c, zbr); + } else { + ubifs_assert(znode == c->zroot.znode); + znode = dirty_cow_znode(c, &c->zroot); + } + if (unlikely(IS_ERR(znode)) || !p) + break; + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < znode->child_cnt); + znode = znode->zbranch[path[p - 1]].znode; + } + + return znode; +} + +/** + * ubifs_lookup_level0 - search for zero-level znode. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain + * @key, then %0 is returned and slot number of the closed branch is stored + * in @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %0 is stored in @n. 
+ * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = zbr->znode; + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * Here is a tricky place. We have not found the key and this is a + * "hashed" key, which may collide. The rest of the code deals with + * situations like this: + * + * | 3 | 5 | + * / \ + * | 3 | 5 | | 6 | 7 | (x) + * + * Or more a complex example: + * + * | 1 | 5 | + * / \ + * | 1 | 3 | | 5 | 8 | + * \ / + * | 5 | 5 | | 6 | 7 | (x) + * + * In the examples, if we are looking for key "5", we may reach nodes + * marked with "(x)". In this case what we have do is to look at the + * left and see if there is "5" key there. If there is, we have to + * return it. + * + * Note, this whole situation is possible because we allow to have + * elements which are equivalent to the next key in the parent in the + * children of current znode. For example, this happens if we split a + * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something + * like this: + * | 3 | 5 | + * / \ + * | 3 | 5 | | 5 | 6 | 7 | + * ^ + * And this becomes what is at the first "picture" after key "5" marked + * with "^" is removed. What could be done is we could prohibit + * splitting in the middle of the colliding sequence. Also, when + * removing the leftmost key, we would have to correct the key of the + * parent node, which would introduce additional complications. Namely, + * if we changed the the leftmost key of the parent znode, the garbage + * collector would be unable to find it (GC is doing this when GC'ing + * indexing LEBs). Although we already have an additional RB-tree where + * we save such changed znodes (see 'ins_clr_old_idx_znode()') until + * after the commit. But anyway, this does not look easy to implement + * so we did not try this. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * lookup_level0_dirty - search for zero-level znode dirtying. 
+ * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and search for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain @key + * then %0 is returned and slot number of the closed branch is stored in + * @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. + * + * Additionally all znodes in the path from the root to the located zero-level + * znode are marked as dirty. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search and dirty key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode = dirty_cow_znode(c, &c->zroot); + if (IS_ERR(znode)) + return PTR_ERR(znode); + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * See huge comment at 'lookup_level0_dirty()' what is the rest of the + * code. + */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. 
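Both lookup routines above lean on ubifs_search_zbranch() returning either an exact hit or the slot of the closest key to the left, with n == -1 meaning even the leftmost key is larger. That helper is not in this hunk, so the following user-space binary search is only an assumed model of those return conventions, inferred from how *n is used above:

    /*
     * Search a sorted key array. Returns 1 and sets *n to the matching slot
     * on an exact hit; otherwise returns 0 and sets *n to the last slot whose
     * key is smaller than @key, or to -1 if all keys are larger.
     */
    int search_slots(const int *keys, int nkeys, int key, int *n)
    {
        int lo = 0, hi = nkeys - 1;

        *n = -1;
        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;

            if (keys[mid] == key) {
                *n = mid;
                return 1;
            }
            if (keys[mid] < key) {
                *n = mid;    /* best "to the left" so far */
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        return 0;
    }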
+ */ +int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, + void *node) +{ + int found, n, err; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr, *zt; + + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out; + } else if (found < 0) { + err = found; + goto out; + } + zt = &znode->zbranch[n]; + if (is_hash_key(c, key)) { + /* + * In this case the leaf node cache gets used, so we pass the + * address of the zbranch and keep the mutex locked + */ + err = tnc_read_node_nm(c, zt, node); + goto out; + } + zbr = znode->zbranch[n]; + mutex_unlock(&c->tnc_mutex); + + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_locate - look up a file-system node and return it and its location. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @lnum: LEB number is returned here + * @offs: offset is returned here + * + * This function is the same as 'ubifs_tnc_lookup()' but it returns the node + * location also. See 'ubifs_tnc_lookup()'. + */ +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs) +{ + int found, n, err; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr, *zt; + + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out; + } else if (found < 0) { + err = found; + goto out; + } + zt = &znode->zbranch[n]; + if (is_hash_key(c, key)) { + /* + * In this case the leaf node cache gets used, so we pass the + * address of the zbranch and keep the mutex locked + */ + *lnum = zt->lnum; + *offs = zt->offs; + err = tnc_read_node_nm(c, zt, node); + goto out; + } + zbr = znode->zbranch[n]; + mutex_unlock(&c->tnc_mutex); + + *lnum = zbr.lnum; + *offs = zbr.offs; + + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * do_lookup_nm- look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int found, n, err; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr; + + dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out_unlock; + } else if (found < 0) { + err = found; + goto out_unlock; + } + + ubifs_assert(n >= 0); + + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + if (err == 0) { + err = -ENOENT; + goto out_unlock; + } + + zbr = znode->zbranch[n]; + mutex_unlock(&c->tnc_mutex); + + err = tnc_read_node_nm(c, &zbr, node); + return err; + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_nm - look up a "hashed" node. 
+ * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int err, len; + const struct ubifs_dent_node *dent = node; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. + */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + len = le16_to_cpu(dent->nlen); + if (nm->len == len && !memcmp(dent->name, nm->name, len)) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them look at each direntry with colliding name hash sequentially. + */ + return do_lookup_nm(c, key, node, nm); +} + +/** + * correct_parent_keys - correct parent znodes' keys. + * @c: UBIFS file-system description object + * @znode: znode to correct parent znodes for + * + * This is a helper function for 'tnc_insert()'. When the key of the leftmost + * zbranch changes, keys of parent znodes have to be corrected. This helper + * function is called in such situations and corrects the keys if needed. + */ +static void correct_parent_keys(const struct ubifs_info *c, + struct ubifs_znode *znode) +{ + union ubifs_key *key, *key1; + + ubifs_assert(znode->parent); + ubifs_assert(znode->iip == 0); + + key = &znode->zbranch[0].key; + key1 = &znode->parent->zbranch[0].key; + + while (keys_cmp(c, key, key1) < 0) { + key_copy(c, key, key1); + znode = znode->parent; + znode->alt = 1; + if (!znode->parent || znode->iip) + break; + key1 = &znode->parent->zbranch[0].key; + } +} + +/** + * insert_zbranch - insert a zbranch into a znode. + * @znode: znode into which to insert + * @zbr: zbranch to insert + * @n: slot number to insert to + * + * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in + * znode's array of zbranches and keeps zbranches consolidated, so when a new + * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th + * slot, zbranches starting from @n have to be moved right. + */ +static void insert_zbranch(struct ubifs_znode *znode, + const struct ubifs_zbranch *zbr, int n) +{ + int i; + + ubifs_assert(ubifs_zn_dirty(znode)); + + if (znode->level) { + for (i = znode->child_cnt; i > n; i--) { + znode->zbranch[i] = znode->zbranch[i - 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + if (zbr->znode) + zbr->znode->iip = n; + } else + for (i = znode->child_cnt; i > n; i--) + znode->zbranch[i] = znode->zbranch[i - 1]; + + znode->zbranch[n] = *zbr; + znode->child_cnt += 1; + + /* + * After inserting at slot zero, the lower bound of the key range of + * this znode may have changed. If this znode is subsequently split + * then the upper bound of the key range may change, and furthermore + * it could change to be lower than the original lower bound. If that + * happens, then it will no longer be possible to find this znode in the + * TNC using the key from the index node on flash. 
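ubifs_tnc_lookup_nm() above tries the cheap keyed lookup first and only falls back to do_lookup_nm(), which walks the colliding entries, when the returned name does not match. Here is a toy user-space sketch of that fast-path/slow-path split over an array of (hash, name) pairs; everything named toy_* is invented for the illustration:

    #include <string.h>

    struct toy_dent {
        unsigned int hash;   /* may collide for different names */
        const char *name;
    };

    /* Slow path: scan every entry with the right hash (cf. do_lookup_nm()). */
    static const struct toy_dent *scan_collisions(const struct toy_dent *tab,
                                                  int n, unsigned int hash,
                                                  const char *name)
    {
        int i;

        for (i = 0; i < n; i++)
            if (tab[i].hash == hash && strcmp(tab[i].name, name) == 0)
                return &tab[i];
        return NULL;
    }

    /* Fast path first: take the first entry with the hash and check its name
     * (cf. ubifs_tnc_lookup_nm() calling ubifs_tnc_lookup() first). */
    const struct toy_dent *lookup_nm(const struct toy_dent *tab, int n,
                                     unsigned int hash, const char *name)
    {
        int i;

        for (i = 0; i < n; i++)
            if (tab[i].hash == hash)
                break;
        if (i == n)
            return NULL;
        if (strcmp(tab[i].name, name) == 0)
            return &tab[i];
        return scan_collisions(tab, n, hash, name);
    }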
That is bad because + * if it is not found, we will assume it is obsolete and may overwrite + * it. Then if there is an unclean unmount, we will start using the + * old index which will be broken. + * + * So we first mark znodes that have insertions at slot zero, and then + * if they are split we add their lnum/offs to the old_idx tree. + */ + if (n == 0) + znode->alt = 1; +} + +/** + * tnc_insert - insert a node into TNC. + * @c: UBIFS file-system description object + * @znode: znode to insert into + * @zbr: branch to insert + * @n: slot number to insert new zbranch to + * + * This function inserts a new node described by @zbr into znode @znode. If + * znode does not have a free slot for new zbranch, it is split. Parent znodes + * are splat as well if needed. Returns zero in case of success or a negative + * error code in case of failure. + */ +static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, + struct ubifs_zbranch *zbr, int n) +{ + struct ubifs_znode *zn, *zi, *zp; + int i, keep, move, appending = 0; + union ubifs_key *key = &zbr->key; + + ubifs_assert(n >= 0 && n <= c->fanout); + + /* Implement naive insert for now */ +again: + zp = znode->parent; + if (znode->child_cnt < c->fanout) { + ubifs_assert(n != c->fanout); + dbg_tnc("inserted at %d level %d, key %s", n, znode->level, + DBGKEY(key)); + + insert_zbranch(znode, zbr, n); + + /* Ensure parent's key is correct */ + if (n == 0 && zp && znode->iip == 0) + correct_parent_keys(c, znode); + + return 0; + } + + /* + * Unfortunately, @znode does not have more empty slots and we have to + * split it. + */ + dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + + if (znode->alt) + /* + * We can no longer be sure of finding this znode by key, so we + * record it in the old_idx tree. + */ + ins_clr_old_idx_znode(c, znode); + + zn = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zn) + return -ENOMEM; + zn->parent = zp; + zn->level = znode->level; + + /* Decide where to split */ + if (znode->level == 0 && n == c->fanout && + key_type(c, key) == UBIFS_DATA_KEY) { + union ubifs_key *key1; + + /* + * If this is an inode which is being appended - do not split + * it because no other zbranches can be inserted between + * zbranches of consecutive data nodes anyway. + */ + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY && + key_block(c, key1) == key_block(c, key) - 1) + appending = 1; + } + + if (appending) { + keep = c->fanout; + move = 0; + } else { + keep = (c->fanout + 1) / 2; + move = c->fanout - keep; + } + + /* + * Although we don't at present, we could look at the neighbors and see + * if we can move some zbranches there. 
+ */ + + if (n < keep) { + /* Insert into existing znode */ + zi = znode; + move += 1; + keep -= 1; + } else { + /* Insert into new znode */ + zi = zn; + n -= keep; + /* Re-parent */ + if (zn->level != 0) + zbr->znode->parent = zn; + } + + __set_bit(DIRTY_ZNODE, &zn->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zn->child_cnt = move; + znode->child_cnt = keep; + + dbg_tnc("moving %d, keeping %d", move, keep); + + /* Move zbranch */ + for (i = 0; i < move; i++) { + zn->zbranch[i] = znode->zbranch[keep + i]; + /* Re-parent */ + if (zn->level != 0) + if (zn->zbranch[i].znode) { + zn->zbranch[i].znode->parent = zn; + zn->zbranch[i].znode->iip = i; + } + } + + /* Insert new key and branch */ + dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + + insert_zbranch(zi, zbr, n); + + /* Insert new znode (produced by spitting) into the parent */ + if (zp) { + i = n; + /* Locate insertion point */ + n = znode->iip + 1; + if (appending && n != c->fanout) + appending = 0; + + if (i == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + + /* Tail recursion */ + zbr->key = zn->zbranch[0].key; + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + znode = zp; + + goto again; + } + + /* We have to split root znode */ + dbg_tnc("creating new zroot at level %d", znode->level + 1); + + zi = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zi) + return -ENOMEM; + + zi->child_cnt = 2; + zi->level = znode->level + 1; + + __set_bit(DIRTY_ZNODE, &zi->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zi->zbranch[0].key = znode->zbranch[0].key; + zi->zbranch[0].znode = znode; + zi->zbranch[0].lnum = c->zroot.lnum; + zi->zbranch[0].offs = c->zroot.offs; + zi->zbranch[0].len = c->zroot.len; + zi->zbranch[1].key = zn->zbranch[0].key; + zi->zbranch[1].znode = zn; + + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + c->zroot.znode = zi; + + zn->parent = zi; + zn->iip = 1; + znode->parent = zi; + znode->iip = 0; + + return 0; +} + +/** + * ubifs_tnc_add - add a node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function adds a node with key @key to TNC. The node may be new or it may + * obsolete some existing one. Returns %0 on success or negative error code on + * failure. + */ +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + } else if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else + err = found; + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + + return err; +} + +/** + * ubifs_tnc_replace - replace a node in the TNC only if the old node is found. 
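When a full znode must be split, tnc_insert() above initially keeps (fanout + 1) / 2 branches and moves the rest to the new sibling, except in the append case where the old znode stays full and the sibling starts empty. The sketch below models only that initial arithmetic (the later keep/move adjustment made when the new branch lands in the old znode is left out); the fanout value in main() is just an example:

    #include <stdio.h>

    struct split { int keep, move; };

    struct split split_counts(int fanout, int appending)
    {
        struct split s;

        if (appending) {
            s.keep = fanout;          /* old znode stays full */
            s.move = 0;               /* new sibling starts empty */
        } else {
            s.keep = (fanout + 1) / 2;
            s.move = fanout - s.keep;
        }
        return s;
    }

    int main(void)
    {
        struct split s = split_counts(8, 0);

        /* For an example fanout of 8: keep 4 branches, move 4. */
        printf("keep %d move %d\n", s.keep, s.move);
        return 0;
    }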
+ * @c: UBIFS file-system description object + * @key: key to add + * @old_lnum: LEB number of old node + * @old_offs: old node offset + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function replaces a node with key @key in the TNC only if the old node + * is found. This function is called by garbage collection when node are moved. + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, + old_offs, lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + found = 0; + if (zbr->lnum == old_lnum && zbr->offs == old_offs) { + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + found = 1; + } else if (is_hash_key(c, key)) { + found = resolve_collision_directly(c, key, &znode, &n, + old_lnum, old_offs); + dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d", + found, znode, n, old_lnum, old_offs); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, + znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + zbr = &znode->zbranch[n]; + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, + zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } + } + } + + if (!found) + err = ubifs_add_dirt(c, lnum, len); + + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_add_nm - add a "hashed" node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * @nm: node name + * + * This is the same as 'ubifs_tnc_add()' but it should be used with keys which + * may have collisions, like directory entry keys. 
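ubifs_tnc_replace() only redirects a zbranch if it still points at the old @lnum:@offs, which is what lets garbage collection move a node without clobbering a newer version that raced in. A small standalone sketch of that compare-before-update rule, with a hypothetical struct loc standing in for the zbranch location fields:

    /* Hypothetical mirror of the location fields of a zbranch. */
    struct loc { int lnum, offs, len; };

    /*
     * Redirect @zbr to @new only if it still refers to @old. Returns 1 if the
     * branch was updated and 0 if the node was replaced in the meantime, in
     * which case the caller just accounts the moved copy at @new as dirty
     * space (cf. ubifs_tnc_replace()).
     */
    int replace_if_unchanged(struct loc *zbr, const struct loc *old,
                             const struct loc *new)
    {
        if (zbr->lnum != old->lnum || zbr->offs != old->offs)
            return 0;
        *zbr = *new;
        return 1;
    }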
+ */ +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, + DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + if (c->replaying) + found = fallible_resolve_collision(c, key, &znode, &n, + nm, 1); + else + found = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n); + if (found < 0) { + err = found; + goto out_unlock; + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + goto out_unlock; + } + } + + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + if (err) + goto out_unlock; + if (c->replaying) { + /* + * We did not find it in the index so there may be a + * dangling branch still in the index. So we remove it + * by passing 'ubifs_tnc_remove_nm()' the same key but + * an unmatchable name. + */ + struct qstr noname = { .len = 0, .name = "" }; + + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + return ubifs_tnc_remove_nm(c, key, &noname); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * tnc_delete - delete a znode form TNC. + * @c: UBIFS file-system description object + * @znode: znode to delete from + * @n: zbranch slot number to delete + * + * This function deletes a leaf node from @n-th slot of @znode. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *zp; + int i, err; + + /* Delete without merge for now */ + ubifs_assert(znode->level == 0); + ubifs_assert(n >= 0 && n < c->fanout); + dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + + zbr = &znode->zbranch[n]; + lnc_free(zbr); + + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) { + dbg_dump_znode(c, znode); + return err; + } + + /* We do not "gap" zbranch slots */ + for (i = n; i < znode->child_cnt - 1; i++) + znode->zbranch[i] = znode->zbranch[i + 1]; + znode->child_cnt -= 1; + + if (znode->child_cnt > 0) + return 0; + + /* + * This was the last zbranch, we have to delete this znode from the + * parent. 
+ */ + + do { + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_dirty(znode)); + + zp = znode->parent; + n = znode->iip; + + atomic_long_dec(&c->dirty_zn_cnt); + + err = insert_old_idx_znode(c, znode); + if (err) + return err; + + if (znode->cnext) { + __set_bit(OBSOLETE_ZNODE, &znode->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(znode); + znode = zp; + } while (znode->child_cnt == 1); /* while removing last child */ + + /* Remove from znode, entry n - 1 */ + znode->child_cnt -= 1; + ubifs_assert(znode->level != 0); + for (i = n; i < znode->child_cnt; i++) { + znode->zbranch[i] = znode->zbranch[i + 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + + /* + * If this is the root and it has only 1 child then + * collapse the tree. + */ + if (!znode->parent) { + while (znode->child_cnt == 1 && znode->level != 0) { + zp = znode; + zbr = &znode->zbranch[0]; + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode->parent = NULL; + znode->iip = 0; + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + if (err) + return err; + } + c->zroot.lnum = zbr->lnum; + c->zroot.offs = zbr->offs; + c->zroot.len = zbr->len; + c->zroot.znode = znode; + ubifs_assert(!test_bit(OBSOLETE_ZNODE, + &zp->flags)); + ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + atomic_long_dec(&c->dirty_zn_cnt); + + if (zp->cnext) { + __set_bit(OBSOLETE_ZNODE, &zp->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(zp); + } + } + + return 0; +} + +/** + * ubifs_tnc_remove - remove an index entry of a node. + * @c: UBIFS file-system description object + * @key: key of node + * + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("key %s", DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + if (found == 1) + err = tnc_delete(c, znode, n); + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node. + * @c: UBIFS file-system description object + * @key: key of node + * @nm: directory entry name + * + * Returns %0 on success or negative error code on failure. 
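tnc_delete() above removes a branch by shifting the following zbranches left, so the array stays dense, and then walks up the tree releasing znodes that became empty. The sketch below covers only the shift step; the parent walk and the root-collapse case are left out:

    #include <string.h>

    /*
     * Remove slot @n from an array of @cnt branches, keeping the array dense.
     * Returns the new count. @size is the element size in bytes.
     */
    int remove_slot(void *branches, int cnt, int n, size_t size)
    {
        char *base = branches;

        memmove(base + (size_t)n * size,
                base + (size_t)(n + 1) * size,
                (size_t)(cnt - n - 1) * size);
        return cnt - 1;
    }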
+ */ +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm) +{ + int n, err; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + err = lookup_level0_dirty(c, key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (err < 0) + goto out_unlock; + if (err) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + err = tnc_delete(c, znode, n); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * key_in_range - determine if a key falls within a range of keys. + * @c: UBIFS file-system description object + * @key: key to check + * @from_key: lowest key in range + * @to_key: highest key in range + * + * This function returns %1 if the key is in range and %0 otherwise. + */ +static int key_in_range(struct ubifs_info *c, union ubifs_key *key, + union ubifs_key *from_key, union ubifs_key *to_key) +{ + if (keys_cmp(c, key, from_key) < 0) + return 0; + if (keys_cmp(c, key, to_key) > 0) + return 0; + return 1; +} + +/** + * ubifs_tnc_remove_range - remove index entries in range. + * @c: UBIFS file-system description object + * @from_key: lowest key to remove + * @to_key: highest key to remove + * + * This function removes index entries starting at @from_key and ending at + * @to_key. This function returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key) +{ + int i, n, k, err = 0; + struct ubifs_znode *znode; + union ubifs_key *key; + + mutex_lock(&c->tnc_mutex); + while (1) { + /* Find first level 0 znode that contains keys to remove */ + err = ubifs_lookup_level0(c, from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) + key = from_key; + else { + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, from_key, to_key)) { + err = 0; + goto out_unlock; + } + } + + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + /* Remove all keys in range except the first */ + for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) { + key = &znode->zbranch[i].key; + if (!key_in_range(c, key, from_key, to_key)) + break; + lnc_free(&znode->zbranch[i]); + err = ubifs_add_dirt(c, znode->zbranch[i].lnum, + znode->zbranch[i].len); + if (err) { + dbg_dump_znode(c, znode); + goto out_unlock; + } + dbg_tnc("removing %s", DBGKEY(key)); + } + if (k) { + for (i = n + 1 + k; i < znode->child_cnt; i++) + znode->zbranch[i - k] = znode->zbranch[i]; + znode->child_cnt -= k; + } + + /* Now delete the first */ + err = tnc_delete(c, znode, n); + if (err) + goto out_unlock; + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_ino - remove an inode from TNC. 
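ubifs_tnc_remove_range() repeatedly looks up @from_key and deletes every following entry whose key passes the inclusive key_in_range() test. A user-space sketch of the same inclusive test and sweep, with plain integers standing in for UBIFS keys (the real loop also has to dirty znodes and restart the lookup after deleting):

    int in_range(int key, int from, int to)
    {
        return key >= from && key <= to;    /* both bounds inclusive */
    }

    /* Remove every key in [from, to] from a sorted array, returning the new
     * length (cf. the loop in ubifs_tnc_remove_range()). */
    int remove_range(int *keys, int nkeys, int from, int to)
    {
        int i, out = 0;

        for (i = 0; i < nkeys; i++)
            if (!in_range(keys[i], from, to))
                keys[out++] = keys[i];
        return out;
    }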
+ * @c: UBIFS file-system description object + * @inum: inode number to remove + * + * This function remove inode @inum and all the extended attributes associated + * with the anode from TNC and returns zero in case of success or a negative + * error code in case of failure. + */ +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) +{ + union ubifs_key key1, key2; + struct ubifs_dent_node *xent, *pxent = NULL; + struct qstr nm = { .name = NULL }; + + dbg_tnc("ino %lu", inum); + + /* + * Walk all extended attribute entries and remove them together with + * corresponding extended attribute inodes. + */ + lowest_xent_key(c, &key1, inum); + while (1) { + ino_t xattr_inum; + int err; + + xent = ubifs_tnc_next_ent(c, &key1, &nm); + if (IS_ERR(xent)) { + err = PTR_ERR(xent); + if (err == -ENOENT) + break; + return err; + } + + xattr_inum = le64_to_cpu(xent->inum); + dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum); + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + err = ubifs_tnc_remove_nm(c, &key1, &nm); + if (err) { + kfree(xent); + return err; + } + + lowest_ino_key(c, &key1, xattr_inum); + highest_ino_key(c, &key2, xattr_inum); + err = ubifs_tnc_remove_range(c, &key1, &key2); + if (err) { + kfree(xent); + return err; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key1); + } + + kfree(pxent); + lowest_ino_key(c, &key1, inum); + highest_ino_key(c, &key2, inum); + + return ubifs_tnc_remove_range(c, &key1, &key2); +} + +/** + * ubifs_tnc_next_ent - walk directory or extended attribute entries. + * @c: UBIFS file-system description object + * @key: key of last entry + * @nm: name of last entry found or %NULL + * + * This function finds and reads the next directory or extended attribute entry + * after the given key (@key) if there is one. @nm is used to resolve + * collisions. + * + * If the name of the current entry is not known and only the key is known, + * @nm->name has to be %NULL. In this case the semantics of this function is a + * little bit different and it returns the entry corresponding to this key, not + * the next one. If the key was not found, the closest "right" entry is + * returned. + * + * If the fist entry has to be found, @key has to contain the lowest possible + * key value for this inode and @name has to be %NULL. + * + * This function returns the found directory or extended attribute entry node + * in case of success, %-ENOENT is returned if no entry was found, and a + * negative error code is returned in case of failure. + */ +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_dent_node *dent; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey; + + dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + ubifs_assert(is_hash_key(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + if (nm->name) { + if (err) { + /* Handle collisions */ + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", + err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + } + + /* Now find next entry */ + err = tnc_next(c, &znode, &n); + if (unlikely(err)) + goto out_unlock; + } else { + /* + * The full name of the entry was not given, in which case the + * behavior of this function is a little different and it + * returns current entry, not the next one. 
+ */ + if (!err) { + /* + * However, the given key does not exist in the TNC + * tree and @znode/@n variables contain the closest + * "preceding" element. Switch to the next one. + */ + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + } + + zbr = &znode->zbranch[n]; + dent = kmalloc(zbr->len, GFP_NOFS); + if (unlikely(!dent)) { + err = -ENOMEM; + goto out_unlock; + } + + /* + * The above 'tnc_next()' call could lead us to the next inode, check + * this. + */ + dkey = &zbr->key; + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_free; + } + + err = tnc_read_node_nm(c, zbr, dent); + if (unlikely(err)) + goto out_free; + + mutex_unlock(&c->tnc_mutex); + return dent; + +out_free: + kfree(dent); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return ERR_PTR(err); +} + +/** + * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit. + * @c: UBIFS file-system description object + * + * Destroy left-over obsolete znodes from a failed commit. + */ +static void tnc_destroy_cnext(struct ubifs_info *c) +{ + struct ubifs_znode *cnext; + + if (!c->cnext) + return; + ubifs_assert(c->cmt_state == COMMIT_BROKEN); + cnext = c->cnext; + do { + struct ubifs_znode *znode = cnext; + + cnext = cnext->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + } while (cnext && cnext != c->cnext); +} + +/** + * ubifs_tnc_close - close TNC subsystem and free all related resources. + * @c: UBIFS file-system description object + */ +void ubifs_tnc_close(struct ubifs_info *c) +{ + long clean_freed; + + tnc_destroy_cnext(c); + if (c->zroot.znode) { + clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode); + atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt); + } + kfree(c->gap_lebs); + kfree(c->ilebs); + destroy_old_idx(c); +} + +/** + * left_znode - get the znode to the left. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the left of @znode or NULL if + * there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *left_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip - 1; + + /* Go up until we can go left */ + znode = znode->parent; + if (!znode) + return NULL; + if (n >= 0) { + /* Now go down the rightmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + n = znode->child_cnt - 1; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * right_znode - get the znode to the right. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the right of @znode or NULL + * if there is not one. A negative error code is returned on failure. 
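ubifs_tnc_next_ent() is the readdir cursor: it positions on the last returned entry (or on the lowest key when no name is given), steps to the next entry, and stops as soon as the key belongs to a different inode or key type. The following is only a loose user-space model of that cursor over a sorted toy directory; struct toy_ent and next_ent() are invented for the illustration:

    #include <string.h>

    struct toy_ent {
        unsigned long inum;   /* owning directory inode */
        unsigned int hash;    /* name hash, may collide */
        const char *name;
    };

    /*
     * Return the entry after (@hash, @name) belonging to @inum, or the first
     * entry with hash >= @hash when @name is NULL, or NULL at the end of the
     * directory (cf. ubifs_tnc_next_ent()). @tab must be sorted by
     * (inum, hash, name).
     */
    const struct toy_ent *next_ent(const struct toy_ent *tab, int n,
                                   unsigned long inum, unsigned int hash,
                                   const char *name)
    {
        int i;

        for (i = 0; i < n; i++) {
            if (tab[i].inum != inum || tab[i].hash < hash)
                continue;
            if (!name)
                return &tab[i];            /* "lowest key" lookup */
            if (tab[i].hash == hash && strcmp(tab[i].name, name) <= 0)
                continue;                  /* not past the cursor yet */
            return &tab[i];
        }
        return NULL;
    }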
+ */ +static struct ubifs_znode *right_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip + 1; + + /* Go up until we can go right */ + znode = znode->parent; + if (!znode) + return NULL; + if (n < znode->child_cnt) { + /* Now go down the leftmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * lookup_znode - find a particular indexing node from TNC. + * @c: UBIFS file-system description object + * @key: index node key to lookup + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function searches an indexing node by its first key @key and its + * address @lnum:@offs. It looks up the indexing tree by pulling all indexing + * nodes it traverses to TNC. This function is called fro indexing nodes which + * were found on the media by scanning, for example when garbage-collecting or + * when doing in-the-gaps commit. This means that the indexing node which is + * looked for does not have to have exactly the same leftmost key @key, because + * the leftmost key may have been changed, in which case TNC will contain a + * dirty znode which still refers the same @lnum:@offs. This function is clever + * enough to recognize such indexing nodes. + * + * Note, if a znode was deleted or changed too much, then this function will + * not find it. For situations like this UBIFS has the old index RB-tree + * (indexed by @lnum:@offs). + * + * This function returns a pointer to the znode found or %NULL if it is not + * found. A negative error code is returned on failure. + */ +static struct ubifs_znode *lookup_znode(struct ubifs_info *c, + union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode, *zn; + int n, nn; + + /* + * The arguments have probably been read off flash, so don't assume + * they are valid. + */ + if (level < 0) + return ERR_PTR(-EINVAL); + + /* Get the root znode */ + znode = c->zroot.znode; + if (!znode) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return znode; + } + /* Check if it is the one we are looking for */ + if (c->zroot.lnum == lnum && c->zroot.offs == offs) + return znode; + /* Descend to the parent level i.e. (level + 1) */ + if (level >= znode->level) + return NULL; + while (1) { + ubifs_search_zbranch(c, znode, key, &n); + if (n < 0) { + /* + * We reached a znode where the leftmost key is greater + * than the key we are searching for. This is the same + * situation as the one described in a huge comment at + * the end of the 'ubifs_lookup_level0()' function. And + * for exactly the same reasons we have to try to look + * left before giving up. + */ + znode = left_znode(c, znode); + if (!znode) + return NULL; + if (IS_ERR(znode)) + return znode; + ubifs_search_zbranch(c, znode, key, &n); + ubifs_assert(n >= 0); + } + if (znode->level == level + 1) + break; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + /* Check if the child is the one we are looking for */ + if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* If the key is unique, there is nowhere else to look */ + if (!is_hash_key(c, key)) + return NULL; + /* + * The key is not unique and so may be also in the znodes to either + * side. 
+ */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + /* Move one branch to the left */ + if (n) + n -= 1; + else { + znode = left_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = znode->child_cnt - 1; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is less than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) < 0) + break; + } + /* Back to the middle */ + znode = zn; + n = nn; + /* Look right */ + while (1) { + /* Move one branch to the right */ + if (++n >= znode->child_cnt) { + znode = right_znode(c, znode); + if (!znode) + break; + if (IS_ERR(znode)) + return znode; + n = 0; + } + /* Check it */ + if (znode->zbranch[n].lnum == lnum && + znode->zbranch[n].offs == offs) + return get_znode(c, znode, n); + /* Stop if the key is greater than the one we are looking for */ + if (keys_cmp(c, &znode->zbranch[n].key, key) > 0) + break; + } + return NULL; +} + +/** + * is_idx_node_in_tnc - determine if an index node is in the TNC. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * This function returns %0 if the index node is not referred to in the TNC, %1 + * if the index node is referred to in the TNC and the corresponding znode is + * dirty, %2 if an index node is referred to in the TNC and the corresponding + * znode is clean, and a negative error code in case of failure. + * + * Note, the @key argument has to be the key of the first child. Also note, + * this function relies on the fact that 0:0 is never a valid LEB number and + * offset for a main-area node. + */ +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + return 0; + if (IS_ERR(znode)) + return PTR_ERR(znode); + + return ubifs_zn_dirty(znode) ? 1 : 2; +} + +/** + * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @lnum: node LEB number + * @offs: node offset + * + * This function returns %1 if the node is referred to in the TNC, %0 if it is + * not, and a negative error code in case of failure. + * + * Note, this function relies on the fact that 0:0 is never a valid LEB number + * and offset for a main-area node. 
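The look-left/look-right scans here, and again in is_leaf_node_in_tnc() below, exist because hashed keys can collide. The idea in isolation, using a plain sorted array with made-up field names instead of znode branches:

	#include <stddef.h>

	struct entry {
		int key;	/* stands in for the (possibly colliding) hashed key */
		int target;	/* stands in for the lnum:offs the caller knows */
	};

	/*
	 * After a search lands on slot @pos for @key, the entry with the
	 * wanted target may sit on either side of @pos among the equal keys,
	 * so scan both directions until the key changes.
	 */
	static struct entry *find_colliding(struct entry *e, int cnt, int pos,
					    int key, int target)
	{
		int i;

		for (i = pos; i >= 0 && e[i].key == key; i--)
			if (e[i].target == target)
				return &e[i];
		for (i = pos + 1; i < cnt && e[i].key == key; i++)
			if (e[i].target == target)
				return &e[i];
		return NULL;
	}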
+ */ +static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, + int lnum, int offs) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *znode, *zn; + int n, found, err, nn; + const int unique = !is_hash_key(c, key); + + found = ubifs_lookup_level0(c, key, &znode, &n); + if (found < 0) + return found; /* Error code */ + if (!found) + return 0; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + if (unique) + return 0; + /* + * Because the key is not unique, we have to look left + * and right as well + */ + zn = znode; + nn = n; + /* Look left */ + while (1) { + err = tnc_prev(c, &znode, &n); + if (err == -ENOENT) + break; + if (err) + return err; + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + /* Look right */ + znode = zn; + n = nn; + while (1) { + err = tnc_next(c, &znode, &n); + if (err) { + if (err == -ENOENT) + return 0; + return err; + } + if (keys_cmp(c, key, &znode->zbranch[n].key)) + break; + zbr = &znode->zbranch[n]; + if (lnum == zbr->lnum && offs == zbr->offs) + return 1; /* Found it */ + } + return 0; +} + +/** + * ubifs_tnc_has_node - determine whether a node is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @level: index node level (if it is an index node) + * @lnum: node LEB number + * @offs: node offset + * @is_idx: non-zero if the node is an index node + * + * This function returns %1 if the node is in the TNC, %0 if it is not, and a + * negative error code in case of failure. For index nodes, @key has to be the + * key of the first child. An index node is considered to be in the TNC only if + * the corresponding znode is clean or has not been loaded. + */ +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx) +{ + int err; + + mutex_lock(&c->tnc_mutex); + if (is_idx) { + err = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (err < 0) + goto out_unlock; + if (err == 1) + /* The index node was found but it was dirty */ + err = 0; + else if (err == 2) + /* The index node was found and it was clean */ + err = 1; + else + BUG_ON(err != 0); + } else + err = is_leaf_node_in_tnc(c, key, lnum, offs); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_dirty_idx_node - dirty an index node. + * @c: UBIFS file-system description object + * @key: index node key + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function loads and dirties an index node so that it can be garbage + * collected. The @key argument has to be the key of the first child. This + * function relies on the fact that 0:0 is never a valid LEB number and offset + * for a main-area node. Returns %0 on success and a negative error code on + * failure. 
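As a hypothetical caller sketch (not code from this patch), this is roughly how an index garbage collector could combine the two helpers documented here: ask whether a scanned index node is still referenced, and if it is still clean in the TNC, dirty it so that the next commit writes it to a new location.

	#include "ubifs.h"

	static int move_idx_node(struct ubifs_info *c, union ubifs_key *key,
				 int level, int lnum, int offs)
	{
		int err;

		err = ubifs_tnc_has_node(c, key, level, lnum, offs, 1);
		if (err < 0)
			return err;
		if (err == 0)
			return 0;	/* obsolete, or already dirty in the TNC */

		/* Still clean in the TNC - force it to be rewritten */
		return ubifs_dirty_idx_node(c, key, level, lnum, offs);
	}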
+ */ +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int err = 0; + + mutex_lock(&c->tnc_mutex); + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + goto out_unlock; + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c new file mode 100644 index 00000000000..8117e65ba2e --- /dev/null +++ b/fs/ubifs/tnc_commit.c @@ -0,0 +1,1103 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* This file implements TNC functions for committing */ + +#include "ubifs.h" + +/** + * make_idx_node - make an index node for fill-the-gaps method of TNC commit. + * @c: UBIFS file-system description object + * @idx: buffer in which to place new index node + * @znode: znode from which to make new index node + * @lnum: LEB number where new index node will be written + * @offs: offset where new index node will be written + * @len: length of new index node + */ +static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, + struct ubifs_znode *znode, int lnum, int offs, int len) +{ + struct ubifs_znode *zp; + int i, err; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + ubifs_prepare_node(c, idx, len, 0); + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + err = insert_old_idx_znode(c, znode); + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + + zbr = &zp->zbranch[znode->iip]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + atomic_long_dec(&c->dirty_zn_cnt); + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + __clear_bit(DIRTY_ZNODE, &znode->flags); + __clear_bit(COW_ZNODE, &znode->flags); + + return err; +} + +/** + * fill_gap - make index nodes in gaps in dirty index 
LEBs. + * @c: UBIFS file-system description object + * @lnum: LEB number that gap appears in + * @gap_start: offset of start of gap + * @gap_end: offset of end of gap + * @dirt: adds dirty space to this + * + * This function returns the number of index nodes written into the gap. + */ +static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end, + int *dirt) +{ + int len, gap_remains, gap_pos, written, pad_len; + + ubifs_assert((gap_start & 7) == 0); + ubifs_assert((gap_end & 7) == 0); + ubifs_assert(gap_end >= gap_start); + + gap_remains = gap_end - gap_start; + if (!gap_remains) + return 0; + gap_pos = gap_start; + written = 0; + while (c->enext) { + len = ubifs_idx_node_sz(c, c->enext->child_cnt); + if (len < gap_remains) { + struct ubifs_znode *znode = c->enext; + const int alen = ALIGN(len, 8); + int err; + + ubifs_assert(alen <= gap_remains); + err = make_idx_node(c, c->ileb_buf + gap_pos, znode, + lnum, gap_pos, len); + if (err) + return err; + gap_remains -= alen; + gap_pos += alen; + c->enext = znode->cnext; + if (c->enext == c->cnext) + c->enext = NULL; + written += 1; + } else + break; + } + if (gap_end == c->leb_size) { + c->ileb_len = ALIGN(gap_pos, c->min_io_size); + /* Pad to end of min_io_size */ + pad_len = c->ileb_len - gap_pos; + } else + /* Pad to end of gap */ + pad_len = gap_remains; + dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d", + lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len); + ubifs_pad(c, c->ileb_buf + gap_pos, pad_len); + *dirt += pad_len; + return written; +} + +/** + * find_old_idx - find an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %1 if found and %0 otherwise. + */ +static int find_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *o; + struct rb_node *p; + + p = c->old_idx.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = p->rb_left; + else if (lnum > o->lnum) + p = p->rb_right; + else if (offs < o->offs) + p = p->rb_left; + else if (offs > o->offs) + p = p->rb_right; + else + return 1; + } + return 0; +} + +/** + * is_idx_node_in_use - determine if an index node can be overwritten. + * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * If @key / @lnum / @offs identify an index node that was not part of the old + * index, then this function returns %0 (obsolete). Else if the index node was + * part of the old index but is now dirty %1 is returned, else if it is clean %2 + * is returned. A negative error code is returned on failure. + */ +static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key, + int level, int lnum, int offs) +{ + int ret; + + ret = is_idx_node_in_tnc(c, key, level, lnum, offs); + if (ret < 0) + return ret; /* Error code */ + if (ret == 0) + if (find_old_idx(c, lnum, offs)) + return 1; + return ret; +} + +/** + * layout_leb_in_gaps - layout index nodes using in-the-gaps method. + * @c: UBIFS file-system description object + * @p: return LEB number here + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * This function merely puts the next znode into the next gap, making no attempt + * to try to maximise the number of znodes that fit. 
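The packing arithmetic in fill_gap() above boils down to consuming ALIGN(len, 8) bytes of the gap per index node and padding whatever is left. A tiny standalone illustration with made-up node lengths and gap offsets:

	#include <stdio.h>

	#define ALIGN8(x) (((x) + 7) & ~7)

	int main(void)
	{
		int lens[] = { 188, 188, 60 };		/* hypothetical node lengths */
		int gap_start = 1024, gap_end = 1536;	/* both 8-byte aligned */
		int gap_pos = gap_start, written = 0;
		unsigned int i;

		for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
			int alen = ALIGN8(lens[i]);

			if (alen > gap_end - gap_pos)
				break;
			gap_pos += alen;
			written++;
		}
		printf("wrote %d nodes, padded %d bytes\n",
		       written, gap_end - gap_pos);	/* 3 nodes, 64 bytes */
		return 0;
	}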
+ * This function returns the number of index nodes written into the gaps, or a + * negative error code on failure. + */ +static int layout_leb_in_gaps(struct ubifs_info *c, int *p) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written; + + tot_written = 0; + /* Get an index LEB with lots of obsolete index nodes */ + lnum = ubifs_find_dirty_idx_leb(c); + if (lnum < 0) + /* + * There also may be dirt in the index head that could be + * filled, however we do not check there at present. + */ + return lnum; /* Error code */ + *p = lnum; + dbg_gc("LEB %d", lnum); + /* + * Scan the index LEB. We use the generic scan for this even though + * it is more comprehensive and less efficient than is needed for this + * purpose. + */ + sleb = ubifs_scan(c, lnum, 0, c->ileb_buf); + c->ileb_len = 0; + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + gap_start = 0; + list_for_each_entry(snod, &sleb->nodes, list) { + struct ubifs_idx_node *idx; + int in_use, level; + + ubifs_assert(snod->type == UBIFS_IDX_NODE); + idx = snod->node; + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + /* Determine if the index node is in use (not obsolete) */ + in_use = is_idx_node_in_use(c, &snod->key, level, lnum, + snod->offs); + if (in_use < 0) { + ubifs_scan_destroy(sleb); + return in_use; /* Error code */ + } + if (in_use) { + if (in_use == 1) + dirt += ALIGN(snod->len, 8); + /* + * The obsolete index nodes form gaps that can be + * overwritten. This gap has ended because we have + * found an index node that is still in use + * i.e. not obsolete + */ + gap_end = snod->offs; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) { + ubifs_scan_destroy(sleb); + return written; /* Error code */ + } + tot_written += written; + gap_start = ALIGN(snod->offs + snod->len, 8); + } + } + ubifs_scan_destroy(sleb); + c->ileb_len = c->leb_size; + gap_end = c->leb_size; + /* Try to fill gap */ + written = fill_gap(c, lnum, gap_start, gap_end, &dirt); + if (written < 0) + return written; /* Error code */ + tot_written += written; + if (tot_written == 0) { + struct ubifs_lprops lp; + + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + return err; + if (lp.free == c->leb_size) { + /* + * We must have snatched this LEB from the idx_gc list + * so we need to correct the free and dirty space. + */ + err = ubifs_change_one_lp(c, lnum, + c->leb_size - c->ileb_len, + dirt, 0, 0, 0); + if (err) + return err; + } + return 0; + } + err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt, + 0, 0, 0); + if (err) + return err; + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len, + UBI_SHORTTERM); + if (err) + return err; + dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); + return tot_written; +} + +/** + * get_leb_cnt - calculate the number of empty LEBs needed to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns the number of empty LEBs needed to commit @cnt znodes + * to the current index head. The number is not exact and may be more than + * needed. + */ +static int get_leb_cnt(struct ubifs_info *c, int cnt) +{ + int d; + + /* Assume maximum index node size (i.e. 
overestimate space needed) */ + cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz; + if (cnt < 0) + cnt = 0; + d = c->leb_size / c->max_idx_node_sz; + return DIV_ROUND_UP(cnt, d); +} + +/** + * layout_in_gaps - in-the-gaps method of committing TNC. + * @c: UBIFS file-system description object + * @cnt: number of dirty znodes to commit. + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_gaps(struct ubifs_info *c, int cnt) +{ + int err, leb_needed_cnt, written, *p; + + dbg_gc("%d znodes to write", cnt); + + c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); + if (!c->gap_lebs) + return -ENOMEM; + + p = c->gap_lebs; + do { + ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + written = layout_leb_in_gaps(c, p); + if (written < 0) { + err = written; + if (err == -ENOSPC) { + if (!dbg_force_in_the_gaps_enabled) { + /* + * Do not print scary warnings if the + * debugging option which forces + * in-the-gaps is enabled. + */ + ubifs_err("out of space"); + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; + } + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; + } + p++; + cnt -= written; + leb_needed_cnt = get_leb_cnt(c, cnt); + dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, + leb_needed_cnt, c->ileb_cnt); + } while (leb_needed_cnt > c->ileb_cnt); + + *p = -1; + return 0; +} + +/** + * layout_in_empty_space - layout index nodes in empty space. + * @c: UBIFS file-system description object + * + * This function lays out new index nodes for dirty znodes using empty LEBs. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_empty_space(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext, *zp; + int lnum, offs, len, next_len, buf_len, buf_offs, used, avail; + int wlen, blen, err; + + cnext = c->enext; + if (!cnext) + return 0; + + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + buf_len = ubifs_idx_node_sz(c, c->fanout); + buf_len = ALIGN(buf_len, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) + lnum = -1; + + while (1) { + znode = cnext; + + len = ubifs_idx_node_sz(c, znode->child_cnt); + + /* Determine the index node position */ + if (lnum == -1) { + if (c->ileb_nxt >= c->ileb_cnt) { + ubifs_err("out of space"); + return -ENOSPC; + } + lnum = c->ilebs[c->ileb_nxt++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + znode->lnum = lnum; + znode->offs = offs; + znode->len = len; +#endif + + /* Update the parent */ + zp = znode->parent; + if (zp) { + struct ubifs_zbranch *zbr; + int i; + + i = znode->iip; + zbr = &zp->zbranch[i]; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else { + c->zroot.lnum = lnum; + c->zroot.offs = offs; + c->zroot.len = len; + } + c->calc_idx_sz += ALIGN(len, 8); + + /* + * Once lprops is updated, we can decrease the dirty znode count + * but it is easier to just do it here. 
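To put rough numbers on get_leb_cnt() above (all values purely illustrative): with a 128 KiB LEB, a maximum index node size of 192 bytes, an empty index head (c->ihead_offs == 0) and 1000 dirty znodes, the index head alone can hold 131072 / 192 = 682 nodes, leaving 318; at 682 nodes per empty LEB that rounds up to a single extra LEB. Because this is only an upper-bound estimate, layout_in_gaps() above re-evaluates it after every LEB it manages to fill in place.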
+ */ + atomic_long_dec(&c->dirty_zn_cnt); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + cnext = znode->cnext; + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + buf_offs += ALIGN(len, 8); + if (next_len) { + if (buf_offs + next_len <= c->leb_size) + continue; + err = ubifs_update_one_lp(c, lnum, 0, + c->leb_size - buf_offs, 0, 0); + if (err) + return err; + lnum = -1; + continue; + } + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, 0, 0, 0); + if (err) + return err; + break; + } + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + if (next_len != 0 && + buf_offs + used + next_len <= c->leb_size && + avail > 0) + continue; + + if (avail <= 0 && next_len && + buf_offs + used + next_len <= c->leb_size) + blen = buf_len; + else + blen = ALIGN(wlen, c->min_io_size); + + /* The buffer is full or there are no more znodes to do */ + buf_offs += blen; + if (next_len) { + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + c->leb_size - buf_offs, blen - used, + 0, 0); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + continue; + } + err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs, + blen - used, 0, 0); + if (err) + return err; + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + c->new_ihead_lnum = lnum; + c->new_ihead_offs = buf_offs; +#endif + + return 0; +} + +/** + * layout_commit - determine positions of index nodes to commit. + * @c: UBIFS file-system description object + * @no_space: indicates that insufficient empty LEBs were allocated + * @cnt: number of znodes to commit + * + * Calculate and update the positions of index nodes to commit. If there were + * an insufficient number of empty LEBs allocated, then index nodes are placed + * into the gaps created by obsolete index nodes in non-empty index LEBs. For + * this purpose, an obsolete index node is one that was not in the index as at + * the end of the last commit. To write "in-the-gaps" requires that those index + * LEBs are updated atomically in-place. + */ +static int layout_commit(struct ubifs_info *c, int no_space, int cnt) +{ + int err; + + if (no_space) { + err = layout_in_gaps(c, cnt); + if (err) + return err; + } + err = layout_in_empty_space(c); + return err; +} + +/** + * find_first_dirty - find first dirty znode. + * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode) +{ + int i, cont; + + if (!znode) + return NULL; + + while (1) { + if (znode->level == 0) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + cont = 0; + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) { + znode = zbr->znode; + cont = 1; + break; + } + } + if (!cont) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + } +} + +/** + * find_next_dirty - find next dirty znode. 
+ * @znode: znode to begin searching from + */ +static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode) +{ + int n = znode->iip + 1; + + znode = znode->parent; + if (!znode) + return NULL; + for (; n < znode->child_cnt; n++) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) + return find_first_dirty(zbr->znode); + } + return znode; +} + +/** + * get_znodes_to_commit - create list of dirty znodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of znodes to commit. + */ +static int get_znodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + int cnt = 0; + + c->cnext = find_first_dirty(c->zroot.znode); + znode = c->enext = c->cnext; + if (!znode) { + dbg_cmt("no znodes to commit"); + return 0; + } + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + __set_bit(COW_ZNODE, &znode->flags); + znode->alt = 0; + cnext = find_next_dirty(znode); + if (!cnext) { + znode->cnext = c->cnext; + break; + } + znode->cnext = cnext; + znode = cnext; + cnt += 1; + } + dbg_cmt("committing %d znodes", cnt); + ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt)); + return cnt; +} + +/** + * alloc_idx_lebs - allocate empty LEBs to be used to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns %-ENOSPC if it cannot allocate a sufficient number of + * empty LEBs. %0 is returned on success, otherwise a negative error code + * is returned. + */ +static int alloc_idx_lebs(struct ubifs_info *c, int cnt) +{ + int i, leb_cnt, lnum; + + c->ileb_cnt = 0; + c->ileb_nxt = 0; + leb_cnt = get_leb_cnt(c, cnt); + dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); + if (!leb_cnt) + return 0; + c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); + if (!c->ilebs) + return -ENOMEM; + for (i = 0; i < leb_cnt; i++) { + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) + return lnum; + c->ilebs[c->ileb_cnt++] = lnum; + dbg_cmt("LEB %d", lnum); + } + if (dbg_force_in_the_gaps()) + return -ENOSPC; + return 0; +} + +/** + * free_unused_idx_lebs - free unused LEBs that were allocated for the commit. + * @c: UBIFS file-system description object + * + * It is possible that we allocate more empty LEBs for the commit than we need. + * This functions frees the surplus. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_unused_idx_lebs(struct ubifs_info *c) +{ + int i, err = 0, lnum, er; + + for (i = c->ileb_nxt; i < c->ileb_cnt; i++) { + lnum = c->ilebs[i]; + dbg_cmt("LEB %d", lnum); + er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX | LPROPS_TAKEN, 0); + if (!err) + err = er; + } + return err; +} + +/** + * free_idx_lebs - free unused LEBs after commit end. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +static int free_idx_lebs(struct ubifs_info *c) +{ + int err; + + err = free_unused_idx_lebs(c); + kfree(c->ilebs); + c->ilebs = NULL; + return err; +} + +/** + * ubifs_tnc_start_commit - start TNC commit. + * @c: UBIFS file-system description object + * @zroot: new index root position is returned here + * + * This function prepares the list of indexing nodes to commit and lays out + * their positions on flash. If there is not enough free space it uses the + * in-gap commit method. 
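The list built by get_znodes_to_commit() above is circular: the ->cnext pointer of the last dirty znode points back to the head stored in c->cnext. That is why consumers such as tnc_destroy_cnext() and free_obsolete_znodes() walk it with a do/while loop. Stripped to its essentials (hypothetical minimal type):

	struct cnode {
		struct cnode *cnext;
	};

	static void walk_ring(struct cnode *head)
	{
		struct cnode *cur = head;

		do {
			/* Fetch the link first - processing may free cur */
			struct cnode *next = cur->cnext;

			/* ... process cur ... */
			cur = next;
		} while (cur != head);
	}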
Returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int err = 0, cnt; + + mutex_lock(&c->tnc_mutex); + err = dbg_check_tnc(c, 1); + if (err) + goto out; + cnt = get_znodes_to_commit(c); + if (cnt != 0) { + int no_space = 0; + + err = alloc_idx_lebs(c, cnt); + if (err == -ENOSPC) + no_space = 1; + else if (err) + goto out_free; + err = layout_commit(c, no_space, cnt); + if (err) + goto out_free; + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + err = free_unused_idx_lebs(c); + if (err) + goto out; + } + destroy_old_idx(c); + memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch)); + + err = ubifs_save_dirty_idx_lnums(c); + if (err) + goto out; + + spin_lock(&c->space_lock); + /* + * Although we have not finished committing yet, update size of the + * committed index ('c->old_idx_sz') and zero out the index growth + * budget. It is OK to do this now, because we've reserved all the + * space which is needed to commit the index, and it is save for the + * budgeting subsystem to assume the index is already committed, + * even though it is not. + */ + c->old_idx_sz = c->calc_idx_sz; + c->budg_uncommitted_idx = 0; + spin_unlock(&c->space_lock); + mutex_unlock(&c->tnc_mutex); + + dbg_cmt("number of index LEBs %d", c->lst.idx_lebs); + dbg_cmt("size of index %llu", c->calc_idx_sz); + return err; + +out_free: + free_idx_lebs(c); +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * write_index - write index nodes. + * @c: UBIFS file-system description object + * + * This function writes the index nodes whose positions were laid out in the + * layout_in_empty_space function. + */ +static int write_index(struct ubifs_info *c) +{ + struct ubifs_idx_node *idx; + struct ubifs_znode *znode, *cnext; + int i, lnum, offs, len, next_len, buf_len, buf_offs, used; + int avail, wlen, err, lnum_pos = 0; + + cnext = c->enext; + if (!cnext) + return 0; + + /* + * Always write index nodes to the index head so that index nodes and + * other types of nodes are never mixed in the same erase block. 
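For orientation, a heavily simplified sketch of where these two entry points sit in the commit sequence; the real ordering, the master-node update and all locking live in the commit code outside this file, so treat this only as an approximation:

	#include "ubifs.h"

	static int commit_index(struct ubifs_info *c)
	{
		struct ubifs_zbranch zroot;
		int err;

		/* Lay out the dirty znodes and learn the new index root position */
		err = ubifs_tnc_start_commit(c, &zroot);
		if (err)
			return err;

		/* ... @zroot is what eventually gets recorded in the master node ... */

		/* Write the index nodes out (ubifs_tnc_end_commit() is defined later in this file) */
		return ubifs_tnc_end_commit(c);
	}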
+ */ + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + /* Allocate commit buffer */ + buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0, + LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + + while (1) { + cond_resched(); + + znode = cnext; + idx = c->cbuf + used; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + len = ubifs_idx_node_sz(c, znode->child_cnt); + ubifs_prepare_node(c, idx, len, 0); + + /* Determine the index node position */ + if (lnum == -1) { + lnum = c->ilebs[lnum_pos++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != znode->lnum || offs != znode->offs || + len != znode->len) { + ubifs_err("inconsistent znode posn"); + return -EINVAL; + } +#endif + + /* Grab some stuff from znode while we still can */ + cnext = znode->cnext; + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + /* + * It is important that other threads should see %DIRTY_ZNODE + * flag cleared before %COW_ZNODE. Specifically, it matters in + * the 'dirty_cow_znode()' function. This is the reason for the + * first barrier. Also, we want the bit changes to be seen to + * other threads ASAP, to avoid unnecesarry copying, which is + * the reason for the second barrier. 
+ */ + clear_bit(DIRTY_ZNODE, &znode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &znode->flags); + smp_mb__after_clear_bit(); + + /* Do not access znode from this point on */ + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + /* + * Write the prepared index node immediately if there is + * no minimum IO size + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + wlen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += ALIGN(wlen, 8); + if (next_len) { + used = 0; + avail = buf_len; + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + continue; + } + } else { + int blen, nxt_offs = buf_offs + used + next_len; + + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) + continue; + else + blen = buf_len; + } else { + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + /* + * The buffer is full or there are no more znodes + * to do + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + blen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; + } + } + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { + ubifs_err("inconsistent ihead"); + return -EINVAL; + } +#endif + + c->ihead_lnum = lnum; + c->ihead_offs = buf_offs; + + return 0; +} + +/** + * free_obsolete_znodes - free obsolete znodes. + * @c: UBIFS file-system description object + * + * At the end of commit end, obsolete znodes are freed. + */ +static void free_obsolete_znodes(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + + cnext = c->cnext; + do { + znode = cnext; + cnext = znode->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + else { + znode->cnext = NULL; + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } + } while (cnext != c->cnext); +} + +/** + * return_gap_lebs - return LEBs used by the in-gap commit method. + * @c: UBIFS file-system description object + * + * This function clears the "taken" flag for the LEBs which were used by the + * "commit in-the-gaps" method. + */ +static int return_gap_lebs(struct ubifs_info *c) +{ + int *p, err; + + if (!c->gap_lebs) + return 0; + + dbg_cmt(""); + for (p = c->gap_lebs; *p != -1; p++) { + err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); + if (err) + return err; + } + + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return 0; +} + +/** + * ubifs_tnc_end_commit - update the TNC for commit end. + * @c: UBIFS file-system description object + * + * Write the dirty znodes. 
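A worked example of the flush arithmetic in the non-trivial minimum-I/O-unit branch above (values purely illustrative): with c->min_io_size of 2048 bytes and 1480 bytes of prepared index nodes in the buffer, wlen stays 1480 after the 8-byte alignment, blen becomes ALIGN(1480, 2048) = 2048, so ubifs_pad() appends 568 bytes of padding and one 2048-byte chunk is written; buf_offs then advances by blen, not by wlen.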
+ */ +int ubifs_tnc_end_commit(struct ubifs_info *c) +{ + int err; + + if (!c->cnext) + return 0; + + err = return_gap_lebs(c); + if (err) + return err; + + err = write_index(c); + if (err) + return err; + + mutex_lock(&c->tnc_mutex); + + dbg_cmt("TNC height is %d", c->zroot.znode->level + 1); + + free_obsolete_znodes(c); + + c->cnext = NULL; + kfree(c->ilebs); + c->ilebs = NULL; + + mutex_unlock(&c->tnc_mutex); + + return 0; +} diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c new file mode 100644 index 00000000000..a25c1cc1f8d --- /dev/null +++ b/fs/ubifs/tnc_misc.c @@ -0,0 +1,494 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains miscelanious TNC-related functions shared betweend + * different files. This file does not form any logically separate TNC + * sub-system. The file was created because there is a lot of TNC code and + * putting it all in one file would make that file too big and unreadable. + */ + +#include "ubifs.h" + +/** + * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal. + * @zr: root of the subtree to traverse + * @znode: previous znode + * + * This function implements levelorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode) +{ + int level, iip, level_search = 0; + struct ubifs_znode *zn; + + ubifs_assert(zr); + + if (unlikely(!znode)) + return zr; + + if (unlikely(znode == zr)) { + if (znode->level == 0) + return NULL; + return ubifs_tnc_find_child(zr, 0); + } + + level = znode->level; + + iip = znode->iip; + while (1) { + ubifs_assert(znode->level <= zr->level); + + /* + * First walk up until there is a znode with next branch to + * look at. + */ + while (znode->parent != zr && iip >= znode->parent->child_cnt) { + znode = znode->parent; + iip = znode->iip; + } + + if (unlikely(znode->parent == zr && + iip >= znode->parent->child_cnt)) { + /* This level is done, switch to the lower one */ + level -= 1; + if (level_search || level < 0) + /* + * We were already looking for znode at lower + * level ('level_search'). As we are here + * again, it just does not exist. Or all levels + * were finished ('level < 0'). 
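A minimal sketch of how a caller drives this iterator: pass NULL as the previous znode to get the subtree root back, then keep feeding the previous return value in until it returns NULL.

	#include "ubifs.h"

	static void visit_subtree(struct ubifs_znode *zr)
	{
		struct ubifs_znode *znode = NULL;

		while ((znode = ubifs_tnc_levelorder_next(zr, znode)) != NULL) {
			/* ... look at znode; the LNC is not touched ... */
		}
	}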
+ */ + return NULL; + + level_search = 1; + iip = -1; + znode = ubifs_tnc_find_child(zr, 0); + ubifs_assert(znode); + } + + /* Switch to the next index */ + zn = ubifs_tnc_find_child(znode->parent, iip + 1); + if (!zn) { + /* No more children to look at, we have walk up */ + iip = znode->parent->child_cnt; + continue; + } + + /* Walk back down to the level we came from ('level') */ + while (zn->level != level) { + znode = zn; + zn = ubifs_tnc_find_child(zn, 0); + if (!zn) { + /* + * This path is not too deep so it does not + * reach 'level'. Try next path. + */ + iip = znode->iip; + break; + } + } + + if (zn) { + ubifs_assert(zn->level >= 0); + return zn; + } + } +} + +/** + * ubifs_search_zbranch - search znode branch. + * @c: UBIFS file-system description object + * @znode: znode to search in + * @key: key to search for + * @n: znode branch slot number is returned here + * + * This is a helper function which search branch with key @key in @znode using + * binary search. The result of the search may be: + * o exact match, then %1 is returned, and the slot number of the branch is + * stored in @n; + * o no exact match, then %0 is returned and the slot number of the left + * closest branch is returned in @n; the slot if all keys in this znode are + * greater than @key, then %-1 is returned in @n. + */ +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n) +{ + int beg = 0, end = znode->child_cnt, uninitialized_var(mid); + int uninitialized_var(cmp); + const struct ubifs_zbranch *zbr = &znode->zbranch[0]; + + ubifs_assert(end > beg); + + while (end > beg) { + mid = (beg + end) >> 1; + cmp = keys_cmp(c, key, &zbr[mid].key); + if (cmp > 0) + beg = mid + 1; + else if (cmp < 0) + end = mid; + else { + *n = mid; + return 1; + } + } + + *n = end - 1; + + /* The insert point is after *n */ + ubifs_assert(*n >= -1 && *n < znode->child_cnt); + if (*n == -1) + ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0); + else + ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0); + if (*n + 1 < znode->child_cnt) + ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0); + + return 0; +} + +/** + * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal. + * @znode: znode to start at (root of the sub-tree to traverse) + * + * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is + * ignored. + */ +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode) +{ + if (unlikely(!znode)) + return NULL; + + while (znode->level > 0) { + struct ubifs_znode *child; + + child = ubifs_tnc_find_child(znode, 0); + if (!child) + return znode; + znode = child; + } + + return znode; +} + +/** + * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal. + * @znode: previous znode + * + * This function implements postorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + ubifs_assert(znode); + if (unlikely(!znode->parent)) + return NULL; + + /* Switch to the next index in the parent */ + zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1); + if (!zn) + /* This is in fact the last child, return parent */ + return znode->parent; + + /* Go to the first znode in this new subtree */ + return ubifs_tnc_postorder_first(zn); +} + +/** + * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree. 
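A small sketch of the calling convention just described; the policy of descending slot 0 when every key in the znode is greater is the caller's choice and is shown here only for illustration:

	#include "ubifs.h"

	static int pick_slot(const struct ubifs_info *c,
			     const struct ubifs_znode *znode,
			     const union ubifs_key *key)
	{
		int n;

		if (ubifs_search_zbranch(c, znode, key, &n))
			return n;	/* exact match at slot n */
		if (n < 0)
			return 0;	/* all keys in @znode are greater than @key */
		return n;		/* left-closest branch */
	}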
+ * @znode: znode defining subtree to destroy + * + * This function destroys subtree of the TNC tree. Returns number of clean + * znodes in the subtree. + */ +long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode); + long clean_freed = 0; + int n; + + ubifs_assert(zn); + while (1) { + for (n = 0; n < zn->child_cnt; n++) { + if (!zn->zbranch[n].znode) + continue; + + if (zn->level > 0 && + !ubifs_zn_dirty(zn->zbranch[n].znode)) + clean_freed += 1; + + cond_resched(); + kfree(zn->zbranch[n].znode); + } + + if (zn == znode) { + if (!ubifs_zn_dirty(zn)) + clean_freed += 1; + kfree(zn); + return clean_freed; + } + + zn = ubifs_tnc_postorder_next(zn); + } +} + +/** + * read_znode - read an indexing node from flash and fill znode. + * @c: UBIFS file-system description object + * @lnum: LEB of the indexing node to read + * @offs: node offset + * @len: node length + * @znode: znode to read to + * + * This function reads an indexing node from the flash media and fills znode + * with the read data. Returns zero in case of success and a negative error + * code in case of failure. The read indexing node is validated and if anything + * is wrong with it, this function prints complaint messages and returns + * %-EINVAL. + */ +static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, + struct ubifs_znode *znode) +{ + int i, err, type, cmp; + struct ubifs_idx_node *idx; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err < 0) { + kfree(idx); + return err; + } + + znode->child_cnt = le16_to_cpu(idx->child_cnt); + znode->level = le16_to_cpu(idx->level); + + dbg_tnc("LEB %d:%d, level %d, %d branch", + lnum, offs, znode->level, znode->child_cnt); + + if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { + dbg_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + dbg_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); + err = 1; + goto out_dump; + } + + for (i = 0; i < znode->child_cnt; i++) { + const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_read(c, &br->key, &zbr->key); + zbr->lnum = le32_to_cpu(br->lnum); + zbr->offs = le32_to_cpu(br->offs); + zbr->len = le32_to_cpu(br->len); + zbr->znode = NULL; + + /* Validate branch */ + + if (zbr->lnum < c->main_first || + zbr->lnum >= c->leb_cnt || zbr->offs < 0 || + zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { + dbg_err("bad branch %d", i); + err = 2; + goto out_dump; + } + + switch (key_type(c, &zbr->key)) { + case UBIFS_INO_KEY: + case UBIFS_DATA_KEY: + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + break; + default: + dbg_msg("bad key type at slot %d: %s", i, + DBGKEY(&zbr->key)); + err = 3; + goto out_dump; + } + + if (znode->level) + continue; + + type = key_type(c, &zbr->key); + if (c->ranges[type].max_len == 0) { + if (zbr->len != c->ranges[type].len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be %d", c->ranges[type].len); + err = 4; + goto out_dump; + } + } else if (zbr->len < c->ranges[type].min_len || + zbr->len > c->ranges[type].max_len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); + err = 5; + goto out_dump; + } + } + + /* + * Ensure that the next key is greater or 
equivalent to the + * previous one. + */ + for (i = 0; i < znode->child_cnt - 1; i++) { + const union ubifs_key *key1, *key2; + + key1 = &znode->zbranch[i].key; + key2 = &znode->zbranch[i + 1].key; + + cmp = keys_cmp(c, key1, key2); + if (cmp > 0) { + dbg_err("bad key order (keys %d and %d)", i, i + 1); + err = 6; + goto out_dump; + } else if (cmp == 0 && !is_hash_key(c, key1)) { + /* These can only be keys with colliding hash */ + dbg_err("keys %d and %d are not hashed but equivalent", + i, i + 1); + err = 7; + goto out_dump; + } + } + + kfree(idx); + return 0; + +out_dump: + ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); + dbg_dump_node(c, idx); + kfree(idx); + return -EINVAL; +} + +/** + * ubifs_load_znode - load znode to TNC cache. + * @c: UBIFS file-system description object + * @zbr: znode branch + * @parent: znode's parent + * @iip: index in parent + * + * This function loads znode pointed to by @zbr into the TNC cache and + * returns pointer to it in case of success and a negative error code in case + * of failure. + */ +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip) +{ + int err; + struct ubifs_znode *znode; + + ubifs_assert(!zbr->znode); + /* + * A slab cache is not presently used for znodes because the znode size + * depends on the fanout which is stored in the superblock. + */ + znode = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!znode) + return ERR_PTR(-ENOMEM); + + err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); + if (err) + goto out; + + atomic_long_inc(&c->clean_zn_cnt); + + /* + * Increment the global clean znode counter as well. It is OK that + * global and per-FS clean znode counters may be inconsistent for some + * short time (because we might be preempted at this point), the global + * one is only used in shrinker. + */ + atomic_long_inc(&ubifs_clean_zn_cnt); + + zbr->znode = znode; + znode->parent = parent; + znode->time = get_seconds(); + znode->iip = iip; + + return znode; + +out: + kfree(znode); + return ERR_PTR(err); +} + +/** + * ubifs_tnc_read_node - read a leaf node from the flash media. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a node defined by @zbr from the flash media. Returns + * zero in case of success or a negative negative error code in case of + * failure. + */ +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + union ubifs_key key1, *key = &zbr->key; + int err, type = key_type(c, key); + struct ubifs_wbuf *wbuf; + + /* + * 'zbr' has to point to on-flash node. The node may sit in a bud and + * may even be in a write buffer, so we have to take care about this. 
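The branch range check performed by read_znode() above, pulled out as a standalone predicate (the parameters simply mirror the ubifs_info fields it reads):

	#include <stdbool.h>

	static bool branch_target_ok(int main_first, int leb_cnt, int leb_size,
				     int lnum, int offs, int len)
	{
		if (lnum < main_first || lnum >= leb_cnt)
			return false;	/* outside the main area */
		if (offs < 0 || (offs & 7))
			return false;	/* nodes sit at 8-byte aligned offsets */
		if (offs + len > leb_size)
			return false;	/* does not fit into its LEB */
		return true;
	}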
+ */ + wbuf = ubifs_get_wbuf(c, zbr->lnum); + if (wbuf) + err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len, + zbr->lnum, zbr->offs); + else + err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum, + zbr->offs); + + if (err) { + dbg_tnc("key %s", DBGKEY(key)); + return err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, key, &key1); + if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(key), DBGKEY1(&key1)); + dbg_dump_node(c, node); + return -EINVAL; + } + + return 0; +} diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h new file mode 100644 index 00000000000..0cc7da9bed4 --- /dev/null +++ b/fs/ubifs/ubifs-media.h @@ -0,0 +1,745 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file describes UBIFS on-flash format and contains definitions of all the + * relevant data structures and constants. + * + * All UBIFS on-flash objects are stored in the form of nodes. All nodes start + * with the UBIFS node magic number and have the same common header. Nodes + * always sit at 8-byte aligned positions on the media and node header sizes are + * also 8-byte aligned (except for the indexing node and the padding node). + */ + +#ifndef __UBIFS_MEDIA_H__ +#define __UBIFS_MEDIA_H__ + +/* UBIFS node magic number (must not have the padding byte first or last) */ +#define UBIFS_NODE_MAGIC 0x06101831 + +/* UBIFS on-flash format version */ +#define UBIFS_FORMAT_VERSION 4 + +/* Minimum logical eraseblock size in bytes */ +#define UBIFS_MIN_LEB_SZ (15*1024) + +/* Initial CRC32 value used when calculating CRC checksums */ +#define UBIFS_CRC32_INIT 0xFFFFFFFFU + +/* + * UBIFS does not try to compress data if its length is less than the below + * constant. + */ +#define UBIFS_MIN_COMPR_LEN 128 + +/* Root inode number */ +#define UBIFS_ROOT_INO 1 + +/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */ +#define UBIFS_FIRST_INO 64 + +/* + * Maximum file name and extended attribute length (must be a multiple of 8, + * minus 1). + */ +#define UBIFS_MAX_NLEN 255 + +/* Maximum number of data journal heads */ +#define UBIFS_MAX_JHEADS 1 + +/* + * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system, + * which means that it does not treat the underlying media as consisting of + * blocks like in case of hard drives. Do not be confused. UBIFS block is just + * the maximum amount of data which one data node can have or which can be + * attached to an inode node. 
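A tiny illustration of what the block constants defined just below are for: splitting a file offset into the UBIFS block number (the value that ends up in a data-node key) and the offset inside that block. The constants are repeated here only to keep the snippet self-contained.

	#include <stdio.h>

	#define UBIFS_BLOCK_SHIFT	12
	#define UBIFS_BLOCK_MASK	0x00000FFF

	int main(void)
	{
		unsigned long long pos = 10000;	/* arbitrary file offset */
		unsigned int block = pos >> UBIFS_BLOCK_SHIFT;	/* 2 */
		unsigned int off = pos & UBIFS_BLOCK_MASK;	/* 1808 */

		printf("block %u, offset within block %u\n", block, off);
		return 0;
	}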
+ */ +#define UBIFS_BLOCK_SIZE 4096 +#define UBIFS_BLOCK_SHIFT 12 +#define UBIFS_BLOCK_MASK 0x00000FFF + +/* UBIFS padding byte pattern (must not be first or last byte of node magic) */ +#define UBIFS_PADDING_BYTE 0xCE + +/* Maximum possible key length */ +#define UBIFS_MAX_KEY_LEN 16 + +/* Key length ("simple" format) */ +#define UBIFS_SK_LEN 8 + +/* Minimum index tree fanout */ +#define UBIFS_MIN_FANOUT 2 + +/* Maximum number of levels in UBIFS indexing B-tree */ +#define UBIFS_MAX_LEVELS 512 + +/* Maximum amount of data attached to an inode in bytes */ +#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE + +/* LEB Properties Tree fanout (must be power of 2) and fanout shift */ +#define UBIFS_LPT_FANOUT 4 +#define UBIFS_LPT_FANOUT_SHIFT 2 + +/* LEB Properties Tree bit field sizes */ +#define UBIFS_LPT_CRC_BITS 16 +#define UBIFS_LPT_CRC_BYTES 2 +#define UBIFS_LPT_TYPE_BITS 4 + +/* The key is always at the same position in all keyed nodes */ +#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) + +/* + * LEB Properties Tree node types. + * + * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties) + * UBIFS_LPT_NNODE: LPT internal node + * UBIFS_LPT_LTAB: LPT's own lprops table + * UBIFS_LPT_LSAVE: LPT's save table (big model only) + * UBIFS_LPT_NODE_CNT: count of LPT node types + * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type + */ +enum { + UBIFS_LPT_PNODE, + UBIFS_LPT_NNODE, + UBIFS_LPT_LTAB, + UBIFS_LPT_LSAVE, + UBIFS_LPT_NODE_CNT, + UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1, +}; + +/* + * UBIFS inode types. + * + * UBIFS_ITYPE_REG: regular file + * UBIFS_ITYPE_DIR: directory + * UBIFS_ITYPE_LNK: soft link + * UBIFS_ITYPE_BLK: block device node + * UBIFS_ITYPE_CHR: character device node + * UBIFS_ITYPE_FIFO: fifo + * UBIFS_ITYPE_SOCK: socket + * UBIFS_ITYPES_CNT: count of supported file types + */ +enum { + UBIFS_ITYPE_REG, + UBIFS_ITYPE_DIR, + UBIFS_ITYPE_LNK, + UBIFS_ITYPE_BLK, + UBIFS_ITYPE_CHR, + UBIFS_ITYPE_FIFO, + UBIFS_ITYPE_SOCK, + UBIFS_ITYPES_CNT, +}; + +/* + * Supported key hash functions. + * + * UBIFS_KEY_HASH_R5: R5 hash + * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name + */ +enum { + UBIFS_KEY_HASH_R5, + UBIFS_KEY_HASH_TEST, +}; + +/* + * Supported key formats. + * + * UBIFS_SIMPLE_KEY_FMT: simple key format + */ +enum { + UBIFS_SIMPLE_KEY_FMT, +}; + +/* + * The simple key format uses 29 bits for storing UBIFS block number and hash + * value. + */ +#define UBIFS_S_KEY_BLOCK_BITS 29 +#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF +#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS +#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK + +/* + * Key types. + * + * UBIFS_INO_KEY: inode node key + * UBIFS_DATA_KEY: data node key + * UBIFS_DENT_KEY: directory entry node key + * UBIFS_XENT_KEY: extended attribute entry key + * UBIFS_KEY_TYPES_CNT: number of supported key types + */ +enum { + UBIFS_INO_KEY, + UBIFS_DATA_KEY, + UBIFS_DENT_KEY, + UBIFS_XENT_KEY, + UBIFS_KEY_TYPES_CNT, +}; + +/* Count of LEBs reserved for the superblock area */ +#define UBIFS_SB_LEBS 1 +/* Count of LEBs reserved for the master area */ +#define UBIFS_MST_LEBS 2 + +/* First LEB of the superblock area */ +#define UBIFS_SB_LNUM 0 +/* First LEB of the master area */ +#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS) +/* First LEB of the log area */ +#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS) + +/* + * The below constants define the absolute minimum values for various UBIFS + * media areas. 
Many of them actually depend of flash geometry and the FS + * configuration (number of journal heads, orphan LEBs, etc). This means that + * the smallest volume size which can be used for UBIFS cannot be pre-defined + * by these constants. The file-system that meets the below limitation will not + * necessarily mount. UBIFS does run-time calculations and validates the FS + * size. + */ + +/* Minimum number of logical eraseblocks in the log */ +#define UBIFS_MIN_LOG_LEBS 2 +/* Minimum number of bud logical eraseblocks (one for each head) */ +#define UBIFS_MIN_BUD_LEBS 3 +/* Minimum number of journal logical eraseblocks */ +#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS) +/* Minimum number of LPT area logical eraseblocks */ +#define UBIFS_MIN_LPT_LEBS 2 +/* Minimum number of orphan area logical eraseblocks */ +#define UBIFS_MIN_ORPH_LEBS 1 +/* + * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 + * for GC, 1 for deletions, and at least 1 for committed data). + */ +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) + +/* Minimum number of logical eraseblocks */ +#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ + UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \ + UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS) + +/* Node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_CH_SZ sizeof(struct ubifs_ch) +#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node) +#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node) +#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node) +#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node) +#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node) +#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node) +#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node) +#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node) +#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) +#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) +#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) +/* Extended attribute entry nodes are identical to directory entry nodes */ +#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ +/* Only this does not have to be multiple of 8 bytes */ +#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch) + +/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) +#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA) +#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1) +#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ + +/* The largest UBIFS node */ +#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ + +/* + * On-flash inode flags. + * + * UBIFS_COMPR_FL: use compression for this inode + * UBIFS_SYNC_FL: I/O on this inode has to be synchronous + * UBIFS_IMMUTABLE_FL: inode is immutable + * UBIFS_APPEND_FL: writes to the inode may only append data + * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous + * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * + * Note, these are on-flash flags which correspond to ioctl flags + * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not + * have to be the same. + */ +enum { + UBIFS_COMPR_FL = 0x01, + UBIFS_SYNC_FL = 0x02, + UBIFS_IMMUTABLE_FL = 0x04, + UBIFS_APPEND_FL = 0x08, + UBIFS_DIRSYNC_FL = 0x10, + UBIFS_XATTR_FL = 0x20, +}; + +/* Inode flag bits used by UBIFS */ +#define UBIFS_FL_MASK 0x0000001F + +/* + * UBIFS compression algorithms. 
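Plugging in the limits defined above: UBIFS_MIN_LEB_CNT works out to 1 (superblock) + 2 (master) + 2 (log) + 2 (LPT) + 1 (orphans) + 8 (main, i.e. 3 buds + 5) = 16 logical eraseblocks, which at the 15 KiB minimum LEB size is roughly a 240 KiB volume; as stressed above, a volume that merely meets these minima is still not guaranteed to mount, because the run-time checks depend on the actual flash geometry.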
+ * + * UBIFS_COMPR_NONE: no compression + * UBIFS_COMPR_LZO: LZO compression + * UBIFS_COMPR_ZLIB: ZLIB compression + * UBIFS_COMPR_TYPES_CNT: count of supported compression types + */ +enum { + UBIFS_COMPR_NONE, + UBIFS_COMPR_LZO, + UBIFS_COMPR_ZLIB, + UBIFS_COMPR_TYPES_CNT, +}; + +/* + * UBIFS node types. + * + * UBIFS_INO_NODE: inode node + * UBIFS_DATA_NODE: data node + * UBIFS_DENT_NODE: directory entry node + * UBIFS_XENT_NODE: extended attribute node + * UBIFS_TRUN_NODE: truncation node + * UBIFS_PAD_NODE: padding node + * UBIFS_SB_NODE: superblock node + * UBIFS_MST_NODE: master node + * UBIFS_REF_NODE: LEB reference node + * UBIFS_IDX_NODE: index node + * UBIFS_CS_NODE: commit start node + * UBIFS_ORPH_NODE: orphan node + * UBIFS_NODE_TYPES_CNT: count of supported node types + * + * Note, we index arrays by these numbers, so keep them low and contiguous. + * Node type constants for inodes, direntries and so on have to be the same as + * corresponding key type constants. + */ +enum { + UBIFS_INO_NODE, + UBIFS_DATA_NODE, + UBIFS_DENT_NODE, + UBIFS_XENT_NODE, + UBIFS_TRUN_NODE, + UBIFS_PAD_NODE, + UBIFS_SB_NODE, + UBIFS_MST_NODE, + UBIFS_REF_NODE, + UBIFS_IDX_NODE, + UBIFS_CS_NODE, + UBIFS_ORPH_NODE, + UBIFS_NODE_TYPES_CNT, +}; + +/* + * Master node flags. + * + * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty + * UBIFS_MST_NO_ORPHS: no orphan inodes present + * UBIFS_MST_RCVRY: written by recovery + */ +enum { + UBIFS_MST_DIRTY = 1, + UBIFS_MST_NO_ORPHS = 2, + UBIFS_MST_RCVRY = 4, +}; + +/* + * Node group type (used by recovery to recover whole group or none). + * + * UBIFS_NO_NODE_GROUP: this node is not part of a group + * UBIFS_IN_NODE_GROUP: this node is a part of a group + * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group + */ +enum { + UBIFS_NO_NODE_GROUP = 0, + UBIFS_IN_NODE_GROUP, + UBIFS_LAST_OF_NODE_GROUP, +}; + +/* + * Superblock flags. + * + * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + */ +enum { + UBIFS_FLG_BIGLPT = 0x02, +}; + +/** + * struct ubifs_ch - common header node. + * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) + * @crc: CRC-32 checksum of the node header + * @sqnum: sequence number + * @len: full node length + * @node_type: node type + * @group_type: node group type + * @padding: reserved for future, zeroes + * + * Every UBIFS node starts with this common part. If the node has a key, the + * key always goes next. + */ +struct ubifs_ch { + __le32 magic; + __le32 crc; + __le64 sqnum; + __le32 len; + __u8 node_type; + __u8 group_type; + __u8 padding[2]; +} __attribute__ ((packed)); + +/** + * union ubifs_dev_desc - device node descriptor. + * @new: new type device descriptor + * @huge: huge type device descriptor + * + * This data structure describes major/minor numbers of a device node. In an + * inode is a device node then its data contains an object of this type. UBIFS + * uses standard Linux "new" and "huge" device node encodings. + */ +union ubifs_dev_desc { + __le32 new; + __le64 huge; +} __attribute__ ((packed)); + +/** + * struct ubifs_ino_node - inode node. 
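As a sketch of how the common header can be sanity-checked (a real check would also verify the CRC over the whole node; the field and constant names come from this header, the helper itself is hypothetical):

	#include "ubifs.h"

	static int ch_looks_sane(const struct ubifs_ch *ch)
	{
		if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
			return 0;
		if (ch->node_type >= UBIFS_NODE_TYPES_CNT)
			return 0;
		if (le32_to_cpu(ch->len) < UBIFS_CH_SZ)
			return 0;
		return 1;
	}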
+ * @ch: common header + * @key: node key + * @creat_sqnum: sequence number at time of creation + * @size: inode size in bytes (amount of uncompressed data) + * @atime_sec: access time seconds + * @ctime_sec: creation time seconds + * @mtime_sec: modification time seconds + * @atime_nsec: access time nanoseconds + * @ctime_nsec: creation time nanoseconds + * @mtime_nsec: modification time nanoseconds + * @nlink: number of hard links + * @uid: owner ID + * @gid: group ID + * @mode: access flags + * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc) + * @data_len: inode data length + * @xattr_cnt: count of extended attributes this inode has + * @xattr_size: summarized size of all extended attributes in bytes + * @padding1: reserved for future, zeroes + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @compr_type: compression type used for this inode + * @padding2: reserved for future, zeroes + * @data: data attached to the inode + * + * Note, even though inode compression type is defined by @compr_type, some + * nodes of this inode may be compressed with different compressor - this + * happens if compression type is changed while the inode already has data + * nodes. But @compr_type will be use for further writes to the inode. + * + * Note, do not forget to amend 'zero_ino_node_unused()' function when changing + * the padding fields. + */ +struct ubifs_ino_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 creat_sqnum; + __le64 size; + __le64 atime_sec; + __le64 ctime_sec; + __le64 mtime_sec; + __le32 atime_nsec; + __le32 ctime_nsec; + __le32 mtime_nsec; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 flags; + __le32 data_len; + __le32 xattr_cnt; + __le32 xattr_size; + __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */ + __le32 xattr_names; + __le16 compr_type; + __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ + __u8 data[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_dent_node - directory entry node. + * @ch: common header + * @key: node key + * @inum: target inode number + * @padding1: reserved for future, zeroes + * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) + * @nlen: name length + * @padding2: reserved for future, zeroes + * @name: zero-terminated name + * + * Note, do not forget to amend 'zero_dent_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_dent_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 inum; + __u8 padding1; + __u8 type; + __le16 nlen; + __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ + __u8 name[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_data_node - data node. + * @ch: common header + * @key: node key + * @size: uncompressed data size in bytes + * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) + * @padding: reserved for future, zeroes + * @data: data + * + * Note, do not forget to amend 'zero_data_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_data_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le32 size; + __le16 compr_type; + __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __u8 data[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_trun_node - truncation node. 
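In the data node above, the payload is whatever follows the fixed header, so its on-flash length falls out of the common header length, while @size gives the length after decompression. A small sketch of that arithmetic:

/*
 * Illustrative only: given a raw data node, how long is the (possibly
 * compressed) payload on flash, and how long will it be after
 * decompression?  Assumes the common header has already been validated.
 */
static inline void example_data_node_lengths(const struct ubifs_data_node *dn,
                                             int *on_flash_len,
                                             int *uncompressed_len)
{
        /* Payload is everything after the fixed part of the node */
        *on_flash_len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
        /* @size is the length the payload decompresses to */
        *uncompressed_len = le32_to_cpu(dn->size);
}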
+ * @ch: common header + * @inum: truncated inode number + * @padding: reserved for future, zeroes + * @old_size: size before truncation + * @new_size: size after truncation + * + * This node exists only in the journal and never goes to the main area. Note, + * do not forget to amend 'zero_trun_node_unused()' function when changing the + * padding fields. + */ +struct ubifs_trun_node { + struct ubifs_ch ch; + __le32 inum; + __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ + __le64 old_size; + __le64 new_size; +} __attribute__ ((packed)); + +/** + * struct ubifs_pad_node - padding node. + * @ch: common header + * @pad_len: how many bytes after this node are unused (because padded) + * @padding: reserved for future, zeroes + */ +struct ubifs_pad_node { + struct ubifs_ch ch; + __le32 pad_len; +} __attribute__ ((packed)); + +/** + * struct ubifs_sb_node - superblock node. + * @ch: common header + * @padding: reserved for future, zeroes + * @key_hash: type of hash function used in keys + * @key_fmt: format of the key + * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc) + * @min_io_size: minimal input/output unit size + * @leb_size: logical eraseblock size in bytes + * @leb_cnt: count of LEBs used by file-system + * @max_leb_cnt: maximum count of LEBs used by file-system + * @max_bud_bytes: maximum amount of data stored in buds + * @log_lebs: log size in logical eraseblocks + * @lpt_lebs: number of LEBs used for lprops table + * @orph_lebs: number of LEBs used for recording orphans + * @jhead_cnt: count of journal heads + * @fanout: tree fanout (max. number of links per indexing node) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @fmt_version: UBIFS on-flash format version + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @padding1: reserved for future, zeroes + * @rp_uid: reserve pool UID + * @rp_gid: reserve pool GID + * @rp_size: size of the reserved pool in bytes + * @padding2: reserved for future, zeroes + * @time_gran: time granularity in nanoseconds + * @uuid: UUID generated when the file system image was created + */ +struct ubifs_sb_node { + struct ubifs_ch ch; + __u8 padding[2]; + __u8 key_hash; + __u8 key_fmt; + __le32 flags; + __le32 min_io_size; + __le32 leb_size; + __le32 leb_cnt; + __le32 max_leb_cnt; + __le64 max_bud_bytes; + __le32 log_lebs; + __le32 lpt_lebs; + __le32 orph_lebs; + __le32 jhead_cnt; + __le32 fanout; + __le32 lsave_cnt; + __le32 fmt_version; + __le16 default_compr; + __u8 padding1[2]; + __le32 rp_uid; + __le32 rp_gid; + __le64 rp_size; + __le32 time_gran; + __u8 uuid[16]; + __u8 padding2[3972]; +} __attribute__ ((packed)); + +/** + * struct ubifs_mst_node - master node. 
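Per the kernel-doc of the padding node above, @pad_len counts the unused bytes after the node itself. Under that reading, and assuming node boundaries stay 8-byte aligned as noted earlier, a scanner could skip a pad node as in the sketch below (this is an illustration, not the UBIFS scanner):

/*
 * Illustrative only: offset of the next node after a padding node that
 * starts at @offs.  @pad_len covers the unused bytes *after* the node,
 * per the kernel-doc above; 8-byte node alignment is assumed.
 */
static inline int example_offs_after_pad(int offs,
                                         const struct ubifs_pad_node *pad)
{
        int node_len = ALIGN(le32_to_cpu(pad->ch.len), 8);

        return offs + node_len + le32_to_cpu(pad->pad_len);
}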
+ * @ch: common header + * @highest_inum: highest inode number in the committed index + * @cmt_no: commit number + * @flags: various flags (%UBIFS_MST_DIRTY, etc) + * @log_lnum: start of the log + * @root_lnum: LEB number of the root indexing node + * @root_offs: offset within @root_lnum + * @root_len: root indexing node length + * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was + * not reserved and should be reserved on mount) + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @index_size: size of index on flash + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * @lpt_lnum: LEB number of LPT root nnode + * @lpt_offs: offset of LPT root nnode + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @lsave_lnum: LEB number of LPT's save table (big model only) + * @lsave_offs: offset of LPT's save table (big model only) + * @lscan_lnum: LEB number of last LPT scan + * @empty_lebs: number of empty logical eraseblocks + * @idx_lebs: number of indexing logical eraseblocks + * @leb_cnt: count of LEBs used by file-system + * @padding: reserved for future, zeroes + */ +struct ubifs_mst_node { + struct ubifs_ch ch; + __le64 highest_inum; + __le64 cmt_no; + __le32 flags; + __le32 log_lnum; + __le32 root_lnum; + __le32 root_offs; + __le32 root_len; + __le32 gc_lnum; + __le32 ihead_lnum; + __le32 ihead_offs; + __le64 index_size; + __le64 total_free; + __le64 total_dirty; + __le64 total_used; + __le64 total_dead; + __le64 total_dark; + __le32 lpt_lnum; + __le32 lpt_offs; + __le32 nhead_lnum; + __le32 nhead_offs; + __le32 ltab_lnum; + __le32 ltab_offs; + __le32 lsave_lnum; + __le32 lsave_offs; + __le32 lscan_lnum; + __le32 empty_lebs; + __le32 idx_lebs; + __le32 leb_cnt; + __u8 padding[344]; +} __attribute__ ((packed)); + +/** + * struct ubifs_ref_node - logical eraseblock reference node. + * @ch: common header + * @lnum: the referred logical eraseblock number + * @offs: start offset in the referred LEB + * @jhead: journal head number + * @padding: reserved for future, zeroes + */ +struct ubifs_ref_node { + struct ubifs_ch ch; + __le32 lnum; + __le32 offs; + __le32 jhead; + __u8 padding[28]; +} __attribute__ ((packed)); + +/** + * struct ubifs_branch - key/reference/length branch + * @lnum: LEB number of the target node + * @offs: offset within @lnum + * @len: target node length + * @key: key + */ +struct ubifs_branch { + __le32 lnum; + __le32 offs; + __le32 len; + __u8 key[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_idx_node - indexing node. + * @ch: common header + * @child_cnt: number of child index nodes + * @level: tree level + * @branches: LEB number / offset / length / key branches + */ +struct ubifs_idx_node { + struct ubifs_ch ch; + __le16 child_cnt; + __le16 level; + __u8 branches[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_cs_node - commit start node. + * @ch: common header + * @cmt_no: commit number + */ +struct ubifs_cs_node { + struct ubifs_ch ch; + __le64 cmt_no; +} __attribute__ ((packed)); + +/** + * struct ubifs_orph_node - orphan node. 
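Because struct ubifs_branch ends in a flexible @key array, the branches inside an index node are variable-sized and have to be walked by explicit pointer arithmetic, stepping UBIFS_BRANCH_SZ plus the per-filesystem key length each time. A sketch:

/*
 * Illustrative only: walk every branch of an index node and add up the
 * on-flash lengths of its children.  @key_len is the key size used by
 * this file system, so each branch occupies UBIFS_BRANCH_SZ + key_len
 * bytes.
 */
static inline long long example_sum_child_lens(const struct ubifs_idx_node *idx,
                                               int key_len)
{
        int i, n = le16_to_cpu(idx->child_cnt);
        const __u8 *p = idx->branches;
        long long total = 0;

        for (i = 0; i < n; i++) {
                const struct ubifs_branch *br = (const void *)p;

                /* (br->lnum, br->offs, br->len) locate the child on flash */
                total += le32_to_cpu(br->len);
                p += UBIFS_BRANCH_SZ + key_len;   /* branches are variable-sized */
        }
        return total;
}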
+ * @ch: common header + * @cmt_no: commit number (also top bit is set on the last node of the commit) + * @inos: inode numbers of orphans + */ +struct ubifs_orph_node { + struct ubifs_ch ch; + __le64 cmt_no; + __le64 inos[]; +} __attribute__ ((packed)); + +#endif /* __UBIFS_MEDIA_H__ */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h new file mode 100644 index 00000000000..e4f89f27182 --- /dev/null +++ b/fs/ubifs/ubifs.h @@ -0,0 +1,1649 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* Implementation version 0.7 */ + +#ifndef __UBIFS_H__ +#define __UBIFS_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs-media.h" + +/* Version of this UBIFS implementation */ +#define UBIFS_VERSION 1 + +/* Normal UBIFS messages */ +#define ubifs_msg(fmt, ...) \ + printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +/* UBIFS error messages */ +#define ubifs_err(fmt, ...) \ + printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) +/* UBIFS warning messages */ +#define ubifs_warn(fmt, ...) \ + printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +/* UBIFS file system VFS magic number */ +#define UBIFS_SUPER_MAGIC 0x24051905 + +/* Number of UBIFS blocks per VFS page */ +#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) +#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) + +/* "File system end of life" sequence number watermark */ +#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL +#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL + +/* Minimum amount of data UBIFS writes to the flash */ +#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) + +/* + * Currently we do not support inode number overlapping and re-using, so this + * watermark defines dangerous inode number level. This should be fixed later, + * although it is difficult to exceed current limit. Another option is to use + * 64-bit inode numbers, but this means more overhead. + */ +#define INUM_WARN_WATERMARK 0xFFF00000 +#define INUM_WATERMARK 0xFFFFFF00 + +/* Largest key size supported in this implementation */ +#define CUR_MAX_KEY_LEN UBIFS_SK_LEN + +/* Maximum number of entries in each LPT (LEB category) heap */ +#define LPT_HEAP_SZ 256 + +/* + * Background thread name pattern. The numbers are UBI device and volume + * numbers. 
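A sketch of how the sequence-number watermarks above might be consulted when a new sequence number is handed out; the exact policy and error code here are assumptions, not taken from UBIFS:

/*
 * Illustrative only: refuse to hand out sequence numbers past the hard
 * watermark and warn once the "end of life" region is reached.
 */
static inline int example_check_sqnum(unsigned long long next_sqnum)
{
        if (next_sqnum >= SQNUM_WATERMARK)
                return -EINVAL;         /* sequence numbers exhausted (assumed) */
        if (next_sqnum >= SQNUM_WARN_WATERMARK)
                ubifs_warn("running out of sequence numbers, end of life soon");
        return 0;
}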
+ */ +#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" + +/* Default write-buffer synchronization timeout (5 secs) */ +#define DEFAULT_WBUF_TIMEOUT (5 * HZ) + +/* Maximum possible inode number (only 32-bit inodes are supported now) */ +#define MAX_INUM 0xFFFFFFFF + +/* Number of non-data journal heads */ +#define NONDATA_JHEADS_CNT 2 + +/* Garbage collector head */ +#define GCHD 0 +/* Base journal head number */ +#define BASEHD 1 +/* First "general purpose" journal head */ +#define DATAHD 2 + +/* 'No change' value for 'ubifs_change_lp()' */ +#define LPROPS_NC 0x80000001 + +/* + * There is no notion of truncation key because truncation nodes do not exist + * in TNC. However, when replaying, it is handy to introduce fake "truncation" + * keys for truncation nodes because the code becomes simpler. So we define + * %UBIFS_TRUN_KEY type. + */ +#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT + +/* + * How much a directory entry/extended attribute entry adds to the parent/host + * inode. + */ +#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8) + +/* How much an extended attribute adds to the host inode */ +#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8) + +/* + * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered + * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are + * considered "young". This is used by shrinker when selecting znode to trim + * off. + */ +#define OLD_ZNODE_AGE 20 +#define YOUNG_ZNODE_AGE 5 + +/* + * Some compressors, like LZO, may end up with more data then the input buffer. + * So UBIFS always allocates larger output buffer, to be sure the compressor + * will not corrupt memory in case of worst case compression. + */ +#define WORST_COMPR_FACTOR 2 + +/* Maximum expected tree height for use by bottom_up_buf */ +#define BOTTOM_UP_HEIGHT 64 + +/* + * Lockdep classes for UBIFS inode @ui_mutex. + */ +enum { + WB_MUTEX_1 = 0, + WB_MUTEX_2 = 1, + WB_MUTEX_3 = 2, +}; + +/* + * Znode flags (actually, bit numbers which store the flags). + * + * DIRTY_ZNODE: znode is dirty + * COW_ZNODE: znode is being committed and a new instance of this znode has to + * be created before changing this znode + * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is + * still in the commit list and the ongoing commit operation + * will commit it, and delete this znode after it is done + */ +enum { + DIRTY_ZNODE = 0, + COW_ZNODE = 1, + OBSOLETE_ZNODE = 2, +}; + +/* + * Commit states. + * + * COMMIT_RESTING: commit is not wanted + * COMMIT_BACKGROUND: background commit has been requested + * COMMIT_REQUIRED: commit is required + * COMMIT_RUNNING_BACKGROUND: background commit is running + * COMMIT_RUNNING_REQUIRED: commit is running and it is required + * COMMIT_BROKEN: commit failed + */ +enum { + COMMIT_RESTING = 0, + COMMIT_BACKGROUND, + COMMIT_REQUIRED, + COMMIT_RUNNING_BACKGROUND, + COMMIT_RUNNING_REQUIRED, + COMMIT_BROKEN, +}; + +/* + * 'ubifs_scan_a_node()' return values. + * + * SCANNED_GARBAGE: scanned garbage + * SCANNED_EMPTY_SPACE: scanned empty space + * SCANNED_A_NODE: scanned a valid node + * SCANNED_A_CORRUPT_NODE: scanned a corrupted node + * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length + * + * Greater than zero means: 'scanned that number of padding bytes' + */ +enum { + SCANNED_GARBAGE = 0, + SCANNED_EMPTY_SPACE = -1, + SCANNED_A_NODE = -2, + SCANNED_A_CORRUPT_NODE = -3, + SCANNED_A_BAD_PAD_NODE = -4, +}; + +/* + * LPT cnode flag bits. 
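CALC_DENT_SIZE() and CALC_XATTR_BYTES() above charge a directory entry or extended attribute to its parent/host inode in 8-byte-aligned units. A standalone illustration of the rounding, with a local ALIGN() and a hypothetical node size standing in for sizeof(struct ubifs_dent_node):

#include <stdio.h>

/* Local stand-ins: ALIGN as in the kernel; the node size is made up here */
#define EX_ALIGN(x, a)        (((x) + (a) - 1) & ~((a) - 1))
#define EX_DENT_NODE_SZ       56   /* hypothetical sizeof(struct ubifs_dent_node) */
#define EX_CALC_DENT_SIZE(nl) EX_ALIGN(EX_DENT_NODE_SZ + (nl) + 1, 8)

int main(void)
{
        /* "foo": 56 + 3 + 1 = 60, rounded up to 64 bytes charged */
        printf("dent for \"foo\": %d bytes\n", EX_CALC_DENT_SIZE(3));
        /* 7-character name: 56 + 7 + 1 = 64, already aligned */
        printf("dent for 7-char name: %d bytes\n", EX_CALC_DENT_SIZE(7));
        return 0;
}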
+ * + * DIRTY_CNODE: cnode is dirty + * COW_CNODE: cnode is being committed and must be copied before writing + * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), + * so it can (and must) be freed when the commit is finished + */ +enum { + DIRTY_CNODE = 0, + COW_CNODE = 1, + OBSOLETE_CNODE = 2, +}; + +/* + * Dirty flag bits (lpt_drty_flgs) for LPT special nodes. + * + * LTAB_DIRTY: ltab node is dirty + * LSAVE_DIRTY: lsave node is dirty + */ +enum { + LTAB_DIRTY = 1, + LSAVE_DIRTY = 2, +}; + +/* + * Return codes used by the garbage collector. + * @LEB_FREED: the logical eraseblock was freed and is ready to use + * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit + * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes + */ +enum { + LEB_FREED, + LEB_FREED_IDX, + LEB_RETAINED, +}; + +/** + * struct ubifs_old_idx - index node obsoleted since last commit start. + * @rb: rb-tree node + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + */ +struct ubifs_old_idx { + struct rb_node rb; + int lnum; + int offs; +}; + +/* The below union makes it easier to deal with keys */ +union ubifs_key { + uint8_t u8[CUR_MAX_KEY_LEN]; + uint32_t u32[CUR_MAX_KEY_LEN/4]; + uint64_t u64[CUR_MAX_KEY_LEN/8]; + __le32 j32[CUR_MAX_KEY_LEN/4]; +}; + +/** + * struct ubifs_scan_node - UBIFS scanned node information. + * @list: list of scanned nodes + * @key: key of node scanned (if it has one) + * @sqnum: sequence number + * @type: type of node scanned + * @offs: offset with LEB of node scanned + * @len: length of node scanned + * @node: raw node + */ +struct ubifs_scan_node { + struct list_head list; + union ubifs_key key; + unsigned long long sqnum; + int type; + int offs; + int len; + void *node; +}; + +/** + * struct ubifs_scan_leb - UBIFS scanned LEB information. + * @lnum: logical eraseblock number + * @nodes_cnt: number of nodes scanned + * @nodes: list of struct ubifs_scan_node + * @endpt: end point (and therefore the start of empty space) + * @ecc: read returned -EBADMSG + * @buf: buffer containing entire LEB scanned + */ +struct ubifs_scan_leb { + int lnum; + int nodes_cnt; + struct list_head nodes; + int endpt; + int ecc; + void *buf; +}; + +/** + * struct ubifs_gced_idx_leb - garbage-collected indexing LEB. + * @list: list + * @lnum: LEB number + * @unmap: OK to unmap this LEB + * + * This data structure is used to temporary store garbage-collected indexing + * LEBs - they are not released immediately, but only after the next commit. + * This is needed to guarantee recoverability. + */ +struct ubifs_gced_idx_leb { + struct list_head list; + int lnum; + int unmap; +}; + +/** + * struct ubifs_inode - UBIFS in-memory inode description. + * @vfs_inode: VFS inode description object + * @creat_sqnum: sequence number at time of creation + * @xattr_size: summarized size of all extended attributes in bytes + * @xattr_cnt: count of extended attributes this inode has + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @dirty: non-zero if the inode is dirty + * @xattr: non-zero if this is an extended attribute inode + * @ui_mutex: serializes inode write-back with the rest of VFS operations, + * serializes "clean <-> dirty" state changes, protects @dirty, + * @ui_size, and @xattr_size + * @ui_lock: protects @synced_i_size + * @synced_i_size: synchronized size of inode, i.e. 
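The scan structures above describe a fully parsed LEB: ubifs_scan() returns a struct ubifs_scan_leb whose @nodes list carries one struct ubifs_scan_node per node found. A sketch of walking that list with the kernel list iterator:

/*
 * Illustrative only: count how many data nodes a scanned LEB contains.
 */
static inline int example_count_data_nodes(struct ubifs_scan_leb *sleb)
{
        struct ubifs_scan_node *snod;
        int cnt = 0;

        list_for_each_entry(snod, &sleb->nodes, list)
                if (snod->type == UBIFS_DATA_NODE)
                        cnt += 1;
        return cnt;
}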
the value of inode size + * currently stored on the flash; used only for regular file + * inodes + * @ui_size: inode size used by UBIFS when writing to flash + * @flags: inode flags (@UBIFS_COMPR_FL, etc) + * @compr_type: default compression type used for this inode + * @data_len: length of the data attached to the inode + * @data: inode's data + * + * @ui_mutex exists for two main reasons. At first it prevents inodes from + * being written back while UBIFS changing them, being in the middle of an VFS + * operation. This way UBIFS makes sure the inode fields are consistent. For + * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * write-back must not write any of them before we have finished. + * + * The second reason is budgeting - UBIFS has to budget all operations. If an + * operation is going to mark an inode dirty, it has to allocate budget for + * this. It cannot just mark it dirty because there is no guarantee there will + * be enough flash space to write the inode back later. This means UBIFS has + * to have full control over inode "clean <-> dirty" transitions (and pages + * actually). But unfortunately, VFS marks inodes dirty in many places, and it + * does not ask the file-system if it is allowed to do so (there is a notifier, + * but it is not enough), i.e., there is no mechanism to synchronize with this. + * So UBIFS has its own inode dirty flag and its own mutex to serialize + * "clean <-> dirty" transitions. + * + * The @synced_i_size field is used to make sure we never write pages which are + * beyond last synchronized inode size. See 'ubifs_writepage()' for more + * information. + * + * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses + * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot + * make sure @inode->i_size is always changed under @ui_mutex, because it + * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock + * with 'ubifs_writepage()' (see file.c). All the other inode fields are + * changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * could consider to rework locking and base it on "shadow" fields. + */ +struct ubifs_inode { + struct inode vfs_inode; + unsigned long long creat_sqnum; + unsigned int xattr_size; + unsigned int xattr_cnt; + unsigned int xattr_names; + unsigned int dirty:1; + unsigned int xattr:1; + struct mutex ui_mutex; + spinlock_t ui_lock; + loff_t synced_i_size; + loff_t ui_size; + int flags; + int compr_type; + int data_len; + void *data; +}; + +/** + * struct ubifs_unclean_leb - records a LEB recovered under read-only mode. + * @list: list + * @lnum: LEB number of recovered LEB + * @endpt: offset where recovery ended + * + * This structure records a LEB identified during recovery that needs to be + * cleaned but was not because UBIFS was mounted read-only. The information + * is used to clean the LEB when remounting to read-write mode. + */ +struct ubifs_unclean_leb { + struct list_head list; + int lnum; + int endpt; +}; + +/* + * LEB properties flags. 
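Since @vfs_inode is embedded rather than pointed to, code that holds a struct inode recovers the UBIFS-specific part with container_of(). A typical embedding accessor, shown here as a sketch (any real helper lives elsewhere in this header):

/*
 * Sketch of the usual embedding accessor: map a VFS inode back to the
 * enclosing UBIFS inode.  Works because @vfs_inode is embedded in
 * struct ubifs_inode, not referenced by pointer.
 */
static inline struct ubifs_inode *example_ubifs_inode(const struct inode *inode)
{
        return container_of(inode, struct ubifs_inode, vfs_inode);
}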
+ * + * LPROPS_UNCAT: not categorized + * LPROPS_DIRTY: dirty > 0, not index + * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index + * LPROPS_FREE: free > 0, not empty, not index + * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs + * LPROPS_EMPTY: LEB is empty, not taken + * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken + * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken + * LPROPS_CAT_MASK: mask for the LEB categories above + * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media) + * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash) + */ +enum { + LPROPS_UNCAT = 0, + LPROPS_DIRTY = 1, + LPROPS_DIRTY_IDX = 2, + LPROPS_FREE = 3, + LPROPS_HEAP_CNT = 3, + LPROPS_EMPTY = 4, + LPROPS_FREEABLE = 5, + LPROPS_FRDI_IDX = 6, + LPROPS_CAT_MASK = 15, + LPROPS_TAKEN = 16, + LPROPS_INDEX = 32, +}; + +/** + * struct ubifs_lprops - logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @flags: LEB properties flags (see above) + * @lnum: LEB number + * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE) + * @hpos: heap position in heap of same-category lprops (other categories) + */ +struct ubifs_lprops { + int free; + int dirty; + int flags; + int lnum; + union { + struct list_head list; + int hpos; + }; +}; + +/** + * struct ubifs_lpt_lprops - LPT logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @tgc: trivial GC flag (1 => unmap after commit end) + * @cmt: commit flag (1 => reserved for commit) + */ +struct ubifs_lpt_lprops { + int free; + int dirty; + unsigned tgc : 1; + unsigned cmt : 1; +}; + +/** + * struct ubifs_lp_stats - statistics of eraseblocks in the main area. + * @empty_lebs: number of empty LEBs + * @taken_empty_lebs: number of taken LEBs + * @idx_lebs: number of indexing LEBs + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * + * N.B. total_dirty and total_used are different to other total_* fields, + * because they account _all_ LEBs, not just data LEBs. + * + * 'taken_empty_lebs' counts the LEBs that are in the transient state of having + * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed + * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used + * by itself (in which case 'unused_lebs' would be a better name). In the case + * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC, + * but unlike other empty LEBs that are 'taken', it may not be written straight + * away (i.e. before the next commit start or unmount), so either gc_lnum must + * be specially accounted for, or the current approach followed i.e. count it + * under 'taken_empty_lebs'. + */ +struct ubifs_lp_stats { + int empty_lebs; + int taken_empty_lebs; + int idx_lebs; + long long total_free; + long long total_dirty; + long long total_used; + long long total_dead; + long long total_dark; +}; + +struct ubifs_nnode; + +/** + * struct ubifs_cnode - LEB Properties Tree common node. 
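The LPROPS_* values above pack a category into the low bits plus two independent flag bits, so decoding an lprops @flags word is a mask-and-test, as sketched below:

/*
 * Illustrative only: split an lprops @flags word into its category and
 * the two stand-alone bits defined above.
 */
static inline void example_decode_lprops_flags(const struct ubifs_lprops *lp,
                                               int *cat, int *taken, int *index)
{
        *cat = lp->flags & LPROPS_CAT_MASK;     /* LPROPS_UNCAT .. LPROPS_FRDI_IDX */
        *taken = !!(lp->flags & LPROPS_TAKEN);  /* in use, never stored on media   */
        *index = !!(lp->flags & LPROPS_INDEX);  /* LEB holds indexing nodes        */
}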
+ * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (zero for pnodes, greater than zero for nnodes) + * @num: node number + */ +struct ubifs_cnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; +}; + +/** + * struct ubifs_pnode - LEB Properties Tree leaf node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always zero for pnodes) + * @num: node number + * @lprops: LEB properties array + */ +struct ubifs_pnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_lprops lprops[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_nbranch - LEB Properties Tree internal node branch. + * @lnum: LEB number of child + * @offs: offset of child + * @nnode: nnode child + * @pnode: pnode child + * @cnode: cnode child + */ +struct ubifs_nbranch { + int lnum; + int offs; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + }; +}; + +/** + * struct ubifs_nnode - LEB Properties Tree internal node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always greater than zero for nnodes) + * @num: node number + * @nbranch: branches to child nodes + */ +struct ubifs_nnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_lpt_heap - heap of categorized lprops. + * @arr: heap array + * @cnt: number in heap + * @max_cnt: maximum number allowed in heap + * + * There are %LPROPS_HEAP_CNT heaps. + */ +struct ubifs_lpt_heap { + struct ubifs_lprops **arr; + int cnt; + int max_cnt; +}; + +/* + * Return codes for LPT scan callback function. + * + * LPT_SCAN_CONTINUE: continue scanning + * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory + * LPT_SCAN_STOP: stop scanning + */ +enum { + LPT_SCAN_CONTINUE = 0, + LPT_SCAN_ADD = 1, + LPT_SCAN_STOP = 2, +}; + +struct ubifs_info; + +/* Callback used by the 'ubifs_lpt_scan_nolock()' function */ +typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, + const struct ubifs_lprops *lprops, + int in_tree, void *data); + +/** + * struct ubifs_wbuf - UBIFS write-buffer. + * @c: UBIFS file-system description object + * @buf: write-buffer (of min. flash I/O unit size) + * @lnum: logical eraseblock number the write-buffer points to + * @offs: write-buffer offset in this logical eraseblock + * @avail: number of bytes available in the write-buffer + * @used: number of used bytes in the write-buffer + * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, + * %UBI_UNKNOWN) + * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep + * up by 'mutex_lock_nested()). 
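The callback type above is what ubifs_lpt_scan_nolock() (declared near the end of this header) invokes for each LEB it visits. A sketch of a callback that tallies fully free, untaken LEBs and keeps scanning:

/*
 * Illustrative only: an LPT scan callback that counts free, untaken LEBs.
 * Pass a pointer to an int as @data when calling ubifs_lpt_scan_nolock().
 */
static int example_count_free_cb(struct ubifs_info *c,
                                 const struct ubifs_lprops *lprops,
                                 int in_tree, void *data)
{
        int *free_cnt = data;

        if (lprops->free == c->leb_size && !(lprops->flags & LPROPS_TAKEN))
                *free_cnt += 1;
        return LPT_SCAN_CONTINUE;
}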
+ * @sync_callback: write-buffer synchronization callback + * @io_mutex: serializes write-buffer I/O + * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes + * fields + * @timer: write-buffer timer + * @timeout: timer expire interval in jiffies + * @need_sync: it is set if its timer expired and needs sync + * @next_ino: points to the next position of the following inode number + * @inodes: stores the inode numbers of the nodes which are in wbuf + * + * The write-buffer synchronization callback is called when the write-buffer is + * synchronized in order to notify how much space was wasted due to + * write-buffer padding and how much free space is left in the LEB. + * + * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under + * spin-lock or mutex because they are written under both mutex and spin-lock. + * @buf is appended to under mutex but overwritten under both mutex and + * spin-lock. Thus the data between @buf and @buf + @used can be read under + * spinlock. + */ +struct ubifs_wbuf { + struct ubifs_info *c; + void *buf; + int lnum; + int offs; + int avail; + int used; + int dtype; + int jhead; + int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); + struct mutex io_mutex; + spinlock_t lock; + struct timer_list timer; + int timeout; + int need_sync; + int next_ino; + ino_t *inodes; +}; + +/** + * struct ubifs_bud - bud logical eraseblock. + * @lnum: logical eraseblock number + * @start: where the (uncommitted) bud data starts + * @jhead: journal head number this bud belongs to + * @list: link in the list buds belonging to the same journal head + * @rb: link in the tree of all buds + */ +struct ubifs_bud { + int lnum; + int start; + int jhead; + struct list_head list; + struct rb_node rb; +}; + +/** + * struct ubifs_jhead - journal head. + * @wbuf: head's write-buffer + * @buds_list: list of bud LEBs belonging to this journal head + * + * Note, the @buds list is protected by the @c->buds_lock. + */ +struct ubifs_jhead { + struct ubifs_wbuf wbuf; + struct list_head buds_list; +}; + +/** + * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. + * @key: key + * @znode: znode address in memory + * @lnum: LEB number of the indexing node + * @offs: offset of the indexing node within @lnum + * @len: target node length + */ +struct ubifs_zbranch { + union ubifs_key key; + union { + struct ubifs_znode *znode; + void *leaf; + }; + int lnum; + int offs; + int len; +}; + +/** + * struct ubifs_znode - in-memory representation of an indexing node. + * @parent: parent znode or NULL if it is the root + * @cnext: next znode to commit + * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) + * @time: last access time (seconds) + * @level: level of the entry in the TNC tree + * @child_cnt: count of child znodes + * @iip: index in parent's zbranch array + * @alt: lower bound of key range has altered i.e. child inserted at slot 0 + * @lnum: LEB number of the corresponding indexing node + * @offs: offset of the corresponding indexing node + * @len: length of the corresponding indexing node + * @zbranch: array of znode branches (@c->fanout elements) + */ +struct ubifs_znode { + struct ubifs_znode *parent; + struct ubifs_znode *cnext; + unsigned long flags; + unsigned long time; + int level; + int child_cnt; + int iip; + int alt; +#ifdef CONFIG_UBIFS_FS_DEBUG + int lnum, offs, len; +#endif + struct ubifs_zbranch zbranch[]; +}; + +/** + * struct ubifs_node_range - node length range description data structure. 
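A zbranch above either carries a pointer to a child znode already in memory or only the (@lnum, @offs, @len) of the indexing node on flash. For a znode above leaf level, the usual load-on-demand step looks like the sketch below, using ubifs_load_znode() from the tnc_misc.c declarations further down:

/*
 * Illustrative only: return the child znode for branch @n of @znode
 * (a znode at level > 0, so the branch refers to another znode), reading
 * it from flash if it has not been loaded yet.
 */
static inline struct ubifs_znode *example_get_child(struct ubifs_info *c,
                                                    struct ubifs_znode *znode,
                                                    int n)
{
        struct ubifs_zbranch *zbr = &znode->zbranch[n];

        if (zbr->znode)
                return zbr->znode;              /* already in memory */
        /* Not cached -- read it from (zbr->lnum, zbr->offs) */
        return ubifs_load_znode(c, zbr, znode, n);
}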
+ * @len: fixed node length + * @min_len: minimum possible node length + * @max_len: maximum possible node length + * + * If @max_len is %0, the node has fixed length @len. + */ +struct ubifs_node_range { + union { + int len; + int min_len; + }; + int max_len; +}; + +/** + * struct ubifs_compressor - UBIFS compressor description structure. + * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) + * @cc: cryptoapi compressor handle + * @comp_mutex: mutex used during compression + * @decomp_mutex: mutex used during decompression + * @name: compressor name + * @capi_name: cryptoapi compressor name + */ +struct ubifs_compressor { + int compr_type; + struct crypto_comp *cc; + struct mutex *comp_mutex; + struct mutex *decomp_mutex; + const char *name; + const char *capi_name; +}; + +/** + * struct ubifs_budget_req - budget requirements of an operation. + * + * @fast: non-zero if the budgeting should try to aquire budget quickly and + * should not try to call write-back + * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields + * have to be re-calculated + * @new_page: non-zero if the operation adds a new page + * @dirtied_page: non-zero if the operation makes a page dirty + * @new_dent: non-zero if the operation adds a new directory entry + * @mod_dent: non-zero if the operation removes or modifies an existing + * directory entry + * @new_ino: non-zero if the operation adds a new inode + * @new_ino_d: now much data newly created inode contains + * @dirtied_ino: how many inodes the operation makes dirty + * @dirtied_ino_d: now much data dirtied inode contains + * @idx_growth: how much the index will supposedly grow + * @data_growth: how much new data the operation will supposedly add + * @dd_growth: how much data that makes other data dirty the operation will + * supposedly add + * + * @idx_growth, @data_growth and @dd_growth are not used in budget request. The + * budgeting subsystem caches index and data growth values there to avoid + * re-calculating them when the budget is released. However, if @idx_growth is + * %-1, it is calculated by the release function using other fields. + * + * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d + * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made + * dirty by the re-name operation. + */ +struct ubifs_budget_req { + unsigned int fast:1; + unsigned int recalculate:1; + unsigned int new_page:1; + unsigned int dirtied_page:1; + unsigned int new_dent:1; + unsigned int mod_dent:1; + unsigned int new_ino:1; + unsigned int new_ino_d:13; +#ifndef UBIFS_DEBUG + unsigned int dirtied_ino:4; + unsigned int dirtied_ino_d:15; +#else + /* Not bit-fields to check for overflows */ + unsigned int dirtied_ino; + unsigned int dirtied_ino_d; +#endif + int idx_growth; + int data_growth; + int dd_growth; +}; + +/** + * struct ubifs_orphan - stores the inode number of an orphan. + * @rb: rb-tree node of rb-tree of orphans sorted by inode number + * @list: list head of list of orphans in order added + * @new_list: list head of list of orphans added since the last commit + * @cnext: next orphan to commit + * @dnext: next orphan to delete + * @inum: inode number + * @new: %1 => added since the last commit, otherwise %0 + */ +struct ubifs_orphan { + struct rb_node rb; + struct list_head list; + struct list_head new_list; + struct ubifs_orphan *cnext; + struct ubifs_orphan *dnext; + ino_t inum; + int new; +}; + +/** + * struct ubifs_mount_opts - UBIFS-specific mount options information. 
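struct ubifs_budget_req above is how an operation asks the budgeting code for space before dirtying anything. A sketch of the call pattern around ubifs_budget_space() and ubifs_release_budget() (declared in the budget.c block below) for an operation that adds one directory entry and dirties one inode; the field choices are an assumption for illustration:

/*
 * Illustrative only: budget for "add one dent, dirty one inode carrying
 * @ino_data bytes of inline data", do the work, then release the budget.
 */
static int example_budgeted_op(struct ubifs_info *c, unsigned int ino_data)
{
        struct ubifs_budget_req req = {
                .new_dent = 1,
                .dirtied_ino = 1,
                .dirtied_ino_d = ino_data,
        };
        int err;

        err = ubifs_budget_space(c, &req);
        if (err)
                return err;             /* typically no flash space left */

        /* ... journal the change here ... */

        ubifs_release_budget(c, &req);
        return 0;
}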
+ * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + */ +struct ubifs_mount_opts { + unsigned int unmount_mode:2; +}; + +/** + * struct ubifs_info - UBIFS file-system description data structure + * (per-superblock). + * @vfs_sb: VFS @struct super_block object + * @bdi: backing device info object to make VFS happy and disable readahead + * + * @highest_inum: highest used inode number + * @vfs_gen: VFS inode generation counter + * @max_sqnum: current global sequence number + * @cmt_no: commit number (last successfully completed commit) + * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters + * @fmt_version: UBIFS on-flash format version + * @uuid: UUID from super block + * + * @lhead_lnum: log head logical eraseblock number + * @lhead_offs: log head offset + * @ltail_lnum: log tail logical eraseblock number (offset is always 0) + * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and + * @bud_bytes + * @min_log_bytes: minimum required number of bytes in the log + * @cmt_bud_bytes: used during commit to temporarily amount of bytes in + * committed buds + * + * @buds: tree of all buds indexed by bud LEB number + * @bud_bytes: how many bytes of flash is used by buds + * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud + * lists + * @jhead_cnt: count of journal heads + * @jheads: journal heads (head zero is base head) + * @max_bud_bytes: maximum number of bytes allowed in buds + * @bg_bud_bytes: number of bud bytes when background commit is initiated + * @old_buds: buds to be released after commit ends + * @max_bud_cnt: maximum number of buds + * + * @commit_sem: synchronizes committer with other processes + * @cmt_state: commit state + * @cs_lock: commit state lock + * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * @fast_unmount: do not run journal commit before un-mounting + * @big_lpt: flag that LPT is too big to write whole during commit + * @check_lpt_free: flag that indicates LPT GC may be needed + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * + * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and + * @calc_idx_sz + * @zroot: zbranch which points to the root index node and znode + * @cnext: next znode to commit + * @enext: next znode to commit to empty space + * @gap_lebs: array of LEBs used by the in-gaps commit method + * @cbuf: commit buffer + * @ileb_buf: buffer for commit in-the-gaps method + * @ileb_len: length of data in ileb_buf + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @ilebs: pre-allocated index LEBs + * @ileb_cnt: number of pre-allocated index LEBs + * @ileb_nxt: next pre-allocated index LEBs + * @old_idx: tree of index nodes obsoleted since the last commit start + * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c + * @new_ihead_lnum: used by debugging to check ihead_lnum + * @new_ihead_offs: used by debugging to check ihead_offs + * + * @mst_node: master node + * @mst_offs: offset of valid master node + * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * + * @log_lebs: number of logical eraseblocks in the log + * @log_bytes: log size in bytes + * @log_last: last LEB of the log + * @lpt_lebs: number of LEBs used for lprops table + * @lpt_first: first LEB of the lprops table area + * @lpt_last: last LEB 
of the lprops table area + * @orph_lebs: number of LEBs used for the orphan area + * @orph_first: first LEB of the orphan area + * @orph_last: last LEB of the orphan area + * @main_lebs: count of LEBs in the main area + * @main_first: first LEB of the main area + * @main_bytes: main area size in bytes + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * + * @key_hash_type: type of the key hash + * @key_hash: direntry key hash function + * @key_fmt: key format + * @key_len: key length + * @fanout: fanout of the index tree (number of links per indexing node) + * + * @min_io_size: minimal input/output unit size + * @min_io_shift: number of bits in @min_io_size minus one + * @leb_size: logical eraseblock size in bytes + * @half_leb_size: half LEB size + * @leb_cnt: count of logical eraseblocks + * @max_leb_cnt: maximum count of logical eraseblocks + * @old_leb_cnt: count of logical eraseblocks before re-size + * @ro_media: the underlying UBI volume is read-only + * + * @dirty_pg_cnt: number of dirty pages (not used) + * @dirty_zn_cnt: number of dirty znodes + * @clean_zn_cnt: number of clean znodes + * + * @budg_idx_growth: amount of bytes budgeted for index growth + * @budg_data_growth: amount of bytes budgeted for cached data + * @budg_dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, + * but which still have to be taken into account because + * the index has not been committed so far + * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, + * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst; + * @min_idx_lebs: minimum number of LEBs required for the index + * @old_idx_sz: size of index on flash + * @calc_idx_sz: temporary variable which is used to calculate new index size + * (contains accurate new index size at end of TNC commit start) + * @lst: lprops statistics + * + * @page_budget: budget for a page + * @inode_budget: budget for an inode + * @dent_budget: budget for a directory entry + * + * @ref_node_alsz: size of the LEB reference node aligned to the min. 
flash + * I/O unit + * @mst_node_alsz: master node aligned size + * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary + * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary + * @max_inode_sz: maximum possible inode size in bytes + * @max_znode_sz: size of znode in bytes + * @dead_wm: LEB dead space watermark + * @dark_wm: LEB dark space watermark + * @block_cnt: count of 4KiB blocks on the FS + * + * @ranges: UBIFS node length ranges + * @ubi: UBI volume descriptor + * @di: UBI device information + * @vi: UBI volume information + * + * @orph_tree: rb-tree of orphan inode numbers + * @orph_list: list of orphan inode numbers in order added + * @orph_new: list of orphan inode numbers added since last commit + * @orph_cnext: next orphan to commit + * @orph_dnext: next orphan to delete + * @orphan_lock: lock for orph_tree and orph_new + * @orph_buf: buffer for orphan nodes + * @new_orphans: number of orphans since last commit + * @cmt_orphans: number of orphans being committed + * @tot_orphans: number of orphans in the rb_tree + * @max_orphans: maximum number of orphans allowed + * @ohead_lnum: orphan head LEB number + * @ohead_offs: orphan head offset + * @no_orphs: non-zero if there are no orphans + * + * @bgt: UBIFS background thread + * @bgt_name: background thread name + * @need_bgt: if background thread should run + * @need_wbuf_sync: if write-buffers have to be synchronized + * + * @gc_lnum: LEB number used for garbage collection + * @sbuf: a buffer of LEB size used by GC and replay for scanning + * @idx_gc: list of index LEBs that have been garbage collected + * @idx_gc_cnt: number of elements on the idx_gc list + * + * @infos_list: links all 'ubifs_info' objects + * @umount_mutex: serializes shrinker and un-mount + * @shrinker_run_no: shrinker run number + * + * @space_bits: number of bits needed to record free or dirty space + * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT + * @lpt_offs_bits: number of bits needed to record an offset in the LPT + * @lpt_spc_bits: number of bits needed to space in the LPT + * @pcnt_bits: number of bits needed to record pnode or nnode number + * @lnum_bits: number of bits needed to record LEB number + * @nnode_sz: size of on-flash nnode + * @pnode_sz: size of on-flash pnode + * @ltab_sz: size of on-flash LPT lprops table + * @lsave_sz: size of on-flash LPT save table + * @pnode_cnt: number of pnodes + * @nnode_cnt: number of nnodes + * @lpt_hght: height of the LPT + * @pnodes_have: number of pnodes in memory + * + * @lp_mutex: protects lprops table and all the other lprops-related fields + * @lpt_lnum: LEB number of the root nnode of the LPT + * @lpt_offs: offset of the root nnode of the LPT + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. 
ltab + * @dirty_nn_cnt: number of dirty nnodes + * @dirty_pn_cnt: number of dirty pnodes + * @lpt_sz: LPT size + * @lpt_nod_buf: buffer for an on-flash nnode or pnode + * @lpt_buf: buffer of LEB size used by LPT + * @nroot: address in memory of the root nnode of the LPT + * @lpt_cnext: next LPT node to commit + * @lpt_heap: array of heaps of categorized lprops + * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at + * previous commit start + * @uncat_list: list of un-categorized LEBs + * @empty_list: list of empty LEBs + * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) + * @freeable_cnt: number of freeable LEBs in @freeable_list + * + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @ltab: LPT's own lprops table + * @ltab_cmt: LPT's own lprops table (commit copy) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @lsave_lnum: LEB number of LPT's save table + * @lsave_offs: offset of LPT's save table + * @lsave: LPT's save table + * @lscan_lnum: LEB number of last LPT scan + * + * @rp_size: size of the reserved pool in bytes + * @report_rp_size: size of the reserved pool reported to user-space + * @rp_uid: reserved pool user ID + * @rp_gid: reserved pool group ID + * + * @empty: if the UBI device is empty + * @replay_tree: temporary tree used during journal replay + * @replay_list: temporary list used during journal replay + * @replay_buds: list of buds to replay + * @cs_sqnum: sequence number of first node in the log (commit start node) + * @replay_sqnum: sequence number of node currently being replayed + * @need_recovery: file-system needs recovery + * @replaying: set to %1 during journal replay + * @unclean_leb_list: LEBs to recover when mounting ro to rw + * @rcvrd_mst_node: recovered master node to write when mounting ro to rw + * @size_tree: inode size information for recovery + * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) + * @mount_opts: UBIFS-specific mount options + * + * @dbg_buf: a buffer of LEB size used for debugging purposes + * @old_zroot: old index root - used by 'dbg_check_old_index()' + * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' + * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' + * @failure_mode: failure mode for recovery testing + * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @fail_timeout: time in jiffies when delay of failure mode expires + * @fail_cnt: current number of calls to failure mode I/O functions + * @fail_cnt_max: number of calls by which to delay failure mode + */ +struct ubifs_info { + struct super_block *vfs_sb; + struct backing_dev_info bdi; + + ino_t highest_inum; + unsigned int vfs_gen; + unsigned long long max_sqnum; + unsigned long long cmt_no; + spinlock_t cnt_lock; + int fmt_version; + unsigned char uuid[16]; + + int lhead_lnum; + int lhead_offs; + int ltail_lnum; + struct mutex log_mutex; + int min_log_bytes; + long long cmt_bud_bytes; + + struct rb_root buds; + long long bud_bytes; + spinlock_t buds_lock; + int jhead_cnt; + struct ubifs_jhead *jheads; + long long max_bud_bytes; + long long bg_bud_bytes; + struct list_head old_buds; + int max_bud_cnt; + + struct rw_semaphore commit_sem; + int cmt_state; + spinlock_t cs_lock; + wait_queue_head_t cmt_wq; + unsigned int fast_unmount:1; + unsigned int big_lpt:1; + unsigned int 
check_lpt_free:1; + unsigned int nospace:1; + unsigned int nospace_rp:1; + + struct mutex tnc_mutex; + struct ubifs_zbranch zroot; + struct ubifs_znode *cnext; + struct ubifs_znode *enext; + int *gap_lebs; + void *cbuf; + void *ileb_buf; + int ileb_len; + int ihead_lnum; + int ihead_offs; + int *ilebs; + int ileb_cnt; + int ileb_nxt; + struct rb_root old_idx; + int *bottom_up_buf; +#ifdef CONFIG_UBIFS_FS_DEBUG + int new_ihead_lnum; + int new_ihead_offs; +#endif + + struct ubifs_mst_node *mst_node; + int mst_offs; + struct mutex mst_mutex; + + int log_lebs; + long long log_bytes; + int log_last; + int lpt_lebs; + int lpt_first; + int lpt_last; + int orph_lebs; + int orph_first; + int orph_last; + int main_lebs; + int main_first; + long long main_bytes; + int default_compr; + + uint8_t key_hash_type; + uint32_t (*key_hash)(const char *str, int len); + int key_fmt; + int key_len; + int fanout; + + int min_io_size; + int min_io_shift; + int leb_size; + int half_leb_size; + int leb_cnt; + int max_leb_cnt; + int old_leb_cnt; + int ro_media; + + atomic_long_t dirty_pg_cnt; + atomic_long_t dirty_zn_cnt; + atomic_long_t clean_zn_cnt; + + long long budg_idx_growth; + long long budg_data_growth; + long long budg_dd_growth; + long long budg_uncommitted_idx; + spinlock_t space_lock; + int min_idx_lebs; + unsigned long long old_idx_sz; + unsigned long long calc_idx_sz; + struct ubifs_lp_stats lst; + + int page_budget; + int inode_budget; + int dent_budget; + + int ref_node_alsz; + int mst_node_alsz; + int min_idx_node_sz; + int max_idx_node_sz; + long long max_inode_sz; + int max_znode_sz; + int dead_wm; + int dark_wm; + int block_cnt; + + struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT]; + struct ubi_volume_desc *ubi; + struct ubi_device_info di; + struct ubi_volume_info vi; + + struct rb_root orph_tree; + struct list_head orph_list; + struct list_head orph_new; + struct ubifs_orphan *orph_cnext; + struct ubifs_orphan *orph_dnext; + spinlock_t orphan_lock; + void *orph_buf; + int new_orphans; + int cmt_orphans; + int tot_orphans; + int max_orphans; + int ohead_lnum; + int ohead_offs; + int no_orphs; + + struct task_struct *bgt; + char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; + int need_bgt; + int need_wbuf_sync; + + int gc_lnum; + void *sbuf; + struct list_head idx_gc; + int idx_gc_cnt; + + struct list_head infos_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + int space_bits; + int lpt_lnum_bits; + int lpt_offs_bits; + int lpt_spc_bits; + int pcnt_bits; + int lnum_bits; + int nnode_sz; + int pnode_sz; + int ltab_sz; + int lsave_sz; + int pnode_cnt; + int nnode_cnt; + int lpt_hght; + int pnodes_have; + + struct mutex lp_mutex; + int lpt_lnum; + int lpt_offs; + int nhead_lnum; + int nhead_offs; + int lpt_drty_flgs; + int dirty_nn_cnt; + int dirty_pn_cnt; + long long lpt_sz; + void *lpt_nod_buf; + void *lpt_buf; + struct ubifs_nnode *nroot; + struct ubifs_cnode *lpt_cnext; + struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT]; + struct ubifs_lpt_heap dirty_idx; + struct list_head uncat_list; + struct list_head empty_list; + struct list_head freeable_list; + struct list_head frdi_idx_list; + int freeable_cnt; + + int ltab_lnum; + int ltab_offs; + struct ubifs_lpt_lprops *ltab; + struct ubifs_lpt_lprops *ltab_cmt; + int lsave_cnt; + int lsave_lnum; + int lsave_offs; + int *lsave; + int lscan_lnum; + + long long rp_size; + long long report_rp_size; + uid_t rp_uid; + gid_t rp_gid; + + /* The below fields are used only during mounting and re-mounting */ + int empty; + struct rb_root 
replay_tree; + struct list_head replay_list; + struct list_head replay_buds; + unsigned long long cs_sqnum; + unsigned long long replay_sqnum; + int need_recovery; + int replaying; + struct list_head unclean_leb_list; + struct ubifs_mst_node *rcvrd_mst_node; + struct rb_root size_tree; + int remounting_rw; + struct ubifs_mount_opts mount_opts; + +#ifdef CONFIG_UBIFS_FS_DEBUG + void *dbg_buf; + struct ubifs_zbranch old_zroot; + int old_zroot_level; + unsigned long long old_zroot_sqnum; + int failure_mode; + int fail_delay; + unsigned long fail_timeout; + unsigned int fail_cnt; + unsigned int fail_cnt_max; +#endif +}; + +extern struct list_head ubifs_infos; +extern spinlock_t ubifs_infos_lock; +extern atomic_long_t ubifs_clean_zn_cnt; +extern struct kmem_cache *ubifs_inode_slab; +extern struct super_operations ubifs_super_operations; +extern struct address_space_operations ubifs_file_address_operations; +extern struct file_operations ubifs_file_operations; +extern struct inode_operations ubifs_file_inode_operations; +extern struct file_operations ubifs_dir_operations; +extern struct inode_operations ubifs_dir_inode_operations; +extern struct inode_operations ubifs_symlink_inode_operations; +extern struct backing_dev_info ubifs_backing_dev_info; +extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/* io.c */ +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype); +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs); +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs); +int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, + int offs, int dtype); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet); +void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); +int ubifs_io_init(struct ubifs_info *c); +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf); +int ubifs_bg_wbufs_sync(struct ubifs_info *c); +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum); +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); + +/* scan.c */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet); +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs); +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs); +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf); + +/* log.c */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud); +void ubifs_create_buds_lists(struct ubifs_info *c); +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs); +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum); +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum); +int 
ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum); +int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum); +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum); +int ubifs_consolidate_log(struct ubifs_info *c); + +/* journal.c */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent); +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, + int last_reference); +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync); +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size); +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm); +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, + const struct inode *inode2); + +/* budget.c */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui); +int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +long long ubifs_budg_get_free_space(struct ubifs_info *c); +int ubifs_calc_min_idx_lebs(struct ubifs_info *c); +void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); + +/* find.c */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, + int squeeze); +int ubifs_find_free_leb_for_idx(struct ubifs_info *c); +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free); +int ubifs_find_dirty_idx_leb(struct ubifs_info *c); +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); + +/* tnc.c */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n); +int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, + void *node); +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm); +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs); +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len); +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len); +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm); +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm); +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key); +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); +struct ubifs_dent_node 
*ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm); +void ubifs_tnc_close(struct ubifs_info *c); +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx); +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +/* Shared by tnc.c for tnc_commit.c */ +void destroy_old_idx(struct ubifs_info *c); +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); + +/* tnc_misc.c */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode); +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n); +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode); +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode); +long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip); +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node); + +/* tnc_commit.c */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int ubifs_tnc_end_commit(struct ubifs_info *c); + +/* shrinker.c */ +int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); + +/* commit.c */ +int ubifs_bg_thread(void *info); +void ubifs_commit_required(struct ubifs_info *c); +void ubifs_request_bg_commit(struct ubifs_info *c); +int ubifs_run_commit(struct ubifs_info *c); +void ubifs_recovery_commit(struct ubifs_info *c); +int ubifs_gc_should_commit(struct ubifs_info *c); +void ubifs_wait_for_commit(struct ubifs_info *c); + +/* master.c */ +int ubifs_read_master(struct ubifs_info *c); +int ubifs_write_master(struct ubifs_info *c); + +/* sb.c */ +int ubifs_read_superblock(struct ubifs_info *c); +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); + +/* replay.c */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent); +int ubifs_replay_journal(struct ubifs_info *c); + +/* gc.c */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway); +int ubifs_gc_start_commit(struct ubifs_info *c); +int ubifs_gc_end_commit(struct ubifs_info *c); +void ubifs_destroy_idx_gc(struct ubifs_info *c); +int ubifs_get_idx_gc_leb(struct ubifs_info *c); +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp); + +/* orphan.c */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum); +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); +int ubifs_orphan_start_commit(struct ubifs_info *c); +int ubifs_orphan_end_commit(struct ubifs_info *c); +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); + +/* lpt.c */ +int ubifs_calc_lpt_geom(struct ubifs_info *c); +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt); +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data); + +/* Shared by lpt.c for 
lpt_commit.c */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave); +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab); +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode); +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); +struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); + +/* lpt_commit.c */ +int ubifs_lpt_start_commit(struct ubifs_info *c); +int ubifs_lpt_end_commit(struct ubifs_info *c); +int ubifs_lpt_post_commit(struct ubifs_info *c); +void ubifs_lpt_free(struct ubifs_info *c, int wr_only); + +/* lprops.c */ +void ubifs_get_lprops(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt); +void ubifs_release_lprops(struct ubifs_info *c); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat); +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops); +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops); +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops); +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt); +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean); +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp); +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); + +/* file.c */ +int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); +int ubifs_setattr(struct dentry *dentry, struct iattr *attr); + +/* dir.c */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode); +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ubifs_removexattr(struct dentry *dentry, const char *name); + +/* super.c */ +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); + +/* recovery.c */ +int ubifs_recover_master_node(struct ubifs_info *c); +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int grouped); +struct ubifs_scan_leb 
*ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf); +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_rcvry_gc_commit(struct ubifs_info *c); +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size); +int ubifs_recover_size(struct ubifs_info *c); +void ubifs_destroy_size_tree(struct ubifs_info *c); + +/* ioctl.c */ +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void ubifs_set_inode_flags(struct inode *inode); +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#endif + +/* compressor.c */ +int __init ubifs_compressors_init(void); +void __exit ubifs_compressors_exit(void); +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type); +int ubifs_decompress(const void *buf, int len, void *out, int *out_len, + int compr_type); + +#include "debug.h" +#include "misc.h" +#include "key.h" + +#endif /* !__UBIFS_H__ */ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c new file mode 100644 index 00000000000..1388a078e1a --- /dev/null +++ b/fs/ubifs/xattr.c @@ -0,0 +1,581 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS extended attributes support. + * + * Extended attributes are implemented as regular inodes with attached data, + * which limits extended attribute size to UBIFS block size (4KiB). Names of + * extended attributes are described by extended attribute entries (xentries), + * which are almost identical to directory entries, but have different key type. + * + * In other words, the situation with extended attributes is very similar to + * directories. Indeed, any inode (but of course not xattr inodes) may have a + * number of associated xentries, just like directory inodes have associated + * directory entries. Extended attribute entries store the name of the extended + * attribute, the host inode number, and the extended attribute inode number. + * Similarly, direntries store the name, the parent and the target inode + * numbers. Thus, most of the common UBIFS mechanisms may be re-used for + * extended attributes. + * + * The number of extended attributes is not limited, but there is Linux + * limitation on the maximum possible size of the list of all extended + * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure + * the sum of all extended attribute names of the inode does not exceed that + * limit. + * + * Extended attributes are synchronous, which means they are written to the + * flash media synchronously and there is no write-back for extended attribute + * inodes. 
The extended attribute values are not stored in compressed form on + * the media. + * + * Since extended attributes are represented by regular inodes, they are cached + * in the VFS inode cache. The xentries are cached in the LNC cache (see + * tnc.c). + * + * ACL support is not implemented. + */ + +#include +#include +#include "ubifs.h" + +/* + * Limit the number of extended attributes per inode so that the total size + * (xattr_size) is guaranteeded to fit in an 'unsigned int'. + */ +#define MAX_XATTRS_PER_INODE 65535 + +/* + * Extended attribute type constants. + * + * USER_XATTR: user extended attribute ("user.*") + * TRUSTED_XATTR: trusted extended attribute ("trusted.*) + * SECURITY_XATTR: security extended attribute ("security.*") + */ +enum { + USER_XATTR, + TRUSTED_XATTR, + SECURITY_XATTR, +}; + +static struct inode_operations none_inode_operations; +static struct address_space_operations none_address_operations; +static struct file_operations none_file_operations; + +/** + * create_xattr - create an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @nm: extended attribute name + * @value: extended attribute value + * @size: size of extended attribute value + * + * This is a helper function which creates an extended attribute of name @nm + * and value @value for inode @host. The host inode is also updated on flash + * because the ctime and extended attribute accounting data changes. This + * function returns zero in case of success and a negative error code in case + * of failure. + */ +static int create_xattr(struct ubifs_info *c, struct inode *host, + const struct qstr *nm, const void *value, int size) +{ + int err; + struct inode *inode; + struct ubifs_inode *ui, *host_ui = ubifs_inode(host); + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .new_ino_d = size, .dirtied_ino = 1, + .dirtied_ino_d = host_ui->data_len}; + + if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) + return -ENOSPC; + /* + * Linux limits the maximum size of the extended attribute names list + * to %XATTR_LIST_MAX. This means we should not allow creating more* + * extended attributes if the name list becomes larger. This limitation + * is artificial for UBIFS, though. + */ + if (host_ui->xattr_names + host_ui->xattr_cnt + + nm->len + 1 > XATTR_LIST_MAX) + return -ENOSPC; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + + mutex_lock(&host_ui->ui_mutex); + /* Re-define all operations to be "nothing" */ + inode->i_mapping->a_ops = &none_address_operations; + inode->i_op = &none_inode_operations; + inode->i_fop = &none_file_operations; + + inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; + ui = ubifs_inode(inode); + ui->xattr = 1; + ui->flags |= UBIFS_XATTR_FL; + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_unlock; + } + + memcpy(ui->data, value, size); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + host_ui->xattr_names += nm->len; + + /* + * We do not use i_size_write() because nobody can race with us as we + * are holding host @host->i_mutex - every xattr operation for this + * inode is serialized by it. 
+ */ + inode->i_size = ui->ui_size = size; + ui->data_len = size; + err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + insert_inode_hash(inode); + iput(inode); + return 0; + +out_cancel: + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(size); +out_unlock: + mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + return err; +} + +/** + * change_xattr - change an extended attribute. + * @c: UBIFS file-system description object + * @host: host inode + * @inode: extended attribute inode + * @value: extended attribute value + * @size: size of extended attribute value + * + * This helper function changes the value of extended attribute @inode with new + * data from @value. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int change_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const void *value, int size) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 2, + .dirtied_ino_d = size + host_ui->data_len }; + + ubifs_assert(ui->data_len == inode->i_size); + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + + kfree(ui->data); + ui->data = kmalloc(size, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_unlock; + } + + memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + /* + * It is important to write the host inode after the xattr inode + * because if the host inode gets synchronized (via 'fsync()'), then + * the extended attribute inode gets synchronized, because it goes + * before the host inode in the write-buffer. + */ + err = ubifs_jnl_change_xattr(c, inode, host); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + make_bad_inode(inode); +out_unlock: + mutex_unlock(&host_ui->ui_mutex); + ubifs_release_budget(c, &req); + return err; +} + +/** + * check_namespace - check extended attribute name-space. + * @nm: extended attribute name + * + * This function makes sure the extended attribute name belongs to one of the + * supported extended attribute name-spaces. Returns name-space index in case + * of success and a negative error code in case of failure. 
+ */ +static int check_namespace(const struct qstr *nm) +{ + int type; + + if (nm->len > UBIFS_MAX_NLEN) + return -ENAMETOOLONG; + + if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') + return -EINVAL; + type = TRUSTED_XATTR; + } else if (!strncmp(nm->name, XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN)) { + if (nm->name[XATTR_USER_PREFIX_LEN] == '\0') + return -EINVAL; + type = USER_XATTR; + } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') + return -EINVAL; + type = SECURITY_XATTR; + } else + return -EOPNOTSUPP; + + return type; +} + +static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) +{ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, inum); + if (IS_ERR(inode)) { + ubifs_err("dead extended attribute entry, error %d", + (int)PTR_ERR(inode)); + return inode; + } + if (ubifs_inode(inode)->xattr) + return inode; + ubifs_err("corrupt extended attribute entry"); + iput(inode); + return ERR_PTR(-EINVAL); +} + +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err, type; + + dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + + if (size > UBIFS_MAX_INO_DATA) + return -ERANGE; + + type = check_namespace(&nm); + if (type < 0) + return type; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + /* + * The extended attribute entries are stored in LNC, so multiple + * look-ups do not involve reading the flash. 
+ */ + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err != -ENOENT) + goto out_free; + + if (flags & XATTR_REPLACE) + /* We are asked not to create the xattr */ + err = -ENODATA; + else + err = create_xattr(c, host, &nm, value, size); + goto out_free; + } + + if (flags & XATTR_CREATE) { + /* We are asked not to replace the xattr */ + err = -EEXIST; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + err = change_xattr(c, host, inode, value, size); + iput(inode); + +out_free: + kfree(xent); + return err; +} + +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_inode *ui; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + mutex_lock(&host->i_mutex); + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_unlock; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_unlock; + } + + ui = ubifs_inode(inode); + ubifs_assert(inode->i_size == ui->data_len); + ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len); + + if (buf) { + /* If @buf is %NULL we are supposed to return the length */ + if (ui->data_len > size) { + dbg_err("buffer size %zd, xattr len %d", + size, ui->data_len); + err = -ERANGE; + goto out_iput; + } + + memcpy(buf, ui->data, ui->data_len); + } + err = ui->data_len; + +out_iput: + iput(inode); +out_unlock: + mutex_unlock(&host->i_mutex); + kfree(xent); + return err; +} + +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + union ubifs_key key; + struct inode *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_dent_node *xent, *pxent = NULL; + int err, len, written = 0; + struct qstr nm = { .name = NULL }; + + dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino, + dentry->d_name.len, dentry->d_name.name, size); + + len = host_ui->xattr_names + host_ui->xattr_cnt; + if (!buffer) + /* + * We should return the minimum buffer size which will fit a + * null-terminated list of all the extended attribute names. 
+ */ + return len; + + if (len > size) + return -ERANGE; + + lowest_xent_key(c, &key, host->i_ino); + + mutex_lock(&host->i_mutex); + while (1) { + int type; + + xent = ubifs_tnc_next_ent(c, &key, &nm); + if (unlikely(IS_ERR(xent))) { + err = PTR_ERR(xent); + break; + } + + nm.name = xent->name; + nm.len = le16_to_cpu(xent->nlen); + + type = check_namespace(&nm); + if (unlikely(type < 0)) { + err = type; + break; + } + + /* Show trusted namespace only for "power" users */ + if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) { + memcpy(buffer + written, nm.name, nm.len + 1); + written += nm.len + 1; + } + + kfree(pxent); + pxent = xent; + key_read(c, &xent->key, &key); + } + mutex_unlock(&host->i_mutex); + + kfree(pxent); + if (err != -ENOENT) { + ubifs_err("cannot find next direntry, error %d", err); + return err; + } + + ubifs_assert(written <= size); + return written; +} + +static int remove_xattr(struct ubifs_info *c, struct inode *host, + struct inode *inode, const struct qstr *nm) +{ + int err; + struct ubifs_inode *host_ui = ubifs_inode(host); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, + .dirtied_ino_d = host_ui->data_len }; + + ubifs_assert(ui->data_len == inode->i_size); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_cnt -= 1; + host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names -= nm->len; + + err = ubifs_jnl_delete_xattr(c, host, inode, nm); + if (err) + goto out_cancel; + mutex_unlock(&host_ui->ui_mutex); + + ubifs_release_budget(c, &req); + return 0; + +out_cancel: + host_ui->xattr_cnt += 1; + host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + mutex_unlock(&host_ui->ui_mutex); + ubifs_release_budget(c, &req); + make_bad_inode(inode); + return err; +} + +int ubifs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode, *host = dentry->d_inode; + struct ubifs_info *c = host->i_sb->s_fs_info; + struct qstr nm = { .name = name, .len = strlen(name) }; + struct ubifs_dent_node *xent; + union ubifs_key key; + int err; + + dbg_gen("xattr '%s', ino %lu ('%.*s')", name, + host->i_ino, dentry->d_name.len, dentry->d_name.name); + ubifs_assert(mutex_is_locked(&host->i_mutex)); + + err = check_namespace(&nm); + if (err < 0) + return err; + + xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); + if (!xent) + return -ENOMEM; + + xent_key_init(c, &key, host->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); + if (err) { + if (err == -ENOENT) + err = -ENODATA; + goto out_free; + } + + inode = iget_xattr(c, le64_to_cpu(xent->inum)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + ubifs_assert(inode->i_nlink == 1); + inode->i_nlink = 0; + err = remove_xattr(c, host, inode, &nm); + if (err) + inode->i_nlink = 1; + + /* If @i_nlink is 0, 'iput()' will delete the inode */ + iput(inode); + +out_free: + kfree(xent); + return err; +} -- cgit v1.2.3-70-g09d2 From 0d7eff873caaeac84de01a1acdca983d2c7ba3fe Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 14 Jul 2008 19:08:38 +0300 Subject: UBIFS: include to compilation Add UBIFS to Makefile and Kbuild. 
Signed-off-by: Artem Bityutskiy Signed-off-by: Adrian Hunter --- fs/Kconfig | 3 +++ fs/Makefile | 1 + fs/ubifs/Kconfig | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ubifs/Makefile | 9 +++++++ 4 files changed, 85 insertions(+) create mode 100644 fs/ubifs/Kconfig create mode 100644 fs/ubifs/Makefile (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2694648cbd1..a52cf6280b4 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1375,6 +1375,9 @@ config JFFS2_CMODE_FAVOURLZO endchoice +# UBIFS File system configuration +source "fs/ubifs/Kconfig" + config CRAMFS tristate "Compressed ROM file system support (cramfs)" depends on BLOCK diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da..fcae06aaadc 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig new file mode 100644 index 00000000000..91ceeda7e5b --- /dev/null +++ b/fs/ubifs/Kconfig @@ -0,0 +1,72 @@ +config UBIFS_FS + tristate "UBIFS file system support" + select CRC16 + select CRC32 + select CRYPTO if UBIFS_FS_ADVANCED_COMPR + select CRYPTO if UBIFS_FS_LZO + select CRYPTO if UBIFS_FS_ZLIB + select CRYPTO_LZO if UBIFS_FS_LZO + select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + depends on MTD_UBI + help + UBIFS is a file system for flash devices which works on top of UBI. + +config UBIFS_FS_XATTR + bool "Extended attributes support" + depends on UBIFS_FS + help + This option enables support of extended attributes. + +config UBIFS_FS_ADVANCED_COMPR + bool "Advanced compression options" + depends on UBIFS_FS + help + This option allows to explicitly choose which compressions, if any, + are enabled in UBIFS. Removing compressors means inbility to read + existing file systems. + + If unsure, say 'N'. + +config UBIFS_FS_LZO + bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + LZO compressor is generally faster then zlib but compresses worse. + Say 'Y' if unsure. + +config UBIFS_FS_ZLIB + bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. + +# Debugging-related stuff +config UBIFS_FS_DEBUG + bool "Enable debugging" + depends on UBIFS_FS + select DEBUG_FS + select KALLSYMS_ALL + help + This option enables UBIFS debugging. + +config UBIFS_FS_DEBUG_MSG_LVL + int "Default message level (0 = no extra messages, 3 = lots)" + depends on UBIFS_FS_DEBUG + default "0" + help + This controls the amount of debugging messages produced by UBIFS. + If reporting bugs, please try to have available a full dump of the + messages at level 1 while the misbehaviour was occurring. Level 2 + may become necessary if level 1 messages were not enough to find the + bug. Generally Level 3 should be avoided. + +config UBIFS_FS_DEBUG_CHKS + bool "Enable extra checks" + depends on UBIFS_FS_DEBUG + help + If extra checks are enabled UBIFS will check the consistency of its + internal data structures during operation. However, UBIFS performance + is dramatically slower when this option is selected especially if the + file system is large. 
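The Kconfig entries above only define the options; nothing in this hunk selects them. As a rough illustration (a sketch, not taken from the patch), a board configuration that wants UBIFS on top of UBI with extended attributes and the default compressors could carry a .config fragment along these lines:

    CONFIG_MTD_UBI=y
    CONFIG_UBIFS_FS=y
    CONFIG_UBIFS_FS_XATTR=y
    # CONFIG_UBIFS_FS_ADVANCED_COMPR is not set
    CONFIG_UBIFS_FS_LZO=y
    CONFIG_UBIFS_FS_ZLIB=y
    # CONFIG_UBIFS_FS_DEBUG is not set

With UBIFS_FS_ADVANCED_COMPR left unset, the LZO and zlib prompts are hidden and both options simply take their 'default y' values, so both compressors are built in; CRC16, CRC32 and the CRYPTO_LZO/CRYPTO_DEFLATE back ends are pulled in automatically by the select statements above.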
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile new file mode 100644 index 00000000000..80e93c35e49 --- /dev/null +++ b/fs/ubifs/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_UBIFS_FS) += ubifs.o + +ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o +ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o +ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o + +ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o +ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o -- cgit v1.2.3-70-g09d2 From b0e92aae157cbf78ede7b7dd03967b49fcb2102e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 15 Jul 2008 12:35:20 -0400 Subject: lockd: nlm_release_host() checks for NULL, caller needn't No need to check for a NULL argument twice. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/lockd/svc4proc.c | 3 +-- fs/lockd/svcproc.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 385437e3387..006a832d46f 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -58,8 +58,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - if (host) - nlm_release_host(host); + nlm_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 88379cc6e0b..fce3d703962 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -87,8 +87,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return 0; no_locks: - if (host) - nlm_release_host(host); + nlm_release_host(host); if (error) return error; return nlm_lck_denied_nolocks; -- cgit v1.2.3-70-g09d2 From 8f920d5e29f86d3425a68e1c3bc264d1f6f55112 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 15 Jul 2008 14:06:48 -0400 Subject: lockd: eliminate duplicate nlmsvc_lookup_host call from nlmsvc_testlock nlmsvc_testlock calls nlmsvc_lookup_host to find a nlm_host struct. The callers of this functions, however, call nlmsvc_retrieve_args or nlm4svc_retrieve_args, which also return a nlm_host struct. Change nlmsvc_testlock to take a host arg instead of calling nlmsvc_lookup_host itself and change the callers to pass a pointer to the nlm_host they've already found. We take a reference to host in the place where nlmsvc_testlock() previous did a new lookup, so the reference counting is unchanged from before. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/lockd/svc4proc.c | 2 +- fs/lockd/svclock.c | 12 +++--------- fs/lockd/svcproc.c | 2 +- include/linux/lockd/lockd.h | 3 ++- 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 006a832d46f..8cfb9daa7c7 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -99,7 +99,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; /* Now check for conflicting locks */ - resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie); + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 81aca859bfd..f40afb3a0e6 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -460,8 +460,8 @@ out: */ __be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_lock *lock, struct nlm_lock *conflock, - struct nlm_cookie *cookie) + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock, struct nlm_cookie *cookie) { struct nlm_block *block = NULL; int error; @@ -479,16 +479,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, if (block == NULL) { struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - struct nlm_host *host; if (conf == NULL) return nlm_granted; - /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); - if (host == NULL) { - kfree(conf); - return nlm_lck_denied_nolocks; - } + nlm_get_host(host); block = nlmsvc_create_block(rqstp, host, file, lock, cookie); if (block == NULL) { kfree(conf); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index fce3d703962..e099f589b61 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -128,7 +128,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; /* Now check for conflicting locks */ - resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie)); + resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 102d928f720..b27967034b5 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -203,7 +203,8 @@ __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_lock *, int, struct nlm_cookie *); __be32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *); __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, - struct nlm_lock *, struct nlm_lock *, struct nlm_cookie *); + struct nlm_host *, struct nlm_lock *, + struct nlm_lock *, struct nlm_cookie *); __be32 nlmsvc_cancel_blocked(struct nlm_file *, struct nlm_lock *); unsigned long nlmsvc_retry_blocked(void); void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, -- cgit v1.2.3-70-g09d2 From 6cde4de80773497d8333985b135f472eda870904 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 15 Jul 2008 14:26:17 -0400 Subject: lockd: eliminate duplicate nlmsvc_lookup_host call from nlmsvc_lock nlmsvc_lock calls nlmsvc_lookup_host to find a nlm_host struct. The callers of this function, however, call nlmsvc_retrieve_args or nlm4svc_retrieve_args, which also return a nlm_host struct. Change nlmsvc_lock to take a host arg instead of calling nlmsvc_lookup_host itself and change the callers to pass a pointer to the nlm_host they've already found. Since nlmsvc_testlock() now just uses the caller's reference, we no longer need to get or release it. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/lockd/svc4proc.c | 2 +- fs/lockd/svclock.c | 10 ++-------- fs/lockd/svcproc.c | 2 +- include/linux/lockd/lockd.h | 3 ++- 4 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 8cfb9daa7c7..189b2ce01da 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -145,7 +145,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, #endif /* Now try to lock the file */ - resp->status = nlmsvc_lock(rqstp, file, &argp->lock, + resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index f40afb3a0e6..bcf73f6e822 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -358,10 +358,10 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) */ __be32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_lock *lock, int wait, struct nlm_cookie *cookie) + struct nlm_host *host, struct nlm_lock *lock, int wait, + struct nlm_cookie *cookie) { struct nlm_block *block = NULL; - struct nlm_host *host; int error; __be32 ret; @@ -373,11 +373,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); - if (host == NULL) - return nlm_lck_denied_nolocks; - /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) @@ -450,7 +445,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block); - nlm_release_host(host); dprintk("lockd: nlmsvc_lock returned %u\n", ret); return ret; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index e099f589b61..82dc9083ba6 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -175,7 +175,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, #endif /* Now try to lock the file */ - resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock, + resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b27967034b5..f81f9dd5f14 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -200,7 +200,8 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); * Server-side lock handling */ __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, - struct nlm_lock *, int, struct nlm_cookie *); + struct nlm_host *, struct nlm_lock *, int, + struct nlm_cookie *); __be32 nlmsvc_unlock(struct nlm_file *, struct nlm_lock *); __be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, -- cgit v1.2.3-70-g09d2 From 6d7bbbbacc5202eaabbc232681cc325b22a73eeb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 15 Jul 2008 14:38:32 -0400 Subject: lockd: minor svclock.c style fixes Signed-off-by: J. 
Bruce Fields --- fs/lockd/svclock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index bcf73f6e822..51a0dea0ae8 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -129,9 +129,9 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b) { - if(a->len != b->len) + if (a->len != b->len) return 0; - if(memcmp(a->data,b->data,a->len)) + if (memcmp(a->data, b->data, a->len)) return 0; return 1; } @@ -381,7 +381,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, block = nlmsvc_lookup_block(file, lock); if (block == NULL) { block = nlmsvc_create_block(rqstp, nlm_get_host(host), file, - lock, cookie); + lock, cookie); ret = nlm_lck_denied_nolocks; if (block == NULL) goto out; @@ -412,7 +412,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, lock->fl.fl_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); - switch(error) { + switch (error) { case 0: ret = nlm_granted; goto out; @@ -880,7 +880,7 @@ nlmsvc_retry_blocked(void) if (block->b_when == NLM_NEVER) break; - if (time_after(block->b_when,jiffies)) { + if (time_after(block->b_when, jiffies)) { timeout = block->b_when - jiffies; break; } -- cgit v1.2.3-70-g09d2 From 560de0e65904db392e1c443c4bf5ee750573336b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 15 Jul 2008 15:05:45 -0400 Subject: lockd: get host reference in nlmsvc_create_block() instead of callers It may not be obvious (till you look at the definition of nlm_alloc_call()) that a function like nlmsvc_create_block() should consume a reference on success or failure, so I find it clearer if it takes the reference it needs itself. And both callers already do this immediately before the call anyway. Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 51a0dea0ae8..b8f86b73a85 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -180,6 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, struct nlm_block *block; struct nlm_rqst *call = NULL; + nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return NULL; @@ -380,8 +381,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ block = nlmsvc_lookup_block(file, lock); if (block == NULL) { - block = nlmsvc_create_block(rqstp, nlm_get_host(host), file, - lock, cookie); + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); ret = nlm_lck_denied_nolocks; if (block == NULL) goto out; @@ -476,7 +476,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, if (conf == NULL) return nlm_granted; - nlm_get_host(host); block = nlmsvc_create_block(rqstp, host, file, lock, cookie); if (block == NULL) { kfree(conf); -- cgit v1.2.3-70-g09d2 From 367c8c7bd9a2882daad6c9cb607e1db8ef781ad4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Jun 2008 18:58:14 -0400 Subject: lockd: Pass "struct sockaddr *" to new failover-by-IP function Pass a more generic socket address type to nlmsvc_unlock_all_by_ip() to allow for future support of IPv6. Also provide additional sanity checking in failover_unlock_ip() when constructing the server's IP address. As an added bonus, provide clean kerneldoc comments on related NLM interfaces which were recently added. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/lockd/svcsubs.c | 32 +++++++++++++++++++++++--------- fs/nfsd/nfsctl.c | 15 ++++++++++----- include/linux/lockd/lockd.h | 2 +- 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d1c48b539df..198b4e55b37 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -373,13 +373,16 @@ nlmsvc_free_host_resources(struct nlm_host *host) } } -/* - * Remove all locks held for clients +/** + * nlmsvc_invalidate_all - remove all locks held for clients + * + * Release all locks held by NFS clients. + * */ void nlmsvc_invalidate_all(void) { - /* Release all locks held by NFS clients. + /* * Previously, the code would call * nlmsvc_free_host_resources for each client in * turn, which is about as inefficient as it gets. @@ -396,6 +399,12 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) return sb == file->f_file->f_path.mnt->mnt_sb; } +/** + * nlmsvc_unlock_all_by_sb - release locks held on this file system + * @sb: super block + * + * Release all locks held by clients accessing this file system. + */ int nlmsvc_unlock_all_by_sb(struct super_block *sb) { @@ -409,17 +418,22 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - __be32 *server_addr = datap; - - return host->h_saddr.sin_addr.s_addr == *server_addr; + return nlm_cmp_addr(&host->h_saddr, datap); } +/** + * nlmsvc_unlock_all_by_ip - release local locks by IP address + * @server_addr: server's IP address as seen by clients + * + * Release all locks held by clients accessing this host + * via the passed in IP address. + */ int -nlmsvc_unlock_all_by_ip(__be32 server_addr) +nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr) { int ret; - ret = nlm_traverse_files(&server_addr, nlmsvc_match_ip, NULL); - return ret ? -EIO : 0; + ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL); + return ret ? 
-EIO : 0; } EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2c2eb8796c1..1955a2702e6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -310,9 +310,12 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) { - __be32 server_ip; - char *fo_path, c; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; int b1, b2, b3, b4; + char c; + char *fo_path; /* sanity check */ if (size == 0) @@ -326,11 +329,13 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) return -EINVAL; /* get ipv4 address */ - if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) + if (sscanf(fo_path, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) != 4) + return -EINVAL; + if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) return -EINVAL; - server_ip = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); + sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - return nlmsvc_unlock_all_by_ip(server_ip); + return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); } static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index f81f9dd5f14..dbb87ab282e 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -226,7 +226,7 @@ void nlmsvc_invalidate_all(void); * Cluster failover support */ int nlmsvc_unlock_all_by_sb(struct super_block *sb); -int nlmsvc_unlock_all_by_ip(__be32 server_addr); +int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { -- cgit v1.2.3-70-g09d2 From d67d1c7bf948341fd8678c8e337ec27f4b46b206 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Tue, 15 Jul 2008 12:40:22 -0500 Subject: nfs: set correct fl_len in nlmclnt_test() fcntl(F_GETLK) on an nfs client incorrectly returns the values for the conflicting lock. fl_len value is always 1. If the conflicting lock is (0, 4095) the F_GETLK request for (1024, 10) returns (0, 1), which doesn't even cover the requested range, and is quite confusing. The fix is trivial, set fl_end from the fl_end value recieved from the nfs server. Signed-off-by: Felix Blyakher Signed-off-by: "J. Bruce Fields" Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 5df517b81f3..fd7d4669776 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -430,7 +430,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) * Report the conflicting lock back to the application. 
*/ fl->fl_start = req->a_res.lock.fl.fl_start; - fl->fl_end = req->a_res.lock.fl.fl_start; + fl->fl_end = req->a_res.lock.fl.fl_end; fl->fl_type = req->a_res.lock.fl.fl_type; fl->fl_pid = 0; break; -- cgit v1.2.3-70-g09d2 From 1b83d707032a1be40a60ed0a9bd841662cc04a5d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:04 -0400 Subject: NFS: Protect inode->i_nlink updates using inode->i_lock Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b1940660502..d6ec1c85995 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -870,6 +870,14 @@ static int nfs_dentry_delete(struct dentry *dentry) } +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&inode->i_lock); +} + /* * Called when the dentry loses inode. * We use it to clean up silly-renamed files. @@ -1420,7 +1428,7 @@ static int nfs_safe_remove(struct dentry *dentry) error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); /* The VFS may want to delete this inode */ if (error == 0) - drop_nlink(inode); + nfs_drop_nlink(inode); nfs_mark_for_revalidate(inode); } else error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); @@ -1647,7 +1655,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* dentry still busy? */ goto out; } else - drop_nlink(new_inode); + nfs_drop_nlink(new_inode); go_ahead: /* -- cgit v1.2.3-70-g09d2 From a3d01454bc58b5a211ef64a7670572a40b71e682 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 12:21:19 -0400 Subject: NFS: Remove BKL requirement from attribute updates The main problem is dealing with inode->i_size: we need to set the inode->i_lock on all attribute updates, and so vmtruncate won't cut it. Make an NFS-private version of vmtruncate that has the necessary locking semantics. The result should be that the following inode attribute updates are protected by inode->i_lock nfsi->cache_validity nfsi->read_cache_jiffies nfsi->attrtimeo nfsi->attrtimeo_timestamp nfsi->change_attr nfsi->last_updated nfsi->cache_change_attribute nfsi->access_cache nfsi->access_cache_entry_lru nfsi->access_cache_inode_lru nfsi->acl_access nfsi->acl_default nfsi->nfs_page_tree nfsi->ncommit nfsi->npages nfsi->open_files nfsi->silly_list nfsi->acl nfsi->open_states inode->i_size inode->i_atime inode->i_mtime inode->i_ctime inode->i_nlink inode->i_uid inode->i_gid The following is protected by dir->i_mutex nfsi->cookieverf Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ fs/nfs/write.c | 15 ++++++++----- 2 files changed, 71 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2c23d067e2a..3adabd15477 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -388,6 +388,62 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) return error; } +/** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. 
+ */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + if (i_size_read(inode) < offset) { + unsigned long limit; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + spin_unlock(&inode->i_lock); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +} + /** * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode @@ -414,8 +470,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); - inode->i_size = attr->ia_size; - vmtruncate(inode, attr->ia_size); + nfs_vmtruncate(inode, attr->ia_size); } } @@ -829,9 +884,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; } - if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) && + if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && nfsi->npages == 0) - inode->i_size = nfs_size_to_loff_t(fattr->size); + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } } @@ -972,7 +1027,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa (fattr->valid & NFS_ATTR_WCC) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); - fattr->pre_size = inode->i_size; + fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } return nfs_post_op_update_inode(inode, fattr); @@ -1057,7 +1112,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? 
*/ if (nfsi->npages == 0 || new_isize > cur_isize) { - inode->i_size = new_isize; + i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld\n", diff --git a/fs/nfs/write.c b/fs/nfs/write.c index feca8c64876..3229e217c77 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -133,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page) static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { struct inode *inode = page->mapping->host; - loff_t end, i_size = i_size_read(inode); - pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + loff_t end, i_size; + pgoff_t end_index; + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; if (i_size > 0 && page->index < end_index) - return; + goto out; end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); if (i_size >= end) - return; - nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + goto out; i_size_write(inode, end); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); +out: + spin_unlock(&inode->i_lock); } /* A writeback failed: mark the page as bad, and invalidate the page cache */ -- cgit v1.2.3-70-g09d2 From fa6dc9dc59c3a76fd209a97c8cf37395980fb903 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:26:14 -0400 Subject: NFS: Remove attribute update related BKL references Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ---- fs/nfs/super.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3adabd15477..df23f987da6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -370,7 +370,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ~ATTR_FILE) == 0) return 0; - lock_kernel(); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) { filemap_write_and_wait(inode->i_mapping); @@ -384,7 +383,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); if (error == 0) nfs_refresh_inode(inode, &fattr); - unlock_kernel(); return error; } @@ -700,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) inode->i_sb->s_id, (long long)NFS_FILEID(inode)); nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - lock_kernel(); if (is_bad_inode(inode)) goto out_nowait; if (NFS_STALE(inode)) @@ -749,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) nfs_wake_up_inode(inode); out_nowait: - unlock_kernel(); return status; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 47cf83e917b..1b94e3650f5 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -374,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) }; int error; - lock_kernel(); - error = server->nfs_client->rpc_ops->statfs(server, fh, &res); if (error < 0) goto out_err; @@ -407,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = server->namelen; - unlock_kernel(); return 0; out_err: dprintk("%s: statfs error = %d\n", __func__, -error); - unlock_kernel(); return error; } -- cgit v1.2.3-70-g09d2 From 4d80f2ecd506d9732ad94a6da104580bb47680d6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:18 -0400 Subject: NFS: Remove the BKL from the permission checking code Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 
d6ec1c85995..73e0f9740dd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1982,8 +1982,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) } force_lookup: - lock_kernel(); - if (!NFS_PROTO(inode)->access) goto out_notsup; @@ -1993,7 +1991,6 @@ force_lookup: put_rpccred(cred); } else res = PTR_ERR(cred); - unlock_kernel(); out: dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); @@ -2002,7 +1999,6 @@ out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) res = generic_permission(inode, mask, NULL); - unlock_kernel(); goto out; } -- cgit v1.2.3-70-g09d2 From b6a2e569e2157509951c9f3f58dfa18b44ce91b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:26:16 -0400 Subject: NFS: Remove BKL usage from the write path Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 509dcb58959..f919d9a01a2 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -393,9 +393,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, pglen, PAGE_CACHE_SIZE); } - lock_kernel(); status = nfs_updatepage(file, page, offset, copied); - unlock_kernel(); unlock_page(page); page_cache_release(page); -- cgit v1.2.3-70-g09d2 From bba67e0e3f4caba2b2b90b48ed798fb0461bcb86 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:26:23 -0400 Subject: NFS: Remove BKL usage from open() All the NFSv4 stateful operations are already protected by other locks (in particular by the rpc_sequence locks. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ------ fs/nfs/file.c | 2 -- fs/nfs/nfs4proc.c | 2 -- 3 files changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 73e0f9740dd..c68ec447ace 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -139,10 +139,8 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); /* Call generic open code in order to cache credentials */ res = nfs_open(inode, filp); - unlock_kernel(); return res; } @@ -1019,9 +1017,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } /* Open the file on the server */ - lock_kernel(); res = nfs4_atomic_open(dir, dentry, nd); - unlock_kernel(); if (IS_ERR(res)) { error = PTR_ERR(res); switch (error) { @@ -1083,9 +1079,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. 
*/ - lock_kernel(); ret = nfs4_open_revalidate(dir, dentry, openflags, nd); - unlock_kernel(); out: dput(parent); if (!ret) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f919d9a01a2..9f1bed944b2 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -128,9 +128,7 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); res = nfs_open(inode, filp); - unlock_kernel(); return res; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4451287a81d..c910413eaec 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) /* Save the delegation */ memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); rcu_read_unlock(); - lock_kernel(); ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); - unlock_kernel(); if (ret != 0) goto out; ret = -EAGAIN; -- cgit v1.2.3-70-g09d2 From f1e2eda23513b68003202bddf1f84158baad8844 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:50:50 -0400 Subject: NFS: Remove the BKL from the inode creation operations nfs_instantiate() does not require the BKL, neither do the attribute updates or the RPC code. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c68ec447ace..d40e91e5c94 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1232,14 +1232,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, if ((nd->flags & LOOKUP_CREATE) != 0) open_flags = nd->intent.open.flags; - lock_kernel(); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return error; } @@ -1262,14 +1259,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - lock_kernel(); status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); if (status != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return status; } @@ -1288,15 +1282,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; - lock_kernel(); error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: d_drop(dentry); - unlock_kernel(); return error; } -- cgit v1.2.3-70-g09d2 From fc81af535e462764e17f638d542973fbef13b026 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:52:40 -0400 Subject: NFS: Remove the BKL from nfs_link() Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d40e91e5c94..5ae8ee6b298 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1549,14 +1549,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); - lock_kernel(); d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { atomic_inc(&inode->i_count); d_add(dentry, inode); } - unlock_kernel(); return error; } -- cgit v1.2.3-70-g09d2 From fc0f684c21b5d4b41dc2ec76f7c0897ac98f5b6e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:20 -0400 Subject: NFS: Remove 
BKL from NFS lookup code All dentry-related operations are already BKL-safe, since they are protected by the VFS locking. No extra locks should be needed in the NFS code. In the case of nfs_revalidate_inode(), we're only doing an attribute update (protected by the inode->i_lock). In the case of nfs_lookup(), we're instantiating a new dentry, so there should be no contention possible until after we call d_materialise_unique. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5ae8ee6b298..60da7550133 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -777,7 +777,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) struct nfs_fattr fattr; parent = dget_parent(dentry); - lock_kernel(); dir = parent->d_inode; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = dentry->d_inode; @@ -815,7 +814,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: - unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", __func__, dentry->d_parent->d_name.name, @@ -834,7 +832,6 @@ out_zap_parent: shrink_dcache_parent(dentry); } d_drop(dentry); - unlock_kernel(); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, @@ -921,8 +918,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru res = ERR_PTR(-ENOMEM); dentry->d_op = NFS_PROTO(dir)->dentry_ops; - lock_kernel(); - /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. @@ -930,7 +925,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru if (nfs_is_exclusive_create(dir, nd)) { d_instantiate(dentry, NULL); res = NULL; - goto out_unlock; + goto out; } parent = dentry->d_parent; @@ -958,8 +953,6 @@ no_entry: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unblock_sillyrename: nfs_unblock_sillyrename(parent); -out_unlock: - unlock_kernel(); out: return res; } -- cgit v1.2.3-70-g09d2 From bd9bb454b76fb6ca2d00f17313f9f9df5f5c404a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 16:09:59 -0400 Subject: NFS: Remove the BKL from the rename, rmdir and unlink operations Attribute updates are safe, and dentry operations are protected using VFS level locks. Defer removing the BKL from sillyrename until a separate patch. 
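A minimal sketch of the conversion pattern behind these dir.c patches, written as a hypothetical example_unlink() rather than code from the series: the BKL no longer wraps the whole VFS operation and is kept only around nfs_sillyrename(), the one helper not yet converted.

/*
 * Sketch only. do_protocol_unlink() is a hypothetical stand-in for the
 * protocol op; nfs_sillyrename() is the real helper from fs/nfs/unlink.c.
 */
#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/smp_lock.h>

int nfs_sillyrename(struct inode *dir, struct dentry *dentry);

static int do_protocol_unlink(struct inode *dir, struct dentry *dentry)
{
        /* hypothetical stand-in for NFS_PROTO(dir)->remove(...) */
        return 0;
}

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
        int error;

        if (atomic_read(&dentry->d_count) > 1) {
                /* sillyrename is converted by a later patch, so the BKL
                 * shrinks to just this one call for now */
                lock_kernel();
                error = nfs_sillyrename(dir, dentry);
                unlock_kernel();
                return error;
        }

        /* dentry work is covered by VFS locking and attribute updates by
         * inode->i_lock, so no BKL is needed on this path */
        return do_protocol_unlink(dir, dentry);
}

The same shrink-then-remove shape appears in the nfs_unlink() and nfs_rename() hunks below, where lock_kernel() moves from covering the whole operation to covering only the nfs_sillyrename() call.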
Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 60da7550133..68e0688904e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1297,14 +1297,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - lock_kernel(); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ if (error == 0 && dentry->d_inode != NULL) clear_nlink(dentry->d_inode); else if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); - unlock_kernel(); return error; } @@ -1429,7 +1427,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - lock_kernel(); spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) > 1) { @@ -1437,6 +1434,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); + lock_kernel(); error = nfs_sillyrename(dir, dentry); unlock_kernel(); return error; @@ -1452,7 +1450,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } else if (need_rehash) d_rehash(dentry); - unlock_kernel(); return error; } @@ -1587,7 +1584,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the rename, * we unhash the dentry and free the inode in advance. */ - lock_kernel(); if (!d_unhashed(new_dentry)) { d_drop(new_dentry); rehash = new_dentry; @@ -1621,7 +1617,9 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; /* silly-rename the existing target ... */ + lock_kernel(); err = nfs_sillyrename(new_dir, new_dentry); + unlock_kernel(); if (!err) { new_dentry = rehash = dentry; new_inode = NULL; @@ -1665,7 +1663,6 @@ out: /* new dentry created? */ if (dentry) dput(dentry); - unlock_kernel(); return error; } -- cgit v1.2.3-70-g09d2 From 52e2e8d37e01edf38ccdccc983fb13ec1456d63d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:21 -0400 Subject: NFS: Remove BKL from the sillydelete operations Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 68e0688904e..1bdc36bf178 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -884,10 +884,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { - lock_kernel(); drop_nlink(inode); nfs_complete_unlink(dentry, inode); - unlock_kernel(); } iput(inode); } @@ -1434,9 +1432,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); - lock_kernel(); error = nfs_sillyrename(dir, dentry); - unlock_kernel(); return error; } if (!d_unhashed(dentry)) { @@ -1617,9 +1613,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; /* silly-rename the existing target ... 
*/ - lock_kernel(); err = nfs_sillyrename(new_dir, new_dentry); - unlock_kernel(); if (!err) { new_dentry = rehash = dentry; new_inode = NULL; -- cgit v1.2.3-70-g09d2 From 76566991f94c206d9c5881edcaf99ba72c9e9d61 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:22 -0400 Subject: NFS: Remove BKL from the symlink code Page cache accesses are serialised using page locks, whereas attribute updates are serialised using inode->i_lock. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1bdc36bf178..e5f95029192 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1482,13 +1482,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - lock_kernel(); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - unlock_kernel(); + if (!page) return -ENOMEM; - } kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, pathlen); @@ -1503,7 +1499,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym dentry->d_name.name, symname, error); d_drop(dentry); __free_page(page); - unlock_kernel(); return error; } @@ -1521,7 +1516,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym } else __free_page(page); - unlock_kernel(); return 0; } -- cgit v1.2.3-70-g09d2 From c3cc8c019ca09767d7c9b5457d5cf8ac65085f44 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:23 -0400 Subject: NFS: Remove BKL from the readdir code Page accesses are serialised using the page locks, whereas all attribute updates are serialised using the inode->i_lock. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e5f95029192..28a238dab23 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -534,8 +534,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) (long long)filp->f_pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); - lock_kernel(); - /* * filp->f_pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have @@ -593,7 +591,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } out: nfs_unblock_sillyrename(dentry); - unlock_kernel(); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", -- cgit v1.2.3-70-g09d2 From a86dc496b764ebb1431677b38eab45310e5a2ad4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 13:37:09 -0400 Subject: SUNRPC: Remove the BKL from the callback functions Push it into those callback functions that actually need it. Note that all the NFS operations use their own locking, so don't need the BKL. Ditto for the rpcbind client. 
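A minimal sketch of the serialisation rule the symlink and readdir patches rely on (assumed example, not code from the series): cached attribute updates such as the file size are protected by inode->i_lock, and page cache accesses by the per-page locks, so the BKL adds nothing.

#include <linux/fs.h>

/* Hypothetical helper mirroring the nfs_grow_file() change earlier in this
 * log: i_size_write() requires caller-provided serialisation, supplied here
 * by inode->i_lock rather than the Big Kernel Lock. */
static void example_grow_isize(struct inode *inode, loff_t new_size)
{
        spin_lock(&inode->i_lock);
        if (new_size > i_size_read(inode))
                i_size_write(inode, new_size);
        spin_unlock(&inode->i_lock);
}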
Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 6 ++++++ fs/lockd/svc4proc.c | 2 ++ fs/lockd/svclock.c | 7 ++++++- fs/lockd/svcproc.c | 2 ++ net/sunrpc/sched.c | 9 +-------- 5 files changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index fd7d4669776..1f6dc518505 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call) static void nlmclnt_rpc_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static int nlm_wait_on_grace(wait_queue_head_t *queue) @@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) die: return; retry_rebind: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -788,7 +792,9 @@ retry_cancel: /* Don't ever retry more than 3 times */ if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) goto die; + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 385437e3387..2e27176ff42 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data) static void nlm4svc_callback_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static const struct rpc_call_ops nlm4svc_callback_ops = { diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 81aca859bfd..56a08ab9a4c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) dprintk("lockd: GRANT_MSG RPC callback\n"); + lock_kernel(); /* if the block is not on a list at this point then it has * been invalidated. Don't try to requeue it. * @@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) * for nlm_blocked? */ if (list_empty(&block->b_list)) - return; + goto out; /* Technically, we should down the file semaphore here. 
Since we * move the block towards the head of the queue only, no harm @@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) } nlmsvc_insert_block(block, timeout); svc_wake_up(block->b_daemon); +out: + unlock_kernel(); } static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; + lock_kernel(); nlmsvc_release_block(call->a_block); + unlock_kernel(); } static const struct rpc_call_ops nlmsvc_grant_ops = { diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 88379cc6e0b..ce6952b50a7 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data) static void nlmsvc_callback_release(void *data) { + lock_kernel(); nlm_release_call(data); + unlock_kernel(); } static const struct rpc_call_ops nlmsvc_callback_ops = { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6288af05c20..385f427beda 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -576,9 +576,7 @@ EXPORT_SYMBOL_GPL(rpc_delay); */ static void rpc_prepare_task(struct rpc_task *task) { - lock_kernel(); task->tk_ops->rpc_call_prepare(task, task->tk_calldata); - unlock_kernel(); } /* @@ -588,9 +586,7 @@ void rpc_exit_task(struct rpc_task *task) { task->tk_action = NULL; if (task->tk_ops->rpc_call_done != NULL) { - lock_kernel(); task->tk_ops->rpc_call_done(task, task->tk_calldata); - unlock_kernel(); if (task->tk_action != NULL) { WARN_ON(RPC_ASSASSINATED(task)); /* Always release the RPC slot and buffer memory */ @@ -602,11 +598,8 @@ EXPORT_SYMBOL_GPL(rpc_exit_task); void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata) { - if (ops->rpc_release != NULL) { - lock_kernel(); + if (ops->rpc_release != NULL) ops->rpc_release(calldata); - unlock_kernel(); - } } /* -- cgit v1.2.3-70-g09d2 From f839c4c1991cc9b580ae38f98f54554938a7f49c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jun 2008 15:44:26 -0400 Subject: NFSv4: Remove BKL from the nfsv4 state recovery Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 856a8934f61..401ef8b28f9 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -940,7 +940,6 @@ static int reclaimer(void *ptr) allow_signal(SIGKILL); /* Ensure exclusive access to NFSv4 state */ - lock_kernel(); down_write(&clp->cl_sem); /* Are there any NFS mounts out there? 
*/ if (list_empty(&clp->cl_superblocks)) @@ -1000,7 +999,6 @@ restart_loop: nfs_delegation_reap_unclaimed(clp); out: up_write(&clp->cl_sem); - unlock_kernel(); if (status == -NFS4ERR_CB_PATH_DOWN) nfs_handle_cb_pathdown(clp); nfs4_clear_recover_bit(clp); -- cgit v1.2.3-70-g09d2 From 3c3622dcb64c76c9abd2e468f802db9ba523421c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 16 Jul 2008 08:52:00 -0500 Subject: Fix compile issues in fs/compat_ioctl.c when CONFIG_BLOCK is disabled Fix fs/compat_ioctl.c to handle CONFIG_BLOCK=n, CONFIG_SCSI=n to avoid build errors: In file included from include/scsi/scsi.h:12, from fs/compat_ioctl.c:71: include/scsi/scsi_cmnd.h:27:25: warning: "BLK_MAX_CDB" is not defined include/scsi/scsi_cmnd.h:28:3: error: #error MAX_COMMAND_SIZE can not be bigger than BLK_MAX_CDB In file included from include/scsi/scsi.h:12, from fs/compat_ioctl.c:71: include/scsi/scsi_cmnd.h: In function 'scsi_bidi_cmnd': include/scsi/scsi_cmnd.h:182: error: implicit declaration of function 'blk_bidi_rq' include/scsi/scsi_cmnd.h:183: error: dereferencing pointer to incomplete type include/scsi/scsi_cmnd.h: In function 'scsi_in': include/scsi/scsi_cmnd.h:189: error: dereferencing pointer to incomplete type Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 97dba0d9234..c54eaab71a1 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -69,9 +69,11 @@ #include #include +#ifdef CONFIG_BLOCK #include #include #include +#endif #include #include @@ -2024,6 +2026,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP) COMPATIBLE_IOCTL(PIO_UNISCRNMAP) COMPATIBLE_IOCTL(PIO_FONTRESET) COMPATIBLE_IOCTL(PIO_UNIMAPCLR) +#ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) @@ -2033,6 +2036,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) +#endif /* Big T */ COMPATIBLE_IOCTL(TUNSETNOCSUM) COMPATIBLE_IOCTL(TUNSETDEBUG) @@ -2103,6 +2107,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN) COMPATIBLE_IOCTL(SIOCSIFVLAN) COMPATIBLE_IOCTL(SIOCBRADDBR) COMPATIBLE_IOCTL(SIOCBRDELBR) +#ifdef CONFIG_BLOCK /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) @@ -2127,6 +2132,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET) COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) +#endif /* PPP stuff */ COMPATIBLE_IOCTL(PPPIOCGFLAGS) COMPATIBLE_IOCTL(PPPIOCSFLAGS) -- cgit v1.2.3-70-g09d2 From c0420ad2ca514551ca086510b0e7d17a05c70492 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 30 Jun 2008 18:45:45 +0800 Subject: [PATCH] ocfs2: fix oops in mmap_truncate testing This patch fixes a mmap_truncate bug which was found by ocfs2 test suite. In an ocfs2 cluster more than 1 node, run program mmap_truncate, which races mmap writes and truncates from multiple processes. While the test is running, a stat from another node forces writeout, causing an oops in ocfs2_get_block() because it sees a buffer to write which isn't allocated. This patch fixed the bug by clear dirty and uptodate bits in buffer, leave the buffer unmapped and return. Fix is suggested by Mark Fasheh, and I code up the patch. 
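Sketched below, assuming a hypothetical helper, is the shape of the fix just described; the real change sits inline in the ocfs2_get_block() hunk that follows.

#include <linux/buffer_head.h>

/* When a racing truncate has removed the cluster behind a dirty mmapped
 * page, drop the buffer's dirty and uptodate state and leave it unmapped,
 * so writeback treats it as a hole instead of oopsing on an unallocated
 * block.  treat_buffer_as_hole() is illustrative only. */
static void treat_buffer_as_hole(struct buffer_head *bh)
{
        clear_buffer_dirty(bh);
        clear_buffer_uptodate(bh);
        /* intentionally no map_bh(): the block no longer exists */
}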
Signed-off-by: Coly Li Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 17964c0505a..1db080135c6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -174,10 +174,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New * allows block_prepare_write() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. */ - mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), - "ino %lu, iblock %llu\n", inode->i_ino, - (unsigned long long)iblock); + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } /* Treat the unwritten extent as a hole for zeroing purposes. */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) -- cgit v1.2.3-70-g09d2 From f89ab8619e5320cc9c2576f5f8dcbaf6c0ba3950 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 17 Jul 2008 14:53:48 -0700 Subject: Revert "configfs: Allow ->make_item() and ->make_group() to return detailed errors." This reverts commit 11c3b79218390a139f2d474ee1e983a672d5839a. The code will move to PTR_ERR(). Signed-off-by: Joel Becker --- Documentation/filesystems/configfs/configfs.txt | 10 ++--- .../filesystems/configfs/configfs_example.c | 14 +++---- drivers/net/netconsole.c | 10 ++--- fs/configfs/dir.c | 13 ++++--- fs/dlm/config.c | 45 ++++++++-------------- fs/ocfs2/cluster/heartbeat.c | 17 ++++---- fs/ocfs2/cluster/nodemanager.c | 45 ++++++++-------------- include/linux/configfs.h | 4 +- 8 files changed, 64 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index 15838d706ea..44c97e6accb 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -233,12 +233,10 @@ accomplished via the group operations specified on the group's config_item_type. struct configfs_group_operations { - int (*make_item)(struct config_group *group, - const char *name, - struct config_item **new_item); - int (*make_group)(struct config_group *group, - const char *name, - struct config_group **new_group); + struct config_item *(*make_item)(struct config_group *group, + const char *name); + struct config_group *(*make_group)(struct config_group *group, + const char *name); int (*commit_item)(struct config_item *item); void (*disconnect_notify)(struct config_group *group, struct config_item *item); diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index 0b422acd470..25151fd5c2c 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -273,13 +273,13 @@ static inline struct simple_children *to_simple_children(struct config_item *ite return item ? 
container_of(to_config_group(item), struct simple_children, group) : NULL; } -static int simple_children_make_item(struct config_group *group, const char *name, struct config_item **new_item) +static struct config_item *simple_children_make_item(struct config_group *group, const char *name) { struct simple_child *simple_child; simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); if (!simple_child) - return -ENOMEM; + return NULL; config_item_init_type_name(&simple_child->item, name, @@ -287,8 +287,7 @@ static int simple_children_make_item(struct config_group *group, const char *nam simple_child->storeme = 0; - *new_item = &simple_child->item; - return 0; + return &simple_child->item; } static struct configfs_attribute simple_children_attr_description = { @@ -360,21 +359,20 @@ static struct configfs_subsystem simple_children_subsys = { * children of its own. */ -static int group_children_make_group(struct config_group *group, const char *name, struct config_group **new_group) +static struct config_group *group_children_make_group(struct config_group *group, const char *name) { struct simple_children *simple_children; simple_children = kzalloc(sizeof(struct simple_children), GFP_KERNEL); if (!simple_children) - return -ENOMEM; + return NULL; config_group_init_type_name(&simple_children->group, name, &simple_children_type); - *new_group = &simple_children->group; - return 0; + return &simple_children->group; } static struct configfs_attribute group_children_attr_description = { diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 387a1339501..665341e4305 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -585,9 +585,8 @@ static struct config_item_type netconsole_target_type = { * Group operations and type for netconsole_subsys. */ -static int make_netconsole_target(struct config_group *group, - const char *name, - struct config_item **new_item) +static struct config_item *make_netconsole_target(struct config_group *group, + const char *name) { unsigned long flags; struct netconsole_target *nt; @@ -599,7 +598,7 @@ static int make_netconsole_target(struct config_group *group, nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) { printk(KERN_ERR "netconsole: failed to allocate memory\n"); - return -ENOMEM; + return NULL; } nt->np.name = "netconsole"; @@ -616,8 +615,7 @@ static int make_netconsole_target(struct config_group *group, list_add(&nt->list, &target_list); spin_unlock_irqrestore(&target_list_lock, flags); - *new_item = &nt->item; - return 0; + return &nt->item; } static void drop_netconsole_target(struct config_group *group, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 0e64312a084..614e382a604 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1073,24 +1073,25 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) group = NULL; item = NULL; if (type->ct_group_ops->make_group) { - ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group); - if (!ret) { + group = type->ct_group_ops->make_group(to_config_group(parent_item), name); + if (group) { link_group(to_config_group(parent_item), group); item = &group->cg_item; } } else { - ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item); - if (!ret) + item = type->ct_group_ops->make_item(to_config_group(parent_item), name); + if (item) link_obj(parent_item, item); } mutex_unlock(&subsys->su_mutex); kfree(name); - if (ret) { + if (!item) { /* - * If ret != 0, then link_obj() was never called. 
+ * If item == NULL, then link_obj() was never called. * There are no extra references to clean up. */ + ret = -ENOMEM; goto out_put; } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 492d8caaaf2..eac23bd288b 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -41,20 +41,16 @@ struct comm; struct nodes; struct node; -static int make_cluster(struct config_group *, const char *, - struct config_group **); +static struct config_group *make_cluster(struct config_group *, const char *); static void drop_cluster(struct config_group *, struct config_item *); static void release_cluster(struct config_item *); -static int make_space(struct config_group *, const char *, - struct config_group **); +static struct config_group *make_space(struct config_group *, const char *); static void drop_space(struct config_group *, struct config_item *); static void release_space(struct config_item *); -static int make_comm(struct config_group *, const char *, - struct config_item **); +static struct config_item *make_comm(struct config_group *, const char *); static void drop_comm(struct config_group *, struct config_item *); static void release_comm(struct config_item *); -static int make_node(struct config_group *, const char *, - struct config_item **); +static struct config_item *make_node(struct config_group *, const char *); static void drop_node(struct config_group *, struct config_item *); static void release_node(struct config_item *); @@ -396,8 +392,8 @@ static struct node *to_node(struct config_item *i) return i ? container_of(i, struct node, item) : NULL; } -static int make_cluster(struct config_group *g, const char *name, - struct config_group **new_g) +static struct config_group *make_cluster(struct config_group *g, + const char *name) { struct cluster *cl = NULL; struct spaces *sps = NULL; @@ -435,15 +431,14 @@ static int make_cluster(struct config_group *g, const char *name, space_list = &sps->ss_group; comm_list = &cms->cs_group; - *new_g = &cl->group; - return 0; + return &cl->group; fail: kfree(cl); kfree(gps); kfree(sps); kfree(cms); - return -ENOMEM; + return NULL; } static void drop_cluster(struct config_group *g, struct config_item *i) @@ -471,8 +466,7 @@ static void release_cluster(struct config_item *i) kfree(cl); } -static int make_space(struct config_group *g, const char *name, - struct config_group **new_g) +static struct config_group *make_space(struct config_group *g, const char *name) { struct space *sp = NULL; struct nodes *nds = NULL; @@ -495,14 +489,13 @@ static int make_space(struct config_group *g, const char *name, INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; - *new_g = &sp->group; - return 0; + return &sp->group; fail: kfree(sp); kfree(gps); kfree(nds); - return -ENOMEM; + return NULL; } static void drop_space(struct config_group *g, struct config_item *i) @@ -529,21 +522,19 @@ static void release_space(struct config_item *i) kfree(sp); } -static int make_comm(struct config_group *g, const char *name, - struct config_item **new_i) +static struct config_item *make_comm(struct config_group *g, const char *name) { struct comm *cm; cm = kzalloc(sizeof(struct comm), GFP_KERNEL); if (!cm) - return -ENOMEM; + return NULL; config_item_init_type_name(&cm->item, name, &comm_type); cm->nodeid = -1; cm->local = 0; cm->addr_count = 0; - *new_i = &cm->item; - return 0; + return &cm->item; } static void drop_comm(struct config_group *g, struct config_item *i) @@ -563,15 +554,14 @@ static void release_comm(struct config_item *i) kfree(cm); } 
-static int make_node(struct config_group *g, const char *name, - struct config_item **new_i) +static struct config_item *make_node(struct config_group *g, const char *name) { struct space *sp = to_space(g->cg_item.ci_parent); struct node *nd; nd = kzalloc(sizeof(struct node), GFP_KERNEL); if (!nd) - return -ENOMEM; + return NULL; config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = -1; @@ -583,8 +573,7 @@ static int make_node(struct config_group *g, const char *name, sp->members_count++; mutex_unlock(&sp->members_lock); - *new_i = &nd->item; - return 0; + return &nd->item; } static void drop_node(struct config_group *g, struct config_item *i) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 443d108211a..f02ccb34604 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1489,28 +1489,25 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static int o2hb_heartbeat_group_make_item(struct config_group *group, - const char *name, - struct config_item **new_item) +static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name) { struct o2hb_region *reg = NULL; - int ret = 0; + struct config_item *ret = NULL; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); - if (reg == NULL) { - ret = -ENOMEM; - goto out; - } + if (reg == NULL) + goto out; /* ENOMEM */ config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - *new_item = ®->hr_item; + ret = ®->hr_item; spin_lock(&o2hb_live_lock); list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); out: - if (ret) + if (ret == NULL) kfree(reg); return ret; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index b364b7052e4..cfdb08b484e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -644,32 +644,27 @@ out: return ret; } -static int o2nm_node_group_make_item(struct config_group *group, - const char *name, - struct config_item **new_item) +static struct config_item *o2nm_node_group_make_item(struct config_group *group, + const char *name) { struct o2nm_node *node = NULL; - int ret = 0; + struct config_item *ret = NULL; - if (strlen(name) > O2NM_MAX_NAME_LEN) { - ret = -ENAMETOOLONG; - goto out; - } + if (strlen(name) > O2NM_MAX_NAME_LEN) + goto out; /* ENAMETOOLONG */ node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); - if (node == NULL) { - ret = -ENOMEM; - goto out; - } + if (node == NULL) + goto out; /* ENOMEM */ strcpy(node->nd_name, name); /* use item.ci_namebuf instead? 
*/ config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); spin_lock_init(&node->nd_lock); - *new_item = &node->nd_item; + ret = &node->nd_item; out: - if (ret) + if (ret == NULL) kfree(node); return ret; @@ -756,31 +751,25 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro } #endif -static int o2nm_cluster_group_make_group(struct config_group *group, - const char *name, - struct config_group **new_group) +static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, + const char *name) { struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; - struct config_group *o2hb_group = NULL; + struct config_group *o2hb_group = NULL, *ret = NULL; void *defs = NULL; - int ret = 0; /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ - if (o2nm_single_cluster) { - ret = -ENOSPC; - goto out; - } + if (o2nm_single_cluster) + goto out; /* ENOSPC */ cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) { - ret = -ENOMEM; + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) goto out; - } config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); @@ -797,11 +786,11 @@ static int o2nm_cluster_group_make_group(struct config_group *group, cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; - *new_group = &cluster->cl_group; + ret = &cluster->cl_group; o2nm_single_cluster = cluster; out: - if (ret) { + if (ret == NULL) { kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 0488f937634..3ae65b1bf90 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -165,8 +165,8 @@ struct configfs_item_operations { }; struct configfs_group_operations { - int (*make_item)(struct config_group *group, const char *name, struct config_item **new_item); - int (*make_group)(struct config_group *group, const char *name, struct config_group **new_group); + struct config_item *(*make_item)(struct config_group *group, const char *name); + struct config_group *(*make_group)(struct config_group *group, const char *name); int (*commit_item)(struct config_item *item); void (*disconnect_notify)(struct config_group *group, struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); -- cgit v1.2.3-70-g09d2 From a6795e9ebb420d87af43789174689af0d66d1d35 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Thu, 17 Jul 2008 15:21:29 -0700 Subject: configfs: Allow ->make_item() and ->make_group() to return detailed errors. The configfs operations ->make_item() and ->make_group() currently return a new item/group. A return of NULL signifies an error. Because of this, -ENOMEM is the only return code bubbled up the stack. Multiple folks have requested the ability to return specific error codes when these operations fail. This patch adds that ability by changing the ->make_item/group() ops to return ERR_PTR() values. These errors are bubbled up appropriately. NULL returns are changed to -ENOMEM for compatibility. Also updated are the in-kernel users of configfs. This is a rework of reverted commit 11c3b79218390a139f2d474ee1e983a672d5839a. 
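An assumed example of the new convention (not taken from the patch): a ->make_item() handler now reports failures with ERR_PTR() codes, which configfs_mkdir() bubbles up. struct example_item, example_item_type and EXAMPLE_MAX_NAME are hypothetical names.

#include <linux/configfs.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

#define EXAMPLE_MAX_NAME 63

struct example_item {
        struct config_item item;
};

static struct config_item_type example_item_type;  /* ct_owner/ct_item_ops assumed elsewhere */

static struct config_item *example_make_item(struct config_group *group,
                                             const char *name)
{
        struct example_item *it;

        if (strlen(name) > EXAMPLE_MAX_NAME)
                return ERR_PTR(-ENAMETOOLONG);  /* detailed error, not just NULL */

        it = kzalloc(sizeof(*it), GFP_KERNEL);
        if (!it)
                return ERR_PTR(-ENOMEM);        /* old NULL return maps to this */

        config_item_init_type_name(&it->item, name, &example_item_type);
        return &it->item;
}

A NULL return from an unconverted handler is still tolerated and treated as -ENOMEM, as the configfs_mkdir() hunk below shows.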
Signed-off-by: Joel Becker --- .../filesystems/configfs/configfs_example.c | 4 ++-- drivers/net/netconsole.c | 2 +- fs/configfs/dir.c | 25 +++++++++++++--------- fs/dlm/config.c | 8 +++---- fs/ocfs2/cluster/heartbeat.c | 10 ++------- fs/ocfs2/cluster/nodemanager.c | 16 +++++--------- include/linux/configfs.h | 3 ++- 7 files changed, 31 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index 25151fd5c2c..03964879170 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -279,7 +279,7 @@ static struct config_item *simple_children_make_item(struct config_group *group, simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); if (!simple_child) - return NULL; + return ERR_PTR(-ENOMEM); config_item_init_type_name(&simple_child->item, name, @@ -366,7 +366,7 @@ static struct config_group *group_children_make_group(struct config_group *group simple_children = kzalloc(sizeof(struct simple_children), GFP_KERNEL); if (!simple_children) - return NULL; + return ERR_PTR(-ENOMEM); config_group_init_type_name(&simple_children->group, name, diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 665341e4305..e13966bb5f7 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -598,7 +598,7 @@ static struct config_item *make_netconsole_target(struct config_group *group, nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) { printk(KERN_ERR "netconsole: failed to allocate memory\n"); - return NULL; + return ERR_PTR(-ENOMEM); } nt->np.name = "netconsole"; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 614e382a604..179589be063 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1027,9 +1027,10 @@ EXPORT_SYMBOL(configfs_undepend_item); static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int ret, module_got = 0; - struct config_group *group; - struct config_item *item; + int ret = 0; + int module_got = 0; + struct config_group *group = NULL; + struct config_item *item = NULL; struct config_item *parent_item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; @@ -1070,28 +1071,32 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); mutex_lock(&subsys->su_mutex); - group = NULL; - item = NULL; if (type->ct_group_ops->make_group) { group = type->ct_group_ops->make_group(to_config_group(parent_item), name); - if (group) { + if (!group) + group = ERR_PTR(-ENOMEM); + if (!IS_ERR(group)) { link_group(to_config_group(parent_item), group); item = &group->cg_item; - } + } else + ret = PTR_ERR(group); } else { item = type->ct_group_ops->make_item(to_config_group(parent_item), name); - if (item) + if (!item) + item = ERR_PTR(-ENOMEM); + if (!IS_ERR(item)) link_obj(parent_item, item); + else + ret = PTR_ERR(item); } mutex_unlock(&subsys->su_mutex); kfree(name); - if (!item) { + if (ret) { /* * If item == NULL, then link_obj() was never called. * There are no extra references to clean up. 
*/ - ret = -ENOMEM; goto out_put; } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index eac23bd288b..c4e7d721bd8 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -438,7 +438,7 @@ static struct config_group *make_cluster(struct config_group *g, kfree(gps); kfree(sps); kfree(cms); - return NULL; + return ERR_PTR(-ENOMEM); } static void drop_cluster(struct config_group *g, struct config_item *i) @@ -495,7 +495,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) kfree(sp); kfree(gps); kfree(nds); - return NULL; + return ERR_PTR(-ENOMEM); } static void drop_space(struct config_group *g, struct config_item *i) @@ -528,7 +528,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) cm = kzalloc(sizeof(struct comm), GFP_KERNEL); if (!cm) - return NULL; + return ERR_PTR(-ENOMEM); config_item_init_type_name(&cm->item, name, &comm_type); cm->nodeid = -1; @@ -561,7 +561,7 @@ static struct config_item *make_node(struct config_group *g, const char *name) nd = kzalloc(sizeof(struct node), GFP_KERNEL); if (!nd) - return NULL; + return ERR_PTR(-ENOMEM); config_item_init_type_name(&nd->item, name, &node_type); nd->nodeid = -1; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f02ccb34604..7dce1612553 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1493,24 +1493,18 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g const char *name) { struct o2hb_region *reg = NULL; - struct config_item *ret = NULL; reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); if (reg == NULL) - goto out; /* ENOMEM */ + return ERR_PTR(-ENOMEM); config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - ret = ®->hr_item; - spin_lock(&o2hb_live_lock); list_add_tail(®->hr_all_item, &o2hb_all_regions); spin_unlock(&o2hb_live_lock); -out: - if (ret == NULL) - kfree(reg); - return ret; + return ®->hr_item; } static void o2hb_heartbeat_group_drop_item(struct config_group *group, diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cfdb08b484e..816a3f61330 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -648,26 +648,19 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) { struct o2nm_node *node = NULL; - struct config_item *ret = NULL; if (strlen(name) > O2NM_MAX_NAME_LEN) - goto out; /* ENAMETOOLONG */ + return ERR_PTR(-ENAMETOOLONG); node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL); if (node == NULL) - goto out; /* ENOMEM */ + return ERR_PTR(-ENOMEM); strcpy(node->nd_name, name); /* use item.ci_namebuf instead? 
*/ config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); spin_lock_init(&node->nd_lock); - ret = &node->nd_item; - -out: - if (ret == NULL) - kfree(node); - - return ret; + return &node->nd_item; } static void o2nm_node_group_drop_item(struct config_group *group, @@ -762,7 +755,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ if (o2nm_single_cluster) - goto out; /* ENOSPC */ + return ERR_PTR(-ENOSPC); cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); @@ -795,6 +788,7 @@ out: kfree(ns); o2hb_free_hb_set(o2hb_group); kfree(defs); + ret = ERR_PTR(-ENOMEM); } return ret; diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 3ae65b1bf90..d62c19ff041 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -148,7 +148,8 @@ struct configfs_attribute { * items. If the item is a group, it may support mkdir(2). * Groups supply one of make_group() and make_item(). If the * group supports make_group(), one can create group children. If it - * supports make_item(), one can create config_item children. If it has + * supports make_item(), one can create config_item children. make_group() + * and make_item() return ERR_PTR() on errors. If it has * default_groups on group->default_groups, it has automatically created * group children. default_groups may coexist alongsize make_group() or * make_item(), but if the group wishes to have only default_groups -- cgit v1.2.3-70-g09d2 From de05c557b24c7dffc6d392e3db120cf11c9f6ae7 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:07:21 -0700 Subject: proc: consolidate per-net single_open callers There are already 7 of them - time to kill some duplicate code. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- fs/proc/proc_net.c | 24 +++++++++++++++++++ include/linux/seq_file_net.h | 2 ++ net/ipv4/fib_trie.c | 13 +--------- net/ipv4/proc.c | 57 +++----------------------------------------- net/ipv6/proc.c | 19 +-------------- net/ipv6/route.c | 26 ++------------------ 6 files changed, 33 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 83f357b30d7..ab232ad2e6f 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -51,6 +51,30 @@ int seq_open_net(struct inode *ino, struct file *f, } EXPORT_SYMBOL_GPL(seq_open_net); +int single_open_net(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} +EXPORT_SYMBOL_GPL(single_open_net); + int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq; diff --git a/include/linux/seq_file_net.h b/include/linux/seq_file_net.h index 4ac52542a56..87dcc0ecf6e 100644 --- a/include/linux/seq_file_net.h +++ b/include/linux/seq_file_net.h @@ -14,6 +14,8 @@ struct seq_net_private { int seq_open_net(struct inode *, struct file *, const struct seq_operations *, int); +int single_open_net(struct inode *, struct file *file, + int (*show)(struct seq_file *, void *)); int seq_release_net(struct inode *, struct file *); static inline struct net *seq_file_net(struct seq_file *seq) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f155a66d6eb..6009df238ed 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2251,18 +2251,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) static int fib_triestat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - net = get_proc_net(inode); - if (net == NULL) - return -ENXIO; - err = single_open(file, fib_triestat_seq_show, net); - if (err < 0) { - put_net(net); - return err; - } - return 0; + return single_open_net(inode, file, fib_triestat_seq_show); } static int fib_triestat_seq_release(struct inode *ino, struct file *f) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 554390431a4..daf5d3c80ce 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -71,24 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) static int sockstat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, sockstat_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; + return single_open_net(inode, file, sockstat_seq_show); } static int sockstat_seq_release(struct inode *inode, struct file *file) @@ -397,24 +380,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) static int snmp_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, snmp_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; + return single_open_net(inode, file, snmp_seq_show); } static int snmp_seq_release(struct inode *inode, struct file *file) @@ -469,24 +435,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) static int 
netstat_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, netstat_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; + return single_open_net(inode, file, netstat_seq_show); } static int netstat_seq_release(struct inode *inode, struct file *file) diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index cbc7e514d3e..29c5a79444c 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -183,24 +183,7 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) static int sockstat6_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, sockstat6_seq_show, net); - if (err < 0) - goto err_open; - - return 0; - -err_open: - put_net(net); -err_net: - return err; + return single_open_net(inode, file, sockstat6_seq_show); } static int sockstat6_seq_release(struct inode *inode, struct file *file) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5d6c166dfbb..fb7ff8f0c6d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2416,18 +2416,7 @@ static int ipv6_route_show(struct seq_file *m, void *v) static int ipv6_route_open(struct inode *inode, struct file *file) { - int err; - struct net *net = get_proc_net(inode); - if (!net) - return -ENXIO; - - err = single_open(file, ipv6_route_show, net); - if (err < 0) { - put_net(net); - return err; - } - - return 0; + return single_open_net(inode, file, ipv6_route_show); } static int ipv6_route_release(struct inode *inode, struct file *file) @@ -2463,18 +2452,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) static int rt6_stats_seq_open(struct inode *inode, struct file *file) { - int err; - struct net *net = get_proc_net(inode); - if (!net) - return -ENXIO; - - err = single_open(file, rt6_stats_seq_show, net); - if (err < 0) { - put_net(net); - return err; - } - - return 0; + return single_open_net(inode, file, rt6_stats_seq_show); } static int rt6_stats_seq_release(struct inode *inode, struct file *file) -- cgit v1.2.3-70-g09d2 From b6fcbdb4f283f7ba67cec3cda6be23da8e959031 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 18 Jul 2008 04:07:44 -0700 Subject: proc: consolidate per-net single-release callers They are symmetrical to single_open ones :) Signed-off-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- fs/proc/proc_net.c | 8 ++++++++ include/linux/seq_file_net.h | 1 + net/ipv4/fib_trie.c | 9 +-------- net/ipv4/proc.c | 30 +++--------------------------- net/ipv6/proc.c | 10 +--------- net/ipv6/route.c | 20 ++------------------ 6 files changed, 16 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index ab232ad2e6f..b224a28e0c1 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -87,6 +87,14 @@ int seq_release_net(struct inode *ino, struct file *f) } EXPORT_SYMBOL_GPL(seq_release_net); +int single_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); +} +EXPORT_SYMBOL_GPL(single_release_net); + static struct net *get_proc_task_net(struct inode *dir) { struct task_struct *task; diff --git a/include/linux/seq_file_net.h b/include/linux/seq_file_net.h index 87dcc0ecf6e..32c89bbe24a 100644 --- a/include/linux/seq_file_net.h +++ b/include/linux/seq_file_net.h @@ -17,6 +17,7 @@ int seq_open_net(struct inode *, struct file *, int single_open_net(struct inode *, struct file *file, int (*show)(struct seq_file *, void *)); int seq_release_net(struct inode *, struct file *); +int single_release_net(struct inode *, struct file *); static inline struct net *seq_file_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6009df238ed..5cb72786a8a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2254,19 +2254,12 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, fib_triestat_seq_show); } -static int fib_triestat_seq_release(struct inode *ino, struct file *f) -{ - struct seq_file *seq = f->private_data; - put_net(seq->private); - return single_release(ino, f); -} - static const struct file_operations fib_triestat_fops = { .owner = THIS_MODULE, .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = fib_triestat_seq_release, + .release = single_release_net, }; static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index daf5d3c80ce..834356ea99d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -74,20 +74,12 @@ static int sockstat_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, sockstat_seq_show); } -static int sockstat_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); -} - static const struct file_operations sockstat_seq_fops = { .owner = THIS_MODULE, .open = sockstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = sockstat_seq_release, + .release = single_release_net, }; /* snmp items */ @@ -383,20 +375,12 @@ static int snmp_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, snmp_seq_show); } -static int snmp_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); -} - static const struct file_operations snmp_seq_fops = { .owner = THIS_MODULE, .open = snmp_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = snmp_seq_release, + .release = single_release_net, }; @@ -438,20 +422,12 @@ static int netstat_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, 
file, netstat_seq_show); } -static int netstat_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); -} - static const struct file_operations netstat_seq_fops = { .owner = THIS_MODULE, .open = netstat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = netstat_seq_release, + .release = single_release_net, }; static __net_init int ip_proc_init_net(struct net *net) diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 29c5a79444c..70940b3654a 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -186,20 +186,12 @@ static int sockstat6_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, sockstat6_seq_show); } -static int sockstat6_seq_release(struct inode *inode, struct file *file) -{ - struct net *net = ((struct seq_file *)file->private_data)->private; - - put_net(net); - return single_release(inode, file); -} - static const struct file_operations sockstat6_seq_fops = { .owner = THIS_MODULE, .open = sockstat6_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = sockstat6_seq_release, + .release = single_release_net, }; static int snmp6_seq_open(struct inode *inode, struct file *file) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fb7ff8f0c6d..cb8a51271b6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2419,20 +2419,12 @@ static int ipv6_route_open(struct inode *inode, struct file *file) return single_open_net(inode, file, ipv6_route_show); } -static int ipv6_route_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct net *net = seq->private; - put_net(net); - return single_release(inode, file); -} - static const struct file_operations ipv6_route_proc_fops = { .owner = THIS_MODULE, .open = ipv6_route_open, .read = seq_read, .llseek = seq_lseek, - .release = ipv6_route_release, + .release = single_release_net, }; static int rt6_stats_seq_show(struct seq_file *seq, void *v) @@ -2455,20 +2447,12 @@ static int rt6_stats_seq_open(struct inode *inode, struct file *file) return single_open_net(inode, file, rt6_stats_seq_show); } -static int rt6_stats_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct net *net = (struct net *)seq->private; - put_net(net); - return single_release(inode, file); -} - static const struct file_operations rt6_stats_seq_fops = { .owner = THIS_MODULE, .open = rt6_stats_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = rt6_stats_seq_release, + .release = single_release_net, }; #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3-70-g09d2 From ad1060c89cfe451de849373d98e42fad58dd25ae Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 18 Jul 2008 15:04:16 -0400 Subject: nfsd: Use C99 initializers in fs/nfsd/nfs4xdr.c Thanks to problem report and original patch from Harvey Harrison. Signed-off-by: J. 
Bruce Fields Cc: Harvey Harrison Cc: Benny Halevy --- fs/nfsd/nfs4xdr.c | 148 +++++++++++++++++++++++++++--------------------------- 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9b6a9bafc6b..d4b9d09a668 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1000,43 +1000,43 @@ nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); static nfsd4_dec nfsd4_dec_ops[] = { - [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, - [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, - [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, - [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, - [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, - [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, - [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, - [OP_LINK] (nfsd4_dec)nfsd4_decode_link, - [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, - [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, - [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, - [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, - [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, - [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, - [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, - [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_open_confirm, - [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, - [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, - [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, - [OP_READ] (nfsd4_dec)nfsd4_decode_read, - [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, - [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, - [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, - [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, - [OP_RENEW] (nfsd4_dec)nfsd4_decode_renew, - [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, - [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, - [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, - [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, - [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_setclientid, - [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_setclientid_confirm, - [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, - [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, - [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_release_lockowner, + [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] = (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, + 
[OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, + [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, }; struct nfsd4_minorversion_ops { @@ -1045,7 +1045,7 @@ struct nfsd4_minorversion_ops { }; static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { - [0] { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, + [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, }; static __be32 @@ -2577,42 +2577,42 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); static nfsd4_enc nfsd4_enc_ops[] = { - [OP_ACCESS] (nfsd4_enc)nfsd4_encode_access, - [OP_CLOSE] (nfsd4_enc)nfsd4_encode_close, - [OP_COMMIT] (nfsd4_enc)nfsd4_encode_commit, - [OP_CREATE] (nfsd4_enc)nfsd4_encode_create, - [OP_DELEGPURGE] (nfsd4_enc)nfsd4_encode_noop, - [OP_DELEGRETURN] (nfsd4_enc)nfsd4_encode_noop, - [OP_GETATTR] (nfsd4_enc)nfsd4_encode_getattr, - [OP_GETFH] (nfsd4_enc)nfsd4_encode_getfh, - [OP_LINK] (nfsd4_enc)nfsd4_encode_link, - [OP_LOCK] (nfsd4_enc)nfsd4_encode_lock, - [OP_LOCKT] (nfsd4_enc)nfsd4_encode_lockt, - [OP_LOCKU] (nfsd4_enc)nfsd4_encode_locku, - [OP_LOOKUP] (nfsd4_enc)nfsd4_encode_noop, - [OP_LOOKUPP] (nfsd4_enc)nfsd4_encode_noop, - [OP_NVERIFY] (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN] (nfsd4_enc)nfsd4_encode_open, - [OP_OPEN_CONFIRM] (nfsd4_enc)nfsd4_encode_open_confirm, - [OP_OPEN_DOWNGRADE] (nfsd4_enc)nfsd4_encode_open_downgrade, - [OP_PUTFH] (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTPUBFH] (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTROOTFH] (nfsd4_enc)nfsd4_encode_noop, - [OP_READ] (nfsd4_enc)nfsd4_encode_read, - [OP_READDIR] (nfsd4_enc)nfsd4_encode_readdir, - [OP_READLINK] (nfsd4_enc)nfsd4_encode_readlink, - [OP_REMOVE] (nfsd4_enc)nfsd4_encode_remove, - [OP_RENAME] (nfsd4_enc)nfsd4_encode_rename, - [OP_RENEW] (nfsd4_enc)nfsd4_encode_noop, - [OP_RESTOREFH] (nfsd4_enc)nfsd4_encode_noop, - [OP_SAVEFH] (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO] (nfsd4_enc)nfsd4_encode_secinfo, - [OP_SETATTR] (nfsd4_enc)nfsd4_encode_setattr, - [OP_SETCLIENTID] (nfsd4_enc)nfsd4_encode_setclientid, - [OP_SETCLIENTID_CONFIRM](nfsd4_enc)nfsd4_encode_noop, - [OP_VERIFY] (nfsd4_enc)nfsd4_encode_noop, - [OP_WRITE] (nfsd4_enc)nfsd4_encode_write, - [OP_RELEASE_LOCKOWNER] (nfsd4_enc)nfsd4_encode_noop, + [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, + [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, + [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, + [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, + [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, + [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, + [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, + [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, + [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, + [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, + [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, + [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OPEN] = 
(nfsd4_enc)nfsd4_encode_open, + [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, + [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ] = (nfsd4_enc)nfsd4_encode_read, + [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, + [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, + [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, + [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, + [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, + [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, + [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, + [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, }; void -- cgit v1.2.3-70-g09d2 From 5108b27651727b5aba0826e8fd7be71b42428701 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Thu, 17 Jul 2008 21:33:04 -0700 Subject: nfsd: nfs4xdr.c do-while is not a compound statement The WRITEMEM macro produces sparse warnings of the form: fs/nfsd/nfs4xdr.c:2668:2: warning: do-while statement is not a compound statement Signed-off-by: Harvey Harrison Cc: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d4b9d09a668..14ba4d9b285 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1160,11 +1160,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) *p++ = htonl((u32)((n) >> 32)); \ *p++ = htonl((u32)(n)); \ } while (0) -#define WRITEMEM(ptr,nbytes) do if (nbytes > 0) { \ +#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ *(p + XDR_QUADLEN(nbytes) -1) = 0; \ memcpy(p, ptr, nbytes); \ p += XDR_QUADLEN(nbytes); \ -} while (0) +}} while (0) #define WRITECINFO(c) do { \ *p++ = htonl(c.atomic); \ *p++ = htonl(c.before_ctime_sec); \ -- cgit v1.2.3-70-g09d2 From a352def21a642133758b868c71bee12ab34ad5c5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 16 Jul 2008 21:53:12 +0100 Subject: tty: Ldisc revamp Move the line disciplines towards a conventional ->ops arrangement. For the moment the actual 'tty_ldisc' struct in the tty is kept as part of the tty struct but this can then be changed if it turns out that when it all settles down we want to refcount ldiscs separately to the tty. Pull the ldisc code out of /proc and put it with our ldisc code. 
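To make the new split concrete, a minimal sketch of what a line discipline looks like after this change is shown below. The N_EXAMPLE number, the example_* callbacks and the "foo" helper are hypothetical placeholders for illustration only; the tty_ldisc_ops fields, tty_register_ldisc()/tty_unregister_ldisc() and the tty_ldisc_ref()/ld->ops->.../tty_ldisc_deref() pattern are the ones introduced or touched by this patch.

	#include <linux/module.h>
	#include <linux/tty.h>
	#include <linux/tty_ldisc.h>

	/* Hypothetical discipline number, for illustration only. */
	#define N_EXAMPLE 15

	static int example_open(struct tty_struct *tty)
	{
		/* allocate per-tty state and hang it off tty->disc_data */
		return 0;
	}

	static void example_close(struct tty_struct *tty)
	{
		/* free whatever example_open() allocated */
	}

	static void example_receive_buf(struct tty_struct *tty,
					const unsigned char *cp,
					char *fp, int count)
	{
		/* consume 'count' bytes pushed up by the low-level driver */
	}

	/* Disciplines now register a struct tty_ldisc_ops, not a tty_ldisc. */
	static struct tty_ldisc_ops example_ldisc_ops = {
		.owner		= THIS_MODULE,
		.magic		= TTY_LDISC_MAGIC,
		.name		= "example",
		.open		= example_open,
		.close		= example_close,
		.receive_buf	= example_receive_buf,
	};

	static int __init example_ldisc_init(void)
	{
		/* tty_register_ldisc() now takes the ops structure. */
		return tty_register_ldisc(N_EXAMPLE, &example_ldisc_ops);
	}

	static void __exit example_ldisc_exit(void)
	{
		tty_unregister_ldisc(N_EXAMPLE);
	}

	module_init(example_ldisc_init);
	module_exit(example_ldisc_exit);
	MODULE_LICENSE("GPL");

	/*
	 * On the driver side (e.g. a serial driver feeding received data up),
	 * callers now indirect through ld->ops, mirroring the conversions in
	 * this patch (hypothetical foo_ helper name):
	 */
	static void foo_push_to_ldisc(struct tty_struct *tty,
				      const unsigned char *data,
				      char *flags, int count)
	{
		struct tty_ldisc *ld = tty_ldisc_ref(tty);

		if (ld) {
			if (ld->ops->receive_buf)
				ld->ops->receive_buf(tty, data, flags, count);
			tty_ldisc_deref(ld);
		}
	}
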
Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/bluetooth/hci_ldisc.c | 6 +- drivers/char/cyclades.c | 3 +- drivers/char/epca.c | 4 +- drivers/char/ip2/i2lib.c | 4 +- drivers/char/ip2/ip2main.c | 7 +- drivers/char/n_hdlc.c | 6 +- drivers/char/n_r3964.c | 2 +- drivers/char/n_tty.c | 2 +- drivers/char/pcmcia/synclink_cs.c | 4 +- drivers/char/pty.c | 10 +- drivers/char/selection.c | 3 +- drivers/char/synclink.c | 4 +- drivers/char/synclink_gt.c | 4 +- drivers/char/synclinkmp.c | 4 +- drivers/char/tty_io.c | 336 +++++++++++++++++++++++-------------- drivers/char/tty_ioctl.c | 16 +- drivers/input/serio/serport.c | 2 +- drivers/isdn/capi/capi.c | 4 +- drivers/isdn/gigaset/ser-gigaset.c | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/mkiss.c | 2 +- drivers/net/irda/irtty-sir.c | 2 +- drivers/net/ppp_async.c | 2 +- drivers/net/ppp_synctty.c | 2 +- drivers/net/slip.c | 2 +- drivers/net/wan/pc300_tty.c | 4 +- drivers/net/wan/x25_asy.c | 2 +- fs/proc/proc_tty.c | 48 ------ include/linux/tty.h | 9 +- include/linux/tty_ldisc.h | 7 +- net/bluetooth/rfcomm/tty.c | 13 +- net/irda/ircomm/ircomm_tty.c | 14 +- 32 files changed, 286 insertions(+), 246 deletions(-) (limited to 'fs') diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index e5cd856a2fe..69df187d74c 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -282,8 +282,8 @@ static int hci_uart_tty_open(struct tty_struct *tty) /* FIXME: why is this needed. Note don't use ldisc_ref here as the open path is before the ldisc is referencable */ - if (tty->ldisc.flush_buffer) - tty->ldisc.flush_buffer(tty); + if (tty->ldisc.ops->flush_buffer) + tty->ldisc.ops->flush_buffer(tty); tty_driver_flush_buffer(tty); return 0; @@ -514,7 +514,7 @@ static unsigned int hci_uart_tty_poll(struct tty_struct *tty, static int __init hci_uart_init(void) { - static struct tty_ldisc hci_uart_ldisc; + static struct tty_ldisc_ops hci_uart_ldisc; int err; BT_INFO("HCI UART driver ver %s", VERSION); diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index 6bff9d87dc5..a957dbcc5a4 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -5246,7 +5246,8 @@ cyclades_get_proc_info(char *buf, char **start, off_t offset, int length, HZ, info->idle_stats.recv_bytes, (cur_jifs - info->idle_stats.recv_idle)/ HZ, info->idle_stats.overruns, - (long)info->tty->ldisc.num); + /* FIXME: double check locking */ + (long)info->tty->ldisc.ops->num); else size = sprintf(buf + len, "%3d %8lu %10lu %8lu " "%10lu %8lu %9lu %6ld\n", diff --git a/drivers/char/epca.c b/drivers/char/epca.c index 60a4df7dac1..aa8e19f44b4 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -2262,8 +2262,8 @@ static int pc_ioctl(struct tty_struct *tty, struct file *file, tty_wait_until_sent(tty, 0); } else { /* ldisc lock already held in ioctl */ - if (tty->ldisc.flush_buffer) - tty->ldisc.flush_buffer(tty); + if (tty->ldisc.ops->flush_buffer) + tty->ldisc.ops->flush_buffer(tty); } unlock_kernel(); /* Fall Thru */ diff --git a/drivers/char/ip2/i2lib.c b/drivers/char/ip2/i2lib.c index 938879cc7bc..0061e18aff6 100644 --- a/drivers/char/ip2/i2lib.c +++ b/drivers/char/ip2/i2lib.c @@ -868,11 +868,11 @@ i2Input(i2ChanStrPtr pCh) amountToMove = count; } // Move the first block - pCh->pTTY->ldisc.receive_buf( pCh->pTTY, + pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY, &(pCh->Ibuf[stripIndex]), NULL, amountToMove ); // If we needed to wrap, do the second data move if (count > amountToMove) { - 
pCh->pTTY->ldisc.receive_buf( pCh->pTTY, + pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY, pCh->Ibuf, NULL, count - amountToMove ); } // Bump and wrap the stripIndex all at once by the amount of data read. This diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c index 9a2394cda94..5dc74404058 100644 --- a/drivers/char/ip2/ip2main.c +++ b/drivers/char/ip2/ip2main.c @@ -1289,11 +1289,12 @@ static void do_input(struct work_struct *work) // code duplicated from n_tty (ldisc) static inline void isig(int sig, struct tty_struct *tty, int flush) { + /* FIXME: This is completely bogus */ if (tty->pgrp) kill_pgrp(tty->pgrp, sig, 1); if (flush || !L_NOFLSH(tty)) { - if ( tty->ldisc.flush_buffer ) - tty->ldisc.flush_buffer(tty); + if ( tty->ldisc.ops->flush_buffer ) + tty->ldisc.ops->flush_buffer(tty); i2InputFlush( tty->driver_data ); } } @@ -1342,7 +1343,7 @@ static void do_status(struct work_struct *work) } tmp = pCh->pTTY->real_raw; pCh->pTTY->real_raw = 0; - pCh->pTTY->ldisc.receive_buf( pCh->pTTY, &brkc, &brkf, 1 ); + pCh->pTTY->ldisc->ops.receive_buf( pCh->pTTY, &brkc, &brkf, 1 ); pCh->pTTY->real_raw = tmp; } #endif /* NEVER_HAPPENS_AS_SETUP_XXX */ diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c index a35bfd7ee80..ed4e03333ab 100644 --- a/drivers/char/n_hdlc.c +++ b/drivers/char/n_hdlc.c @@ -199,7 +199,7 @@ static void n_hdlc_tty_wakeup(struct tty_struct *tty); #define tty2n_hdlc(tty) ((struct n_hdlc *) ((tty)->disc_data)) #define n_hdlc2tty(n_hdlc) ((n_hdlc)->tty) -static struct tty_ldisc n_hdlc_ldisc = { +static struct tty_ldisc_ops n_hdlc_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "hdlc", @@ -342,8 +342,8 @@ static int n_hdlc_tty_open (struct tty_struct *tty) #endif /* Flush any pending characters in the driver and discipline. 
*/ - if (tty->ldisc.flush_buffer) - tty->ldisc.flush_buffer(tty); + if (tty->ldisc.ops->flush_buffer) + tty->ldisc.ops->flush_buffer(tty); tty_driver_flush_buffer(tty); diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c index 90216906233..ae377aa473b 100644 --- a/drivers/char/n_r3964.c +++ b/drivers/char/n_r3964.c @@ -143,7 +143,7 @@ static unsigned int r3964_poll(struct tty_struct *tty, struct file *file, static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp, char *fp, int count); -static struct tty_ldisc tty_ldisc_N_R3964 = { +static struct tty_ldisc_ops tty_ldisc_N_R3964 = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "R3964", diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c index 8096389b0dc..708c2b1dbe5 100644 --- a/drivers/char/n_tty.c +++ b/drivers/char/n_tty.c @@ -1573,7 +1573,7 @@ static unsigned int normal_poll(struct tty_struct *tty, struct file *file, return mask; } -struct tty_ldisc tty_ldisc_N_TTY = { +struct tty_ldisc_ops tty_ldisc_N_TTY = { .magic = TTY_LDISC_MAGIC, .name = "n_tty", .open = n_tty_open, diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 1dd0e992c83..95743e2682f 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -514,8 +514,8 @@ static void ldisc_receive_buf(struct tty_struct *tty, return; ld = tty_ldisc_ref(tty); if (ld) { - if (ld->receive_buf) - ld->receive_buf(tty, data, flags, count); + if (ld->ops->receive_buf) + ld->ops->receive_buf(tty, data, flags, count); tty_ldisc_deref(ld); } } diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 0a05c038ae6..76b27932d22 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -111,7 +111,7 @@ static int pty_write(struct tty_struct * tty, const unsigned char *buf, int coun c = to->receive_room; if (c > count) c = count; - to->ldisc.receive_buf(to, buf, NULL, c); + to->ldisc.ops->receive_buf(to, buf, NULL, c); return c; } @@ -149,11 +149,11 @@ static int pty_chars_in_buffer(struct tty_struct *tty) int count; /* We should get the line discipline lock for "tty->link" */ - if (!to || !to->ldisc.chars_in_buffer) + if (!to || !to->ldisc.ops->chars_in_buffer) return 0; /* The ldisc must report 0 if no characters available to be read */ - count = to->ldisc.chars_in_buffer(to); + count = to->ldisc.ops->chars_in_buffer(to); if (tty->driver->subtype == PTY_TYPE_SLAVE) return count; @@ -186,8 +186,8 @@ static void pty_flush_buffer(struct tty_struct *tty) if (!to) return; - if (to->ldisc.flush_buffer) - to->ldisc.flush_buffer(to); + if (to->ldisc.ops->flush_buffer) + to->ldisc.ops->flush_buffer(to); if (to->packet) { spin_lock_irqsave(&tty->ctrl_lock, flags); diff --git a/drivers/char/selection.c b/drivers/char/selection.c index d63f5ccc29e..2978a49a172 100644 --- a/drivers/char/selection.c +++ b/drivers/char/selection.c @@ -327,7 +327,8 @@ int paste_selection(struct tty_struct *tty) } count = sel_buffer_lth - pasted; count = min(count, tty->receive_room); - tty->ldisc.receive_buf(tty, sel_buffer + pasted, NULL, count); + tty->ldisc.ops->receive_buf(tty, sel_buffer + pasted, + NULL, count); pasted += count; } remove_wait_queue(&vc->paste_wait, &wait); diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c index ac5080df256..5e4b2e638d0 100644 --- a/drivers/char/synclink.c +++ b/drivers/char/synclink.c @@ -975,8 +975,8 @@ static void ldisc_receive_buf(struct tty_struct *tty, return; ld = tty_ldisc_ref(tty); if (ld) { - if (ld->receive_buf) - ld->receive_buf(tty, data, flags, count); 
+ if (ld->ops->receive_buf) + ld->ops->receive_buf(tty, data, flags, count); tty_ldisc_deref(ld); } } diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index 55c1653be00..e473778cd6f 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -641,8 +641,8 @@ static void ldisc_receive_buf(struct tty_struct *tty, return; ld = tty_ldisc_ref(tty); if (ld) { - if (ld->receive_buf) - ld->receive_buf(tty, data, flags, count); + if (ld->ops->receive_buf) + ld->ops->receive_buf(tty, data, flags, count); tty_ldisc_deref(ld); } } diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c index bec54866e0b..5341b5aaf8b 100644 --- a/drivers/char/synclinkmp.c +++ b/drivers/char/synclinkmp.c @@ -712,8 +712,8 @@ static void ldisc_receive_buf(struct tty_struct *tty, return; ld = tty_ldisc_ref(tty); if (ld) { - if (ld->receive_buf) - ld->receive_buf(tty, data, flags, count); + if (ld->ops->receive_buf) + ld->ops->receive_buf(tty, data, flags, count); tty_ldisc_deref(ld); } } diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 047a17339f8..54c4ada460e 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -95,8 +95,9 @@ #include #include #include +#include -#include +#include #include #include @@ -682,7 +683,7 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) static DEFINE_SPINLOCK(tty_ldisc_lock); static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait); /* Line disc dispatch table */ -static struct tty_ldisc tty_ldiscs[NR_LDISCS]; +static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; /** * tty_register_ldisc - install a line discipline @@ -697,7 +698,7 @@ static struct tty_ldisc tty_ldiscs[NR_LDISCS]; * takes tty_ldisc_lock to guard against ldisc races */ -int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) +int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc) { unsigned long flags; int ret = 0; @@ -706,10 +707,9 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) return -EINVAL; spin_lock_irqsave(&tty_ldisc_lock, flags); - tty_ldiscs[disc] = *new_ldisc; - tty_ldiscs[disc].num = disc; - tty_ldiscs[disc].flags |= LDISC_FLAG_DEFINED; - tty_ldiscs[disc].refcount = 0; + tty_ldiscs[disc] = new_ldisc; + new_ldisc->num = disc; + new_ldisc->refcount = 0; spin_unlock_irqrestore(&tty_ldisc_lock, flags); return ret; @@ -737,19 +737,56 @@ int tty_unregister_ldisc(int disc) return -EINVAL; spin_lock_irqsave(&tty_ldisc_lock, flags); - if (tty_ldiscs[disc].refcount) + if (tty_ldiscs[disc]->refcount) ret = -EBUSY; else - tty_ldiscs[disc].flags &= ~LDISC_FLAG_DEFINED; + tty_ldiscs[disc] = NULL; spin_unlock_irqrestore(&tty_ldisc_lock, flags); return ret; } EXPORT_SYMBOL(tty_unregister_ldisc); + +/** + * tty_ldisc_try_get - try and reference an ldisc + * @disc: ldisc number + * @ld: tty ldisc structure to complete + * + * Attempt to open and lock a line discipline into place. Return + * the line discipline refcounted and assigned in ld. 
On an error + * report the error code back + */ + +static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld) +{ + unsigned long flags; + struct tty_ldisc_ops *ldops; + int err = -EINVAL; + + spin_lock_irqsave(&tty_ldisc_lock, flags); + ld->ops = NULL; + ldops = tty_ldiscs[disc]; + /* Check the entry is defined */ + if (ldops) { + /* If the module is being unloaded we can't use it */ + if (!try_module_get(ldops->owner)) + err = -EAGAIN; + else { + /* lock it */ + ldops->refcount++; + ld->ops = ldops; + err = 0; + } + } + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + return err; +} + /** * tty_ldisc_get - take a reference to an ldisc * @disc: ldisc number + * @ld: tty line discipline structure to use * * Takes a reference to a line discipline. Deals with refcounts and * module locking counts. Returns NULL if the discipline is not available. @@ -760,32 +797,20 @@ EXPORT_SYMBOL(tty_unregister_ldisc); * takes tty_ldisc_lock to guard against ldisc races */ -struct tty_ldisc *tty_ldisc_get(int disc) +static int tty_ldisc_get(int disc, struct tty_ldisc *ld) { - unsigned long flags; - struct tty_ldisc *ld; + int err; if (disc < N_TTY || disc >= NR_LDISCS) - return NULL; - - spin_lock_irqsave(&tty_ldisc_lock, flags); - - ld = &tty_ldiscs[disc]; - /* Check the entry is defined */ - if (ld->flags & LDISC_FLAG_DEFINED) { - /* If the module is being unloaded we can't use it */ - if (!try_module_get(ld->owner)) - ld = NULL; - else /* lock it */ - ld->refcount++; - } else - ld = NULL; - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return ld; + return -EINVAL; + err = tty_ldisc_try_get(disc, ld); + if (err == -EAGAIN) { + request_module("tty-ldisc-%d", disc); + err = tty_ldisc_try_get(disc, ld); + } + return err; } -EXPORT_SYMBOL_GPL(tty_ldisc_get); - /** * tty_ldisc_put - drop ldisc reference * @disc: ldisc number @@ -797,22 +822,67 @@ EXPORT_SYMBOL_GPL(tty_ldisc_get); * takes tty_ldisc_lock to guard against ldisc races */ -void tty_ldisc_put(int disc) +static void tty_ldisc_put(struct tty_ldisc_ops *ld) { - struct tty_ldisc *ld; unsigned long flags; + int disc = ld->num; BUG_ON(disc < N_TTY || disc >= NR_LDISCS); spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = &tty_ldiscs[disc]; + ld = tty_ldiscs[disc]; BUG_ON(ld->refcount == 0); ld->refcount--; module_put(ld->owner); spin_unlock_irqrestore(&tty_ldisc_lock, flags); } -EXPORT_SYMBOL_GPL(tty_ldisc_put); +static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) +{ + return (*pos < NR_LDISCS) ? pos : NULL; +} + +static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (*pos < NR_LDISCS) ? pos : NULL; +} + +static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) +{ +} + +static int tty_ldiscs_seq_show(struct seq_file *m, void *v) +{ + int i = *(loff_t *)v; + struct tty_ldisc ld; + + if (tty_ldisc_get(i, &ld) < 0) + return 0; + seq_printf(m, "%-10s %2d\n", ld.ops->name ? 
ld.ops->name : "???", i); + tty_ldisc_put(ld.ops); + return 0; +} + +static const struct seq_operations tty_ldiscs_seq_ops = { + .start = tty_ldiscs_seq_start, + .next = tty_ldiscs_seq_next, + .stop = tty_ldiscs_seq_stop, + .show = tty_ldiscs_seq_show, +}; + +static int proc_tty_ldiscs_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &tty_ldiscs_seq_ops); +} + +const struct file_operations tty_ldiscs_proc_fops = { + .owner = THIS_MODULE, + .open = proc_tty_ldiscs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; /** * tty_ldisc_assign - set ldisc on a tty @@ -829,8 +899,8 @@ EXPORT_SYMBOL_GPL(tty_ldisc_put); static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) { + ld->refcount = 0; tty->ldisc = *ld; - tty->ldisc.refcount = 0; } /** @@ -953,6 +1023,41 @@ static void tty_ldisc_enable(struct tty_struct *tty) wake_up(&tty_ldisc_wait); } +/** + * tty_ldisc_restore - helper for tty ldisc change + * @tty: tty to recover + * @old: previous ldisc + * + * Restore the previous line discipline or N_TTY when a line discipline + * change fails due to an open error + */ + +static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) +{ + char buf[64]; + struct tty_ldisc new_ldisc; + + /* There is an outstanding reference here so this is safe */ + tty_ldisc_get(old->ops->num, old); + tty_ldisc_assign(tty, old); + tty_set_termios_ldisc(tty, old->ops->num); + if (old->ops->open && (old->ops->open(tty) < 0)) { + tty_ldisc_put(old->ops); + /* This driver is always present */ + if (tty_ldisc_get(N_TTY, &new_ldisc) < 0) + panic("n_tty: get"); + tty_ldisc_assign(tty, &new_ldisc); + tty_set_termios_ldisc(tty, N_TTY); + if (new_ldisc.ops->open) { + int r = new_ldisc.ops->open(tty); + if (r < 0) + panic("Couldn't open N_TTY ldisc for " + "%s --- error %d.", + tty_name(tty, buf), r); + } + } +} + /** * tty_set_ldisc - set line discipline * @tty: the terminal to set @@ -967,28 +1072,18 @@ static void tty_ldisc_enable(struct tty_struct *tty) static int tty_set_ldisc(struct tty_struct *tty, int ldisc) { - int retval = 0; - struct tty_ldisc o_ldisc; - char buf[64]; + int retval; + struct tty_ldisc o_ldisc, new_ldisc; int work; unsigned long flags; - struct tty_ldisc *ld; struct tty_struct *o_tty; - if ((ldisc < N_TTY) || (ldisc >= NR_LDISCS)) - return -EINVAL; - restart: - - ld = tty_ldisc_get(ldisc); - /* Eduardo Blanco */ - /* Cyrus Durgin */ - if (ld == NULL) { - request_module("tty-ldisc-%d", ldisc); - ld = tty_ldisc_get(ldisc); - } - if (ld == NULL) - return -EINVAL; + /* This is a bit ugly for now but means we can break the 'ldisc + is part of the tty struct' assumption later */ + retval = tty_ldisc_get(ldisc, &new_ldisc); + if (retval) + return retval; /* * Problem: What do we do if this blocks ? @@ -996,8 +1091,8 @@ restart: tty_wait_until_sent(tty, 0); - if (tty->ldisc.num == ldisc) { - tty_ldisc_put(ldisc); + if (tty->ldisc.ops->num == ldisc) { + tty_ldisc_put(new_ldisc.ops); return 0; } @@ -1024,7 +1119,7 @@ restart: /* Free the new ldisc we grabbed. Must drop the lock first. */ spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(ldisc); + tty_ldisc_put(o_ldisc.ops); /* * There are several reasons we may be busy, including * random momentary I/O traffic. 
We must therefore @@ -1038,7 +1133,7 @@ restart: } if (o_tty && o_tty->ldisc.refcount) { spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(ldisc); + tty_ldisc_put(o_tty->ldisc.ops); if (wait_event_interruptible(tty_ldisc_wait, o_tty->ldisc.refcount == 0) < 0) return -ERESTARTSYS; goto restart; @@ -1049,8 +1144,9 @@ restart: * another ldisc change */ if (!test_bit(TTY_LDISC, &tty->flags)) { + struct tty_ldisc *ld; spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(ldisc); + tty_ldisc_put(new_ldisc.ops); ld = tty_ldisc_ref_wait(tty); tty_ldisc_deref(ld); goto restart; @@ -1060,7 +1156,7 @@ restart: if (o_tty) clear_bit(TTY_LDISC, &o_tty->flags); spin_unlock_irqrestore(&tty_ldisc_lock, flags); - + /* * From this point on we know nobody has an ldisc * usage reference, nor can they obtain one until @@ -1070,45 +1166,30 @@ restart: work = cancel_delayed_work(&tty->buf.work); /* * Wait for ->hangup_work and ->buf.work handlers to terminate + * MUST NOT hold locks here. */ flush_scheduled_work(); /* Shutdown the current discipline. */ - if (tty->ldisc.close) - (tty->ldisc.close)(tty); + if (o_ldisc.ops->close) + (o_ldisc.ops->close)(tty); /* Now set up the new line discipline. */ - tty_ldisc_assign(tty, ld); + tty_ldisc_assign(tty, &new_ldisc); tty_set_termios_ldisc(tty, ldisc); - if (tty->ldisc.open) - retval = (tty->ldisc.open)(tty); + if (new_ldisc.ops->open) + retval = (new_ldisc.ops->open)(tty); if (retval < 0) { - tty_ldisc_put(ldisc); - /* There is an outstanding reference here so this is safe */ - tty_ldisc_assign(tty, tty_ldisc_get(o_ldisc.num)); - tty_set_termios_ldisc(tty, tty->ldisc.num); - if (tty->ldisc.open && (tty->ldisc.open(tty) < 0)) { - tty_ldisc_put(o_ldisc.num); - /* This driver is always present */ - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); - tty_set_termios_ldisc(tty, N_TTY); - if (tty->ldisc.open) { - int r = tty->ldisc.open(tty); - - if (r < 0) - panic("Couldn't open N_TTY ldisc for " - "%s --- error %d.", - tty_name(tty, buf), r); - } - } + tty_ldisc_put(new_ldisc.ops); + tty_ldisc_restore(tty, &o_ldisc); } /* At this point we hold a reference to the new ldisc and a a reference to the old ldisc. If we ended up flipping back to the existing ldisc we have two references to it */ - if (tty->ldisc.num != o_ldisc.num && tty->ops->set_ldisc) + if (tty->ldisc.ops->num != o_ldisc.ops->num && tty->ops->set_ldisc) tty->ops->set_ldisc(tty); - tty_ldisc_put(o_ldisc.num); + tty_ldisc_put(o_ldisc.ops); /* * Allow ldisc referencing to occur as soon as the driver @@ -1335,8 +1416,8 @@ void tty_wakeup(struct tty_struct *tty) if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { ld = tty_ldisc_ref(tty); if (ld) { - if (ld->write_wakeup) - ld->write_wakeup(tty); + if (ld->ops->write_wakeup) + ld->ops->write_wakeup(tty); tty_ldisc_deref(ld); } } @@ -1357,8 +1438,8 @@ void tty_ldisc_flush(struct tty_struct *tty) { struct tty_ldisc *ld = tty_ldisc_ref(tty); if (ld) { - if (ld->flush_buffer) - ld->flush_buffer(tty); + if (ld->ops->flush_buffer) + ld->ops->flush_buffer(tty); tty_ldisc_deref(ld); } tty_buffer_flush(tty); @@ -1386,7 +1467,7 @@ static void tty_reset_termios(struct tty_struct *tty) * do_tty_hangup - actual handler for hangup events * @work: tty device * - * This can be called by the "eventd" kernel thread. That is process +k * This can be called by the "eventd" kernel thread. That is process * synchronous but doesn't hold any locks, so we need to make sure we * have the appropriate locks for what we're doing. 
* @@ -1449,14 +1530,14 @@ static void do_tty_hangup(struct work_struct *work) ld = tty_ldisc_ref(tty); if (ld != NULL) { /* We may have no line discipline at this point */ - if (ld->flush_buffer) - ld->flush_buffer(tty); + if (ld->ops->flush_buffer) + ld->ops->flush_buffer(tty); tty_driver_flush_buffer(tty); if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && - ld->write_wakeup) - ld->write_wakeup(tty); - if (ld->hangup) - ld->hangup(tty); + ld->ops->write_wakeup) + ld->ops->write_wakeup(tty); + if (ld->ops->hangup) + ld->ops->hangup(tty); } /* * FIXME: Once we trust the LDISC code better we can wait here for @@ -1825,8 +1906,8 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count, /* We want to wait for the line discipline to sort out in this situation */ ld = tty_ldisc_ref_wait(tty); - if (ld->read) - i = (ld->read)(tty, file, buf, count); + if (ld->ops->read) + i = (ld->ops->read)(tty, file, buf, count); else i = -EIO; tty_ldisc_deref(ld); @@ -1978,10 +2059,10 @@ static ssize_t tty_write(struct file *file, const char __user *buf, printk(KERN_ERR "tty driver %s lacks a write_room method.\n", tty->driver->name); ld = tty_ldisc_ref_wait(tty); - if (!ld->write) + if (!ld->ops->write) ret = -EIO; else - ret = do_tty_write(ld->write, tty, file, buf, count); + ret = do_tty_write(ld->ops->write, tty, file, buf, count); tty_ldisc_deref(ld); return ret; } @@ -2076,6 +2157,7 @@ static int init_dev(struct tty_driver *driver, int idx, struct ktermios *tp, **tp_loc, *o_tp, **o_tp_loc; struct ktermios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; int retval = 0; + struct tty_ldisc *ld; /* check whether we're reopening an existing tty */ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { @@ -2224,17 +2306,19 @@ static int init_dev(struct tty_driver *driver, int idx, * If we fail here just call release_tty to clean up. No need * to decrement the use counts, as release_tty doesn't care. */ + + ld = &tty->ldisc; - if (tty->ldisc.open) { - retval = (tty->ldisc.open)(tty); + if (ld->ops->open) { + retval = (ld->ops->open)(tty); if (retval) goto release_mem_out; } - if (o_tty && o_tty->ldisc.open) { - retval = (o_tty->ldisc.open)(o_tty); + if (o_tty && o_tty->ldisc.ops->open) { + retval = (o_tty->ldisc.ops->open)(o_tty); if (retval) { - if (tty->ldisc.close) - (tty->ldisc.close)(tty); + if (ld->ops->close) + (ld->ops->close)(tty); goto release_mem_out; } tty_ldisc_enable(o_tty); @@ -2378,6 +2462,7 @@ static void release_tty(struct tty_struct *tty, int idx) static void release_dev(struct file *filp) { struct tty_struct *tty, *o_tty; + struct tty_ldisc ld; int pty_master, tty_closing, o_tty_closing, do_sleep; int devpts; int idx; @@ -2611,26 +2696,27 @@ static void release_dev(struct file *filp) spin_unlock_irqrestore(&tty_ldisc_lock, flags); /* * Shutdown the current line discipline, and reset it to N_TTY. - * N.B. why reset ldisc when we're releasing the memory?? * * FIXME: this MUST get fixed for the new reflocking */ - if (tty->ldisc.close) - (tty->ldisc.close)(tty); - tty_ldisc_put(tty->ldisc.num); + if (tty->ldisc.ops->close) + (tty->ldisc.ops->close)(tty); + tty_ldisc_put(tty->ldisc.ops); /* * Switch the line discipline back */ - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); + WARN_ON(tty_ldisc_get(N_TTY, &ld)); + tty_ldisc_assign(tty, &ld); tty_set_termios_ldisc(tty, N_TTY); if (o_tty) { /* FIXME: could o_tty be in setldisc here ? 
*/ clear_bit(TTY_LDISC, &o_tty->flags); - if (o_tty->ldisc.close) - (o_tty->ldisc.close)(o_tty); - tty_ldisc_put(o_tty->ldisc.num); - tty_ldisc_assign(o_tty, tty_ldisc_get(N_TTY)); + if (o_tty->ldisc.ops->close) + (o_tty->ldisc.ops->close)(o_tty); + tty_ldisc_put(o_tty->ldisc.ops); + WARN_ON(tty_ldisc_get(N_TTY, &ld)); + tty_ldisc_assign(o_tty, &ld); tty_set_termios_ldisc(o_tty, N_TTY); } /* @@ -2899,8 +2985,8 @@ static unsigned int tty_poll(struct file *filp, poll_table *wait) return 0; ld = tty_ldisc_ref_wait(tty); - if (ld->poll) - ret = (ld->poll)(tty, filp, wait); + if (ld->ops->poll) + ret = (ld->ops->poll)(tty, filp, wait); tty_ldisc_deref(ld); return ret; } @@ -2974,7 +3060,7 @@ static int tiocsti(struct tty_struct *tty, char __user *p) if (get_user(ch, p)) return -EFAULT; ld = tty_ldisc_ref_wait(tty); - ld->receive_buf(tty, &ch, &mbz, 1); + ld->ops->receive_buf(tty, &ch, &mbz, 1); tty_ldisc_deref(ld); return 0; } @@ -3528,7 +3614,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCGSID: return tiocgsid(tty, real_tty, p); case TIOCGETD: - return put_user(tty->ldisc.num, (int __user *)p); + return put_user(tty->ldisc.ops->num, (int __user *)p); case TIOCSETD: return tiocsetd(tty, p); #ifdef CONFIG_VT @@ -3581,8 +3667,8 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } ld = tty_ldisc_ref_wait(tty); retval = -EINVAL; - if (ld->ioctl) { - retval = ld->ioctl(tty, file, cmd, arg); + if (ld->ops->ioctl) { + retval = ld->ops->ioctl(tty, file, cmd, arg); if (retval == -ENOIOCTLCMD) retval = -EINVAL; } @@ -3609,8 +3695,8 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd, } ld = tty_ldisc_ref_wait(tty); - if (ld->compat_ioctl) - retval = ld->compat_ioctl(tty, file, cmd, arg); + if (ld->ops->compat_ioctl) + retval = ld->ops->compat_ioctl(tty, file, cmd, arg); tty_ldisc_deref(ld); return retval; @@ -3782,7 +3868,8 @@ static void flush_to_ldisc(struct work_struct *work) flag_buf = head->flag_buf_ptr + head->read; head->read += count; spin_unlock_irqrestore(&tty->buf.lock, flags); - disc->receive_buf(tty, char_buf, flag_buf, count); + disc->ops->receive_buf(tty, char_buf, + flag_buf, count); spin_lock_irqsave(&tty->buf.lock, flags); } /* Restore the queue head */ @@ -3843,9 +3930,12 @@ EXPORT_SYMBOL(tty_flip_buffer_push); static void initialize_tty_struct(struct tty_struct *tty) { + struct tty_ldisc ld; memset(tty, 0, sizeof(struct tty_struct)); tty->magic = TTY_MAGIC; - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); + if (tty_ldisc_get(N_TTY, &ld) < 0) + panic("n_tty: init_tty"); + tty_ldisc_assign(tty, &ld); tty->session = NULL; tty->pgrp = NULL; tty->overrun_time = jiffies; diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index 8f81139d619..ea9fc5d03b9 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -491,8 +491,8 @@ static void change_termios(struct tty_struct *tty, struct ktermios *new_termios) ld = tty_ldisc_ref(tty); if (ld != NULL) { - if (ld->set_termios) - (ld->set_termios)(tty, &old_termios); + if (ld->ops->set_termios) + (ld->ops->set_termios)(tty, &old_termios); tty_ldisc_deref(ld); } mutex_unlock(&tty->termios_mutex); @@ -552,8 +552,8 @@ static int set_termios(struct tty_struct *tty, void __user *arg, int opt) ld = tty_ldisc_ref(tty); if (ld != NULL) { - if ((opt & TERMIOS_FLUSH) && ld->flush_buffer) - ld->flush_buffer(tty); + if ((opt & TERMIOS_FLUSH) && ld->ops->flush_buffer) + ld->ops->flush_buffer(tty); tty_ldisc_deref(ld); } @@ -959,12 +959,12 @@ int 
tty_perform_flush(struct tty_struct *tty, unsigned long arg) ld = tty_ldisc_ref(tty); switch (arg) { case TCIFLUSH: - if (ld && ld->flush_buffer) - ld->flush_buffer(tty); + if (ld && ld->ops->flush_buffer) + ld->ops->flush_buffer(tty); break; case TCIOFLUSH: - if (ld && ld->flush_buffer) - ld->flush_buffer(tty); + if (ld && ld->ops->flush_buffer) + ld->ops->flush_buffer(tty); /* fall through */ case TCOFLUSH: tty_driver_flush_buffer(tty); diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index 7ff71ba7b7c..b9694b6445d 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -216,7 +216,7 @@ static void serport_ldisc_write_wakeup(struct tty_struct * tty) * The line discipline structure. */ -static struct tty_ldisc serport_ldisc = { +static struct tty_ldisc_ops serport_ldisc = { .owner = THIS_MODULE, .name = "input", .open = serport_ldisc_open, diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 2095153582f..8a35029caca 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -466,7 +466,7 @@ static int handle_recv_skb(struct capiminor *mp, struct sk_buff *skb) ld = tty_ldisc_ref(mp->tty); if (ld == NULL) return -1; - if (ld->receive_buf == NULL) { + if (ld->ops->receive_buf == NULL) { #if defined(_DEBUG_DATAFLOW) || defined(_DEBUG_TTYFUNCS) printk(KERN_DEBUG "capi: ldisc has no receive_buf function\n"); #endif @@ -501,7 +501,7 @@ static int handle_recv_skb(struct capiminor *mp, struct sk_buff *skb) printk(KERN_DEBUG "capi: DATA_B3_RESP %u len=%d => ldisc\n", datahandle, skb->len); #endif - ld->receive_buf(mp->tty, skb->data, NULL, skb->len); + ld->ops->receive_buf(mp->tty, skb->data, NULL, skb->len); kfree_skb(skb); tty_ldisc_deref(ld); return 0; diff --git a/drivers/isdn/gigaset/ser-gigaset.c b/drivers/isdn/gigaset/ser-gigaset.c index 45d1ee93cd3..5e89fa17781 100644 --- a/drivers/isdn/gigaset/ser-gigaset.c +++ b/drivers/isdn/gigaset/ser-gigaset.c @@ -766,7 +766,7 @@ gigaset_tty_wakeup(struct tty_struct *tty) cs_put(cs); } -static struct tty_ldisc gigaset_ldisc = { +static struct tty_ldisc_ops gigaset_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "ser_gigaset", diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 9d5721287d6..19dd0a61749 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -783,7 +783,7 @@ static int sixpack_ioctl(struct tty_struct *tty, struct file *file, return err; } -static struct tty_ldisc sp_ldisc = { +static struct tty_ldisc_ops sp_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "6pack", diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 65166035aca..c6ca47599fd 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -969,7 +969,7 @@ out: mkiss_put(ax); } -static struct tty_ldisc ax_ldisc = { +static struct tty_ldisc_ops ax_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "mkiss", diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c index e6f40b7f904..9e33196f945 100644 --- a/drivers/net/irda/irtty-sir.c +++ b/drivers/net/irda/irtty-sir.c @@ -533,7 +533,7 @@ static void irtty_close(struct tty_struct *tty) /* ------------------------------------------------------- */ -static struct tty_ldisc irda_ldisc = { +static struct tty_ldisc_ops irda_ldisc = { .magic = TTY_LDISC_MAGIC, .name = "irda", .flags = 0, diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c index f1a52def124..451bdb57d6f 100644 --- 
a/drivers/net/ppp_async.c +++ b/drivers/net/ppp_async.c @@ -378,7 +378,7 @@ ppp_asynctty_wakeup(struct tty_struct *tty) } -static struct tty_ldisc ppp_ldisc = { +static struct tty_ldisc_ops ppp_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "ppp", diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c index b8f0369a71e..801d8f99d47 100644 --- a/drivers/net/ppp_synctty.c +++ b/drivers/net/ppp_synctty.c @@ -418,7 +418,7 @@ ppp_sync_wakeup(struct tty_struct *tty) } -static struct tty_ldisc ppp_sync_ldisc = { +static struct tty_ldisc_ops ppp_sync_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "pppsync", diff --git a/drivers/net/slip.c b/drivers/net/slip.c index 84af68fdb6c..1d58991d395 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -1301,7 +1301,7 @@ static int sl_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) #endif /* VSV changes end */ -static struct tty_ldisc sl_ldisc = { +static struct tty_ldisc_ops sl_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "slip", diff --git a/drivers/net/wan/pc300_tty.c b/drivers/net/wan/pc300_tty.c index e03eef2f228..c2c10c63226 100644 --- a/drivers/net/wan/pc300_tty.c +++ b/drivers/net/wan/pc300_tty.c @@ -688,9 +688,9 @@ static void cpc_tty_rx_work(struct work_struct *work) if (cpc_tty->tty) { ld = tty_ldisc_ref(cpc_tty->tty); if (ld) { - if (ld->receive_buf) { + if (ld->ops->receive_buf) { CPC_TTY_DBG("%s: call line disc. receive_buf\n",cpc_tty->name); - ld->receive_buf(cpc_tty->tty, (char *)(buf->data), &flags, buf->size); + ld->ops->receive_buf(cpc_tty->tty, (char *)(buf->data), &flags, buf->size); } tty_ldisc_deref(ld); } diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c index 069f8bb0a99..2a6c7a60756 100644 --- a/drivers/net/wan/x25_asy.c +++ b/drivers/net/wan/x25_asy.c @@ -754,7 +754,7 @@ static void x25_asy_setup(struct net_device *dev) dev->flags = IFF_NOARP; } -static struct tty_ldisc x25_ldisc = { +static struct tty_ldisc_ops x25_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "X.25", diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 21f490f5d65..d153946d6d1 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -136,54 +136,6 @@ static const struct file_operations proc_tty_drivers_operations = { .release = seq_release, }; -static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) -{ - return (*pos < NR_LDISCS) ? pos : NULL; -} - -static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return (*pos < NR_LDISCS) ? pos : NULL; -} - -static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) -{ -} - -static int tty_ldiscs_seq_show(struct seq_file *m, void *v) -{ - int i = *(loff_t *)v; - struct tty_ldisc *ld; - - ld = tty_ldisc_get(i); - if (ld == NULL) - return 0; - seq_printf(m, "%-10s %2d\n", ld->name ? 
ld->name : "???", i); - tty_ldisc_put(i); - return 0; -} - -static const struct seq_operations tty_ldiscs_seq_ops = { - .start = tty_ldiscs_seq_start, - .next = tty_ldiscs_seq_next, - .stop = tty_ldiscs_seq_stop, - .show = tty_ldiscs_seq_show, -}; - -static int proc_tty_ldiscs_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &tty_ldiscs_seq_ops); -} - -static const struct file_operations tty_ldiscs_proc_fops = { - .owner = THIS_MODULE, - .open = proc_tty_ldiscs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/ diff --git a/include/linux/tty.h b/include/linux/tty.h index 324a3b231d4..013711ea738 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -185,6 +185,7 @@ struct tty_struct { struct tty_driver *driver; const struct tty_operations *ops; int index; + /* The ldisc objects are protected by tty_ldisc_lock at the moment */ struct tty_ldisc ldisc; struct mutex termios_mutex; spinlock_t ctrl_lock; @@ -289,7 +290,7 @@ extern void tty_wait_until_sent(struct tty_struct * tty, long timeout); extern int tty_check_change(struct tty_struct * tty); extern void stop_tty(struct tty_struct * tty); extern void start_tty(struct tty_struct * tty); -extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc); +extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc); extern int tty_unregister_ldisc(int disc); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); @@ -330,9 +331,7 @@ extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b); extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *); extern void tty_ldisc_deref(struct tty_ldisc *); extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *); - -extern struct tty_ldisc *tty_ldisc_get(int); -extern void tty_ldisc_put(int); +extern const struct file_operations tty_ldiscs_proc_fops; extern void tty_wakeup(struct tty_struct *tty); extern void tty_ldisc_flush(struct tty_struct *tty); @@ -354,7 +353,7 @@ extern int tty_write_lock(struct tty_struct *tty, int ndelay); /* n_tty.c */ -extern struct tty_ldisc tty_ldisc_N_TTY; +extern struct tty_ldisc_ops tty_ldisc_N_TTY; /* tty_audit.c */ #ifdef CONFIG_AUDIT diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 6226504d910..40f38d89677 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -104,7 +104,7 @@ #include #include -struct tty_ldisc { +struct tty_ldisc_ops { int magic; char *name; int num; @@ -142,6 +142,11 @@ struct tty_ldisc { int refcount; }; +struct tty_ldisc { + struct tty_ldisc_ops *ops; + int refcount; +}; + #define TTY_LDISC_MAGIC 0x5403 #define LDISC_FLAG_DEFINED 0x00000001 diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index c9191871c1e..0a387f2eb7a 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -617,14 +617,7 @@ static void rfcomm_tty_wakeup(unsigned long arg) return; BT_DBG("dev %p tty %p", dev, tty); - - if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && tty->ldisc.write_wakeup) - (tty->ldisc.write_wakeup)(tty); - - wake_up_interruptible(&tty->write_wait); -#ifdef SERIAL_HAVE_POLL_WAIT - wake_up_interruptible(&tty->poll_wait); -#endif + tty_wakeup(tty); } static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) @@ -1005,9 +998,7 @@ static void rfcomm_tty_flush_buffer(struct tty_struct 
*tty) return; skb_queue_purge(&dev->dlc->tx_queue); - - if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && tty->ldisc.write_wakeup) - tty->ldisc.write_wakeup(tty); + tty_wakeup(tty); } static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch) diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 76c3057d017..e4e2caeb9d8 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -650,12 +650,7 @@ static void ircomm_tty_do_softint(struct work_struct *work) } /* Check if user (still) wants to be waken up */ - if ((tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && - tty->ldisc.write_wakeup) - { - (tty->ldisc.write_wakeup)(tty); - } - wake_up_interruptible(&tty->write_wait); + tty_wakeup(tty); } /* @@ -1141,6 +1136,7 @@ static int ircomm_tty_data_indication(void *instance, void *sap, struct sk_buff *skb) { struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + struct tty_ldisc *ld; IRDA_DEBUG(2, "%s()\n", __func__ ); @@ -1173,7 +1169,11 @@ static int ircomm_tty_data_indication(void *instance, void *sap, * involve the flip buffers, since we are not running in an interrupt * handler */ - self->tty->ldisc.receive_buf(self->tty, skb->data, NULL, skb->len); + + ld = tty_ldisc_ref(self->tty); + if (ld) + ld->ops->receive_buf(self->tty, skb->data, NULL, skb->len); + tty_ldisc_deref(ld); /* No need to kfree_skb - see ircomm_ttp_data_indication() */ -- cgit v1.2.3-70-g09d2 From 6143b599700f7d6d7961e2de88f1486b2b19b1f2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 21 May 2008 12:52:33 -0700 Subject: device create: coda: convert device_create to device_create_drvdata device_create() is race-prone, so use the race-free device_create_drvdata() instead as device_create() is going away. Cc: Jan Harkes Signed-off-by: Greg Kroah-Hartman --- fs/coda/psdev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index e3eb3556622..40c36f7352a 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,8 +362,9 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - device_create(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i); + device_create_drvdata(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), + NULL, "cfs%d", i); coda_sysctl_init(); goto out; -- cgit v1.2.3-70-g09d2 From aab0de245150c09e61c30962feb16aacde508dc3 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 2 May 2008 06:02:41 +0200 Subject: driver core: remove KOBJ_NAME_LEN define Kobjects do not have a limit in name size since a while, so stop pretending that they do. 
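A hedged sketch of the conversion pattern this series applies everywhere (the foo_adapter structure and names are made up, not from the patch): buffers that used to be dimensioned with KOBJ_NAME_LEN now carry an explicit size, and the snprintf() bound switches to sizeof() so the format call can never drift away from the declaration.

	#include <linux/kernel.h>	/* snprintf() */

	/* Hypothetical example structure -- not part of the patch itself. */
	struct foo_adapter {
		/*
		 * Previously: char work_q_name[KOBJ_NAME_LEN]; with the
		 * snprintf() below also passing KOBJ_NAME_LEN (which was 20).
		 */
		char work_q_name[20];
		int id;
	};

	static void foo_set_workqueue_name(struct foo_adapter *adapter)
	{
		/* sizeof() keeps the bound tied to the buffer declaration. */
		snprintf(adapter->work_q_name, sizeof(adapter->work_q_name),
			 "foo_wq_%d", adapter->id);
	}
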
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/arm/plat-omap/mailbox.c | 2 +- arch/sparc64/kernel/vio.c | 2 +- drivers/message/fusion/mptbase.c | 3 ++- drivers/message/fusion/mptbase.h | 4 ++-- drivers/message/fusion/mptfc.c | 4 ++-- drivers/pci/hotplug/acpiphp.h | 4 ++-- drivers/scsi/hosts.c | 4 ++-- drivers/scsi/scsi_transport_fc.c | 9 +++++---- drivers/scsi/scsi_transport_iscsi.c | 4 ++-- fs/partitions/check.c | 2 +- include/linux/device.h | 3 +-- include/linux/kobject.h | 1 - include/linux/spi/spi.h | 2 +- include/scsi/scsi_host.h | 2 +- include/scsi/scsi_transport_fc.h | 4 ++-- include/scsi/scsi_transport_iscsi.h | 2 +- 16 files changed, 26 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c index 6f33f58bca4..848fdcafaa2 100644 --- a/arch/arm/plat-omap/mailbox.c +++ b/arch/arm/plat-omap/mailbox.c @@ -334,7 +334,7 @@ static int omap_mbox_init(struct omap_mbox *mbox) } mbox->dev.class = &omap_mbox_class; - strlcpy(mbox->dev.bus_id, mbox->name, KOBJ_NAME_LEN); + strlcpy(mbox->dev.bus_id, mbox->name, BUS_ID_SIZE); dev_set_drvdata(&mbox->dev, mbox); ret = device_register(&mbox->dev); diff --git a/arch/sparc64/kernel/vio.c b/arch/sparc64/kernel/vio.c index e78b3517940..ecbb8b618b8 100644 --- a/arch/sparc64/kernel/vio.c +++ b/arch/sparc64/kernel/vio.c @@ -224,7 +224,7 @@ static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp, if (!strcmp(type, "domain-services-port")) bus_id_name = "ds"; - if (strlen(bus_id_name) >= KOBJ_NAME_LEN - 4) { + if (strlen(bus_id_name) >= BUS_ID_SIZE - 4) { printk(KERN_ERR "VIO: bus_id_name [%s] is too long.\n", bus_id_name); return NULL; diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 75e599b85b6..34402c47027 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -1670,7 +1670,8 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id) INIT_DELAYED_WORK(&ioc->fault_reset_work, mpt_fault_reset_work); spin_lock_init(&ioc->fault_reset_work_lock); - snprintf(ioc->reset_work_q_name, KOBJ_NAME_LEN, "mpt_poll_%d", ioc->id); + snprintf(ioc->reset_work_q_name, sizeof(ioc->reset_work_q_name), + "mpt_poll_%d", ioc->id); ioc->reset_work_q = create_singlethread_workqueue(ioc->reset_work_q_name); if (!ioc->reset_work_q) { diff --git a/drivers/message/fusion/mptbase.h b/drivers/message/fusion/mptbase.h index 6adab648dbb..dff048cfa10 100644 --- a/drivers/message/fusion/mptbase.h +++ b/drivers/message/fusion/mptbase.h @@ -707,12 +707,12 @@ typedef struct _MPT_ADAPTER u8 fc_link_speed[2]; spinlock_t fc_rescan_work_lock; struct work_struct fc_rescan_work; - char fc_rescan_work_q_name[KOBJ_NAME_LEN]; + char fc_rescan_work_q_name[20]; struct workqueue_struct *fc_rescan_work_q; struct scsi_cmnd **ScsiLookup; spinlock_t scsi_lookup_lock; - char reset_work_q_name[KOBJ_NAME_LEN]; + char reset_work_q_name[20]; struct workqueue_struct *reset_work_q; struct delayed_work fault_reset_work; spinlock_t fault_reset_work_lock; diff --git a/drivers/message/fusion/mptfc.c b/drivers/message/fusion/mptfc.c index fc31ca6829d..b36cae9ec6d 100644 --- a/drivers/message/fusion/mptfc.c +++ b/drivers/message/fusion/mptfc.c @@ -1326,8 +1326,8 @@ mptfc_probe(struct pci_dev *pdev, const struct pci_device_id *id) /* initialize workqueue */ - snprintf(ioc->fc_rescan_work_q_name, KOBJ_NAME_LEN, "mptfc_wq_%d", - sh->host_no); + snprintf(ioc->fc_rescan_work_q_name, sizeof(ioc->fc_rescan_work_q_name), + "mptfc_wq_%d", 
sh->host_no); ioc->fc_rescan_work_q = create_singlethread_workqueue(ioc->fc_rescan_work_q_name); if (!ioc->fc_rescan_work_q) diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h index eecf7cbf413..5a58b075dd8 100644 --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@ -36,7 +36,7 @@ #define _ACPIPHP_H #include -#include /* for KOBJ_NAME_LEN */ +#include #include #include @@ -51,7 +51,7 @@ #define warn(format, arg...) printk(KERN_WARNING "%s: " format, MY_NAME , ## arg) /* name size which is used for entries in pcihpfs */ -#define SLOT_NAME_SIZE KOBJ_NAME_LEN /* {_SUN} */ +#define SLOT_NAME_SIZE 20 /* {_SUN} */ struct acpiphp_bridge; struct acpiphp_slot; diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 78dad28b70d..fed0b02ebc1 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -232,8 +232,8 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev) } if (shost->transportt->create_work_queue) { - snprintf(shost->work_q_name, KOBJ_NAME_LEN, "scsi_wq_%d", - shost->host_no); + snprintf(shost->work_q_name, sizeof(shost->work_q_name), + "scsi_wq_%d", shost->host_no); shost->work_q = create_singlethread_workqueue( shost->work_q_name); if (!shost->work_q) { diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 5fd64e70029..a272b9a2c86 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -417,15 +417,16 @@ static int fc_host_setup(struct transport_container *tc, struct device *dev, fc_host->next_vport_number = 0; fc_host->npiv_vports_inuse = 0; - snprintf(fc_host->work_q_name, KOBJ_NAME_LEN, "fc_wq_%d", - shost->host_no); + snprintf(fc_host->work_q_name, sizeof(fc_host->work_q_name), + "fc_wq_%d", shost->host_no); fc_host->work_q = create_singlethread_workqueue( fc_host->work_q_name); if (!fc_host->work_q) return -ENOMEM; - snprintf(fc_host->devloss_work_q_name, KOBJ_NAME_LEN, "fc_dl_%d", - shost->host_no); + snprintf(fc_host->devloss_work_q_name, + sizeof(fc_host->devloss_work_q_name), + "fc_dl_%d", shost->host_no); fc_host->devloss_work_q = create_singlethread_workqueue( fc_host->devloss_work_q_name); if (!fc_host->devloss_work_q) { diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 06748f318cd..043c3921164 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -247,8 +247,8 @@ static int iscsi_setup_host(struct transport_container *tc, struct device *dev, atomic_set(&ihost->nr_scans, 0); mutex_init(&ihost->mutex); - snprintf(ihost->scan_workq_name, KOBJ_NAME_LEN, "iscsi_scan_%d", - shost->host_no); + snprintf(ihost->scan_workq_name, sizeof(ihost->scan_workq_name), + "iscsi_scan_%d", shost->host_no); ihost->scan_workq = create_singlethread_workqueue( ihost->scan_workq_name); if (!ihost->scan_workq) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6149e4b58c8..efef715135d 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -401,7 +401,7 @@ void register_disk(struct gendisk *disk) disk->dev.parent = disk->driverfs_dev; disk->dev.devt = MKDEV(disk->major, disk->first_minor); - strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN); + strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); /* ewww... some of these buggers have / in the name... 
*/ s = strchr(disk->dev.bus_id, '/'); if (s) diff --git a/include/linux/device.h b/include/linux/device.h index 0e1d24c2ed4..fba1bb0d175 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -29,8 +29,7 @@ /* DEVICE_NAME_HALF is really less than half to accommodate slop */ #define DEVICE_NAME_HALF __stringify(20) #define DEVICE_ID_SIZE 32 -#define BUS_ID_SIZE KOBJ_NAME_LEN - +#define BUS_ID_SIZE 20 struct device; struct device_driver; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 39e709f88aa..d542faa6cb4 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -26,7 +26,6 @@ #include #include -#define KOBJ_NAME_LEN 20 #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 32 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 387e428f1cd..b9a76c97208 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -733,7 +733,7 @@ struct spi_board_info { * controller_data goes to spi_device.controller_data, * irq is copied too */ - char modalias[KOBJ_NAME_LEN]; + char modalias[32]; const void *platform_data; void *controller_data; int irq; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 1834fdfe82a..a594bac4a77 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -623,7 +623,7 @@ struct Scsi_Host { /* * Optional work queue to be utilized by the transport */ - char work_q_name[KOBJ_NAME_LEN]; + char work_q_name[20]; struct workqueue_struct *work_q; /* diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h index 06f72bab9df..878373c32ef 100644 --- a/include/scsi/scsi_transport_fc.h +++ b/include/scsi/scsi_transport_fc.h @@ -489,9 +489,9 @@ struct fc_host_attrs { u16 npiv_vports_inuse; /* work queues for rport state manipulation */ - char work_q_name[KOBJ_NAME_LEN]; + char work_q_name[20]; struct workqueue_struct *work_q; - char devloss_work_q_name[KOBJ_NAME_LEN]; + char devloss_work_q_name[20]; struct workqueue_struct *devloss_work_q; }; diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index f5444e033cc..8b6c91df4c7 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -198,7 +198,7 @@ struct iscsi_cls_host { atomic_t nr_scans; struct mutex mutex; struct workqueue_struct *scan_workq; - char scan_workq_name[KOBJ_NAME_LEN]; + char scan_workq_name[20]; }; extern void iscsi_host_for_each_session(struct Scsi_Host *shost, -- cgit v1.2.3-70-g09d2 From 93265d13ea5c3ec5f61a8009407fbe046ce6b7c0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 16 Jun 2008 13:46:47 +0200 Subject: sysfs: don't call notify_change sysfs_chmod_file() calls notify_change() to change the permission bits on a sysfs file. Replace with explicit call to sysfs_setattr() and fsnotify_change(). This is equivalent, except that security_inode_setattr() is not called. This function is called by drivers, so the security checks do not make any sense. 
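Callers of sysfs_chmod_file() need no change. As a rough sketch (the "foo" attribute, foo_show() and the owning kobject are made up for illustration, and the attribute is assumed to have already been created with sysfs_create_file()), a driver toggling an attribute's mode still just does:

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

/* hypothetical read-only attribute that the driver later opens up */
static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%d\n", 0);
}

static struct kobj_attribute foo_attr = __ATTR(foo, 0444, foo_show, NULL);

static void foo_make_writable(struct kobject *kobj)
{
	/* sysfs_chmod_file() now updates the inode via sysfs_setattr()
	 * plus fsnotify_change() internally; the caller is unchanged. */
	if (sysfs_chmod_file(kobj, &foo_attr.attr, 0644))
		printk(KERN_WARNING "foo: chmod of sysfs attribute failed\n");
}
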
Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e7735f643cd..3f07893ff89 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -585,9 +586,11 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - rc = notify_change(victim, &newattrs); + newattrs.ia_ctime = current_fs_time(inode->i_sb); + rc = sysfs_setattr(victim, &newattrs); if (rc == 0) { + fsnotify_change(victim, newattrs.ia_valid); mutex_lock(&sysfs_mutex); victim_sd->s_mode = newattrs.ia_mode; mutex_unlock(&sysfs_mutex); -- cgit v1.2.3-70-g09d2 From 9505e6375640fc61d92d36c8e9f25a6a218f3f57 Mon Sep 17 00:00:00 2001 From: Haavard Skinnemoen Date: Tue, 1 Jul 2008 15:14:51 +0200 Subject: debugfs: Implement debugfs_remove_recursive() debugfs_remove_recursive() will remove a dentry and all its children. Drivers can use this to zap their whole debugfs tree so that they don't need to keep track of every single debugfs dentry they created. It may fail to remove the whole tree in certain cases: sh-3.2# rmmod atmel-mci < /sys/kernel/debug/mmc0/ios/clock mmc0: card b368 removed atmel_mci atmel_mci.0: Lost dma0chan1, falling back to PIO sh-3.2# ls /sys/kernel/debug/mmc0/ ios But I'm not sure if that case can be handled in any sane manner. Signed-off-by: Haavard Skinnemoen Cc: Pierre Ossman Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 114 ++++++++++++++++++++++++++++++++++++++++-------- include/linux/debugfs.h | 4 ++ 2 files changed, 100 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e9602d85c11..08e28c9bb41 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -309,6 +309,31 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, } EXPORT_SYMBOL_GPL(debugfs_create_symlink); +static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) +{ + int ret = 0; + + if (debugfs_positive(dentry)) { + if (dentry->d_inode) { + dget(dentry); + switch (dentry->d_inode->i_mode & S_IFMT) { + case S_IFDIR: + ret = simple_rmdir(parent->d_inode, dentry); + break; + case S_IFLNK: + kfree(dentry->d_inode->i_private); + /* fall through */ + default: + simple_unlink(parent->d_inode, dentry); + break; + } + if (!ret) + d_delete(dentry); + dput(dentry); + } + } +} + /** * debugfs_remove - removes a file or directory from the debugfs filesystem * @dentry: a pointer to a the dentry of the file or directory to be @@ -325,7 +350,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_symlink); void debugfs_remove(struct dentry *dentry) { struct dentry *parent; - int ret = 0; if (!dentry) return; @@ -335,29 +359,83 @@ void debugfs_remove(struct dentry *dentry) return; mutex_lock(&parent->d_inode->i_mutex); - if (debugfs_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - break; - case S_IFLNK: - kfree(dentry->d_inode->i_private); - /* fall through */ - default: - simple_unlink(parent->d_inode, dentry); + __debugfs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove); 
+ +/** + * debugfs_remove_recursive - recursively removes a directory + * @dentry: a pointer to a the dentry of the directory to be removed. + * + * This function recursively removes a directory tree in debugfs that + * was previously created with a call to another debugfs function + * (like debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove_recursive(struct dentry *dentry) +{ + struct dentry *child; + struct dentry *parent; + + if (!dentry) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + parent = dentry; + mutex_lock(&parent->d_inode->i_mutex); + + while (1) { + /* + * When all dentries under "parent" has been removed, + * walk up the tree until we reach our starting point. + */ + if (list_empty(&parent->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + if (parent == dentry) break; - } - if (!ret) - d_delete(dentry); - dput(dentry); + parent = parent->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + } + child = list_entry(parent->d_subdirs.next, struct dentry, + d_u.d_child); + + /* + * If "child" isn't empty, walk down the tree and + * remove all its descendants first. + */ + if (!list_empty(&child->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + parent = child; + mutex_lock(&parent->d_inode->i_mutex); + continue; } + __debugfs_remove(child, parent); + if (parent->d_subdirs.next == &child->d_u.d_child) { + /* + * Avoid infinite loop if we fail to remove + * one dentry. + */ + mutex_unlock(&parent->d_inode->i_mutex); + break; + } + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } + + parent = dentry->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + __debugfs_remove(dentry, parent); mutex_unlock(&parent->d_inode->i_mutex); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove); +EXPORT_SYMBOL_GPL(debugfs_remove_recursive); /** * debugfs_rename - rename a file/directory in the debugfs filesystem diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 32755cdf68d..e1a6c046cea 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -44,6 +44,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *dest); void debugfs_remove(struct dentry *dentry); +void debugfs_remove_recursive(struct dentry *dentry); struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name); @@ -101,6 +102,9 @@ static inline struct dentry *debugfs_create_symlink(const char *name, static inline void debugfs_remove(struct dentry *dentry) { } +static inline void debugfs_remove_recursive(struct dentry *dentry) +{ } + static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, char *new_name) { -- cgit v1.2.3-70-g09d2 From 36ce6dad6e3cb3f050ed41e0beac0070d2062b25 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 10 Jun 2008 11:09:08 +0200 Subject: driver core: Suppress sysfs warnings for device_rename(). driver core: Suppress sysfs warnings for device_rename(). Renaming network devices to an already existing name is not something we want sysfs to print a scary warning for, since the callers can deal with this correctly. So let's introduce sysfs_create_link_nowarn() which gets rid of the common warning. 
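A rough sketch of the intended calling pattern (add_alias_link(), its arguments and the -EEXIST policy below are hypothetical, not part of this patch):

#include <linux/errno.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static int add_alias_link(struct kobject *parent, struct kobject *target,
			  const char *alias)
{
	int error;

	/* No warning is printed if "alias" already exists; the caller
	 * decides what -EEXIST means. */
	error = sysfs_create_link_nowarn(parent, target, alias);
	if (error == -EEXIST)
		return 0;	/* an existing alias is fine for this caller */
	return error;
}
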
Signed-off-by: Cornelia Huck Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 9 +++++---- fs/sysfs/dir.c | 37 +++++++++++++++++++++++++++++++++++-- fs/sysfs/symlink.c | 41 +++++++++++++++++++++++++++++++++-------- fs/sysfs/sysfs.h | 1 + include/linux/sysfs.h | 10 ++++++++++ 5 files changed, 84 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/drivers/base/core.c b/drivers/base/core.c index c05b1159023..7d5c63c81a5 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1345,8 +1345,9 @@ int device_rename(struct device *dev, char *new_name) if (old_class_name) { new_class_name = make_class_name(dev->class->name, &dev->kobj); if (new_class_name) { - error = sysfs_create_link(&dev->parent->kobj, - &dev->kobj, new_class_name); + error = sysfs_create_link_nowarn(&dev->parent->kobj, + &dev->kobj, + new_class_name); if (error) goto out; sysfs_remove_link(&dev->parent->kobj, old_class_name); @@ -1354,8 +1355,8 @@ int device_rename(struct device *dev, char *new_name) } #else if (dev->class) { - error = sysfs_create_link(&dev->class->p->class_subsys.kobj, - &dev->kobj, dev->bus_id); + error = sysfs_create_link_nowarn(&dev->class->p->class_subsys.kobj, + &dev->kobj, dev->bus_id); if (error) goto out; sysfs_remove_link(&dev->class->p->class_subsys.kobj, diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 8c0e4b92574..c1a7efb310b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -398,7 +398,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, } /** - * sysfs_add_one - add sysfs_dirent to parent + * __sysfs_add_one - add sysfs_dirent to parent without warning * @acxt: addrm context to use * @sd: sysfs_dirent to be added * @@ -417,7 +417,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, * 0 on success, -EEXIST if entry with the given name already * exists. */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) return -EEXIST; @@ -434,6 +434,39 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) return 0; } +/** + * sysfs_add_one - add sysfs_dirent to parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory and link into the + * children list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + int ret; + + ret = __sysfs_add_one(acxt, sd); + if (ret == -EEXIST) { + printk(KERN_WARNING "sysfs: duplicate filename '%s' " + "can not be created\n", sd->s_name); + WARN_ON(1); + } + return ret; +} + /** * sysfs_remove_one - remove sysfs_dirent from parent * @acxt: addrm context to use diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 817f5966edc..a3ba217fbe7 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -19,13 +19,8 @@ #include "sysfs.h" -/** - * sysfs_create_link - create symlink between two objects. - * @kobj: object whose directory we're creating the link in. - * @target: object we're pointing to. - * @name: name of the symlink. 
- */ -int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) { struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; @@ -65,7 +60,10 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - error = sysfs_add_one(&acxt, sd); + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); sysfs_addrm_finish(&acxt); if (error) @@ -79,6 +77,33 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char return error; } +/** + * sysfs_create_link - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 1); +} + +/** + * sysfs_create_link_nowarn - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + * + * This function does the same as sysf_create_link(), but it + * doesn't warn if the link already exists. + */ +int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 0); +} + /** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ce4e15f8aae..a5db496f71c 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -107,6 +107,7 @@ struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); void sysfs_put_active_two(struct sysfs_dirent *sd); void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *parent_sd); +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 7858eac40aa..37fa24152bd 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -101,6 +101,9 @@ void sysfs_remove_bin_file(struct kobject *kobj, struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); +int __must_check sysfs_create_link_nowarn(struct kobject *kobj, + struct kobject *target, + const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, @@ -180,6 +183,13 @@ static inline int sysfs_create_link(struct kobject *kobj, return 0; } +static inline int sysfs_create_link_nowarn(struct kobject *kobj, + struct kobject *target, + const char *name) +{ + return 0; +} + static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } -- cgit v1.2.3-70-g09d2 From 04e1e0cccade330ab3715ce59234f7e3b087e246 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 22 Jul 2008 13:04:18 +0000 Subject: [CIFS] Fix compiler warning on 64-bit Signed-off-by: Steve French --- fs/cifs/asn1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff 
--git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index f58e41d3ba4..42765465701 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -400,7 +400,7 @@ asn1_oid_decode(struct asn1_ctx *ctx, size = eoc - ctx->pointer + 1; /* first subid actually encodes first two subids */ - if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) return 0; *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); -- cgit v1.2.3-70-g09d2 From 651910874633a75f4a726d44e449be0a56b4b2e0 Mon Sep 17 00:00:00 2001 From: John Reiser Date: Mon, 21 Jul 2008 14:21:32 -0700 Subject: execve filename: document and export via auxiliary vector The Linux kernel puts the filename argument of execve() into the new address space. Many developers are surprised to learn this. Those who know and could use it, object "But it's not documented." Those who want to use it dislike the expression (char *)(1+ strlen(env[-1+ n_env]) + env[-1+ n_env]) because it requires locating the last original environment variable, and assumes that the filename follows the characters. This patch documents the insertion of the filename, and makes it easier to find by adding a new tag AT_EXECFN in the ElfXX_auxv_t; see . In many cases readlink("/proc/self/exe",) gives the same answer. But if all the original pages get unmapped, then the kernel erases the symlink for /proc/self/exe. This can happen when a program decompressor does a good job of cleaning up after uncompressing directly to memory, so that the address space of the target program looks the same as if compression had never happened. One example is http://upx.sourceforge.net . One notable use of the underlying concept (what path containED the executable) is glibc expanding $ORIGIN in DT_RUNPATH. In practice for the near term, it may be a good idea for user-mode code to use both /proc/self/exe and AT_EXECFN as fall-back methods for each other. /proc/self/exe can fail due to unmapping, AT_EXECFN can fail because it won't be present on non-new systems. The auxvec or {AT_EXECFN}.d_val also can get overwritten, although in nearly all cases this would be the result of a bug. The runtime cost is one NEW_AUX_ENT using two words of stack space. The underlying value is maintained already as bprm->exec; setup_arg_pages() in fs/exec.c slides it for stack_shift, etc. 
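For illustration only (not part of the patch): a user-space program on a 64-bit ELF system can locate AT_EXECFN by walking the auxiliary vector that sits just past the environment block, without relying on any libc helper:

#include <elf.h>
#include <stdio.h>

#ifndef AT_EXECFN
#define AT_EXECFN 31		/* filename of program */
#endif

int main(int argc, char **argv, char **envp)
{
	char **p = envp;
	Elf64_auxv_t *auxv;

	while (*p)		/* skip past the environment pointers */
		p++;
	/* the auxiliary vector starts right after the terminating NULL */
	for (auxv = (Elf64_auxv_t *)(p + 1); auxv->a_type != AT_NULL; auxv++)
		if (auxv->a_type == AT_EXECFN)
			printf("AT_EXECFN = %s\n",
			       (char *)auxv->a_un.a_val);
	return 0;
}
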
Signed-off-by: John Reiser Cc: Roland McGrath Cc: Jakub Jelinek Cc: Ulrich Drepper Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 1 + include/linux/auxvec.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d48ff5f370f..639d2d8b571 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -204,6 +204,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_GID, tsk->gid); NEW_AUX_ENT(AT_EGID, tsk->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform); diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index ad895455ab7..0da17d14fd1 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -26,8 +26,10 @@ #define AT_SECURE 23 /* secure mode boolean */ +#define AT_EXECFN 31 /* filename of program */ #ifdef __KERNEL__ -#define AT_VECTOR_SIZE_BASE (14 + 2) /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 17 /* NEW_AUX_ENT entries in auxiliary table */ + /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif #endif /* _LINUX_AUXVEC_H */ -- cgit v1.2.3-70-g09d2 From ee1e6ab6056a8b9c19377257002da98b83819531 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 21 Jul 2008 14:21:36 -0700 Subject: proc: fix /proc/*/pagemap some more struct pagemap_walk was placed on stack, some hooks are initialized, the rest (->pgd_entry, ->pud_entry, ->pte_entry) are valid but junk. Reported-by: Eric Sesterhenn Signed-off-by: Alexey Dobriyan Cc: "Vegard Nossum" Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 164bd9f9ede..7546a918f79 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -636,7 +636,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, struct pagemapread pm; int pagecount; int ret = -ESRCH; - struct mm_walk pagemap_walk; + struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; -- cgit v1.2.3-70-g09d2 From 8086cd451f08f4c0f9693fc66d87754bbd18cfba Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 22 Jul 2008 14:19:19 -0700 Subject: netns: make get_proc_net() static get_proc_net() can now become static. Signed-off-by: Adrian Bunk Acked-by: Pavel Emelyanov Signed-off-by: David S. 
Miller --- fs/proc/proc_net.c | 11 +++++------ include/linux/proc_fs.h | 2 -- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index b224a28e0c1..7bc296f424a 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -27,6 +27,11 @@ #include "internal.h" +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + int seq_open_net(struct inode *ino, struct file *f, const struct seq_operations *ops, int size) { @@ -185,12 +190,6 @@ void proc_net_remove(struct net *net, const char *name) } EXPORT_SYMBOL_GPL(proc_net_remove); -struct net *get_proc_net(const struct inode *inode) -{ - return maybe_get_net(PDE_NET(PDE(inode))); -} -EXPORT_SYMBOL_GPL(get_proc_net); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index fff1d27ddb4..15a9eaf4a80 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -305,8 +305,6 @@ static inline struct net *PDE_NET(struct proc_dir_entry *pde) return pde->parent->data; } -struct net *get_proc_net(const struct inode *inode); - struct proc_maps_private { struct pid *pid; struct task_struct *task; -- cgit v1.2.3-70-g09d2 From 09e50d55a9490e9b7a6fdfbf8fc078924b25ecb5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 23 Jul 2008 10:11:19 -0400 Subject: lockdep: annotate cifs in-kernel sockets Put CIFS sockets in their own class to avoid some lockdep warnings. CIFS sockets are not exposed to user-space, and so are not subject to the same deadlock scenarios. A similar change was made a couple of years ago for RPC sockets in commit ed07536ed6731775219c1df7fa26a7588753e693. This patch should prevent lockdep false-positives like this one: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.18-98.el5.jtltest.38.bz456320.1debug #1 ------------------------------------------------------- test5/2483 is trying to acquire lock: (sk_lock-AF_INET){--..}, at: [] tcp_sendmsg+0x1c/0xb2f but task is already holding lock: (&inode->i_alloc_sem){--..}, at: [] notify_change+0xf5/0x2e0 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (&inode->i_alloc_sem){--..}: [] __lock_acquire+0x9a9/0xadf [] lock_acquire+0x55/0x70 [] notify_change+0xf5/0x2e0 [] down_write+0x3c/0x68 [] notify_change+0xf5/0x2e0 [] do_truncate+0x50/0x6b [] get_write_access+0x40/0x46 [] may_open+0x1d3/0x22e [] open_namei+0x2c6/0x6dd [] do_filp_open+0x1c/0x38 [] _spin_unlock+0x17/0x20 [] get_unused_fd+0xf9/0x107 [] do_sys_open+0x44/0xbe [] system_call+0x7e/0x83 [] 0xffffffffffffffff -> #2 (&sysfs_inode_imutex_key){--..}: [] __lock_acquire+0x9a9/0xadf [] create_dir+0x26/0x1d7 [] lock_acquire+0x55/0x70 [] create_dir+0x26/0x1d7 [] mutex_lock_nested+0x104/0x29c [] __lock_acquire+0x9ca/0xadf [] create_dir+0x26/0x1d7 [] sysfs_create_dir+0x58/0x76 [] kobject_add+0xdb/0x198 [] class_device_add+0xb2/0x465 [] kobject_get+0x12/0x17 [] register_netdevice+0x270/0x33e [] register_netdev+0x59/0x67 [] net_olddevs_init+0xb/0xac [] init+0x1f9/0x2fc [] _spin_unlock_irq+0x24/0x27 [] trace_hardirqs_on_thunk+0x35/0x37 [] child_rip+0xa/0x11 [] _spin_unlock_irq+0x24/0x27 [] restore_args+0x0/0x30 [] acpi_ds_init_one_object+0x0/0x80 [] init+0x0/0x2fc [] child_rip+0x0/0x11 [] 0xffffffffffffffff -> #1 (rtnl_mutex){--..}: [] __lock_acquire+0x9a9/0xadf [] ip_mc_leave_group+0x23/0xb7 [] lock_acquire+0x55/0x70 [] ip_mc_leave_group+0x23/0xb7 [] mutex_lock_nested+0x104/0x29c [] ip_mc_leave_group+0x23/0xb7 [] do_ip_setsockopt+0x6d1/0x9bf [] lock_release_holdtime+0x27/0x48 [] lock_release_holdtime+0x27/0x48 [] do_page_fault+0x503/0x835 [] socket_has_perm+0x5b/0x68 [] ip_setsockopt+0x22/0x78 [] sys_setsockopt+0x91/0xb7 [] tracesys+0xd5/0xdf [] 0xffffffffffffffff -> #0 (sk_lock-AF_INET){--..}: [] print_stack_trace+0x59/0x68 [] __lock_acquire+0x8bf/0xadf [] lock_acquire+0x55/0x70 [] tcp_sendmsg+0x1c/0xb2f [] lock_sock+0xd4/0xe4 [] _local_bh_enable+0xcb/0xe0 [] restore_args+0x0/0x30 [] tcp_sendmsg+0x1c/0xb2f [] sock_sendmsg+0xf3/0x110 [] autoremove_wake_function+0x0/0x2e [] kernel_text_address+0x1a/0x26 [] dump_trace+0x211/0x23a [] find_usage_backwards+0x5f/0x88 [] MD5Final+0xaf/0xc2 [cifs] [] cifs_calculate_signature+0x55/0x69 [cifs] [] kernel_sendmsg+0x35/0x47 [] smb_send+0xa3/0x151 [cifs] [] SendReceive+0x1a2/0x448 [cifs] [] __lock_acquire+0x95c/0xadf [] CIFSSMBSetEOF+0x20d/0x25b [cifs] [] cifs_set_file_size+0x110/0x3b7 [cifs] [] cifs_setattr+0x3b2/0x6f6 [cifs] [] notify_change+0xf5/0x2e0 [] notify_change+0x145/0x2e0 [] do_truncate+0x50/0x6b [] get_write_access+0x40/0x46 [] may_open+0x1d3/0x22e [] open_namei+0x2c6/0x6dd [] do_filp_open+0x1c/0x38 [] _spin_unlock+0x17/0x20 [] get_unused_fd+0xf9/0x107 [] do_sys_open+0x44/0xbe [] tracesys+0xd5/0xdf [] 0xffffffffffffffff other info that might help us debug this: 2 locks held by test5/2483: #0: (&inode->i_mutex){--..}, at: [] do_truncate+0x45/0x6b #1: (&inode->i_alloc_sem){--..}, at: [] notify_change+0xf5/0x2e0 stack backtrace: Call Trace: [] print_circular_bug_tail+0x65/0x6e [] print_stack_trace+0x59/0x68 [] __lock_acquire+0x8bf/0xadf [] lock_acquire+0x55/0x70 [] tcp_sendmsg+0x1c/0xb2f [] lock_sock+0xd4/0xe4 [] _local_bh_enable+0xcb/0xe0 [] restore_args+0x0/0x30 [] tcp_sendmsg+0x1c/0xb2f [] sock_sendmsg+0xf3/0x110 [] autoremove_wake_function+0x0/0x2e [] kernel_text_address+0x1a/0x26 [] dump_trace+0x211/0x23a [] find_usage_backwards+0x5f/0x88 [] :cifs:MD5Final+0xaf/0xc2 [] :cifs:cifs_calculate_signature+0x55/0x69 [] kernel_sendmsg+0x35/0x47 [] :cifs:smb_send+0xa3/0x151 [] :cifs:SendReceive+0x1a2/0x448 [] __lock_acquire+0x95c/0xadf [] :cifs:CIFSSMBSetEOF+0x20d/0x25b [] 
:cifs:cifs_set_file_size+0x110/0x3b7 [] :cifs:cifs_setattr+0x3b2/0x6f6 [] notify_change+0xf5/0x2e0 [] notify_change+0x145/0x2e0 [] do_truncate+0x50/0x6b [] get_write_access+0x40/0x46 [] may_open+0x1d3/0x22e [] open_namei+0x2c6/0x6dd [] do_filp_open+0x1c/0x38 [] _spin_unlock+0x17/0x20 [] get_unused_fd+0xf9/0x107 [] do_sys_open+0x44/0xbe [] tracesys+0xd5/0xdf Signed-off-by: Jeff Layton Acked-by: Peter Zijlstra Signed-off-by: Steve French --- fs/cifs/connect.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e8fa46c7cff..64446272938 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1461,6 +1461,39 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, return rc; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key cifs_key[2]; +static struct lock_class_key cifs_slock_key[2]; + +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", + &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ + struct sock *sk = sock->sk; + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", + &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); +} +#else +static inline void +cifs_reclassify_socket4(struct socket *sock) +{ +} + +static inline void +cifs_reclassify_socket6(struct socket *sock) +{ +} +#endif + /* See RFC1001 section 14 on representation of Netbios names */ static void rfc1002mangle(char *target, char *source, unsigned int length) { @@ -1495,6 +1528,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, /* BB other socket options to set KEEPALIVE, NODELAY? */ cFYI(1, ("Socket created")); (*csocket)->sk->sk_allocation = GFP_NOFS; + cifs_reclassify_socket4(*csocket); } } @@ -1627,6 +1661,7 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket) /* BB other socket options to set KEEPALIVE, NODELAY? */ cFYI(1, ("ipv6 Socket created")); (*csocket)->sk->sk_allocation = GFP_NOFS; + cifs_reclassify_socket6(*csocket); } } -- cgit v1.2.3-70-g09d2 From 8efdbde647f542ce0d303273df7ad4157caa03d0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 23 Jul 2008 21:28:12 +0000 Subject: [CIFS] break ATTR_SIZE changes out into their own function Move the code that handles ATTR_SIZE changes to its own function. This makes for a smaller function and reduces the level of indentation. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 151 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 78 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2e904bd111c..46e54d39461 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1413,6 +1413,82 @@ out_busy: return -ETXTBSY; } +static int +cifs_set_file_size(struct inode *inode, struct iattr *attrs, + int xid, char *full_path) +{ + int rc; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + + /* + * To avoid spurious oplock breaks from server, in the case of + * inodes that we already have open, avoid doing path based + * setting of file size if we can do it by handle. 
+ * This keeps our caching token (oplock) and avoids timeouts + * when the local oplock break takes longer to flush + * writebehind data than the SMB timeout for the SetPathInfo + * request would allow + */ + open_file = find_writable_file(cifsInode); + if (open_file) { + __u16 nfid = open_file->netfid; + __u32 npid = open_file->pid; + rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, + npid, false); + atomic_dec(&open_file->wrtPending); + cFYI(1, ("SetFSize for attrs rc = %d", rc)); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + unsigned int bytes_written; + rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, + &bytes_written, NULL, NULL, 1); + cFYI(1, ("Wrt seteof rc %d", rc)); + } + } else + rc = -EINVAL; + + if (rc != 0) { + /* Set file size by pathname rather than by handle + either because no valid, writeable file handle for + it was found or because there was an error setting + it by handle */ + rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, + false, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); + if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { + __u16 netfid; + int oplock = 0; + + rc = SMBLegacyOpen(xid, pTcon, full_path, + FILE_OPEN, GENERIC_WRITE, + CREATE_NOT_DIR, &netfid, &oplock, NULL, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + unsigned int bytes_written; + rc = CIFSSMBWrite(xid, pTcon, netfid, 0, + attrs->ia_size, + &bytes_written, NULL, + NULL, 1); + cFYI(1, ("wrt seteof rc %d", rc)); + CIFSSMBClose(xid, pTcon, netfid); + } + } + } + + if (rc == 0) { + rc = cifs_vmtruncate(inode, attrs->ia_size); + cifs_truncate_page(inode->i_mapping, inode->i_size); + } + + return rc; +} + int cifs_setattr(struct dentry *direntry, struct iattr *attrs) { int xid; @@ -1420,7 +1496,6 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) struct cifsTconInfo *pTcon; char *full_path = NULL; int rc = -EACCES; - struct cifsFileInfo *open_file = NULL; FILE_BASIC_INFO time_buf; bool set_time = false; bool set_dosattr = false; @@ -1472,78 +1547,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) } if (attrs->ia_valid & ATTR_SIZE) { - /* To avoid spurious oplock breaks from server, in the case of - inodes that we already have open, avoid doing path based - setting of file size if we can do it by handle. 
- This keeps our caching token (oplock) and avoids timeouts - when the local oplock break takes longer to flush - writebehind data than the SMB timeout for the SetPathInfo - request would allow */ - - open_file = find_writable_file(cifsInode); - if (open_file) { - __u16 nfid = open_file->netfid; - __u32 npid = open_file->pid; - rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, - nfid, npid, false); - atomic_dec(&open_file->wrtPending); - cFYI(1, ("SetFSize for attrs rc = %d", rc)); - if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { - unsigned int bytes_written; - rc = CIFSSMBWrite(xid, pTcon, - nfid, 0, attrs->ia_size, - &bytes_written, NULL, NULL, - 1 /* 45 seconds */); - cFYI(1, ("Wrt seteof rc %d", rc)); - } - } else - rc = -EINVAL; - - if (rc != 0) { - /* Set file size by pathname rather than by handle - either because no valid, writeable file handle for - it was found or because there was an error setting - it by handle */ - rc = CIFSSMBSetEOF(xid, pTcon, full_path, - attrs->ia_size, false, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); - if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { - __u16 netfid; - int oplock = 0; - - rc = SMBLegacyOpen(xid, pTcon, full_path, - FILE_OPEN, GENERIC_WRITE, - CREATE_NOT_DIR, &netfid, &oplock, - NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - unsigned int bytes_written; - rc = CIFSSMBWrite(xid, pTcon, - netfid, 0, - attrs->ia_size, - &bytes_written, NULL, - NULL, 1 /* 45 sec */); - cFYI(1, ("wrt seteof rc %d", rc)); - CIFSSMBClose(xid, pTcon, netfid); - } - - } - } - - /* Server is ok setting allocation size implicitly - no need - to call: - CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, true, - cifs_sb->local_nls); - */ - - if (rc == 0) { - rc = cifs_vmtruncate(inode, attrs->ia_size); - cifs_truncate_page(inode->i_mapping, inode->i_size); - } else + rc = cifs_set_file_size(inode, attrs, xid, full_path); + if (rc != 0) goto cifs_setattr_exit; } -- cgit v1.2.3-70-g09d2 From 5ca33c6ac30596fc1403a092f46ea48c406734e4 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 23 Jul 2008 17:45:58 -0700 Subject: cifs: assorted endian annotations fs/cifs/cifssmb.c:3917:13: warning: incorrect type in assignment (different base types) fs/cifs/cifssmb.c:3917:13: expected bool [unsigned] [usertype] is_unicode fs/cifs/cifssmb.c:3917:13: got restricted __le16 The comment explains why __force is used here. 
fs/cifs/connect.c:458:16: warning: cast to restricted __be32 fs/cifs/connect.c:458:16: warning: cast to restricted __be32 fs/cifs/connect.c:458:16: warning: cast to restricted __be32 fs/cifs/connect.c:458:16: warning: cast to restricted __be32 fs/cifs/connect.c:458:16: warning: cast to restricted __be32 fs/cifs/connect.c:458:16: warning: cast to restricted __be32 Signed-off-by: Harvey Harrison Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 5 ++++- fs/cifs/connect.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4511b708f0f..7e175e1399d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3914,7 +3914,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, bool is_unicode; struct dfs_referral_level_3 *ref; - is_unicode = pSMBr->hdr.Flags2 & SMBFLG2_UNICODE; + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); if (*num_of_nodes < 1) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 64446272938..b51d5777cde 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -455,7 +455,7 @@ incomplete_rcv: /* Note that FC 1001 length is big endian on the wire, but we convert it here so it is always manipulated as host byte order */ - pdu_length = ntohl(smb_buffer->smb_buf_length); + pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length); smb_buffer->smb_buf_length = pdu_length; cFYI(1, ("rfc1002 length 0x%x", pdu_length+4)); -- cgit v1.2.3-70-g09d2 From f984c7b98253c541ed6e702ee521f6a84c8113d4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 24 Jul 2008 02:34:24 +0000 Subject: Signed-off-by: Alexey Dobriyan Cc: Steven French Signed-off-by: Andrew Morton Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 636 +++++++++++++++++++-------------------------------- 1 file changed, 241 insertions(+), 395 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index cc950f69e51..73925a77375 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -107,9 +107,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) #endif /* CONFIG_CIFS_DEBUG2 */ #ifdef CONFIG_PROC_FS -static int -cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, - int count, int *eof, void *data) +static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp; struct list_head *tmp1; @@ -117,23 +115,13 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, struct cifsSesInfo *ses; struct cifsTconInfo *tcon; int i; - int length = 0; - char *original_buf = buf; - *beginBuffer = buf + offset; - - length = - sprintf(buf, + seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); - buf += length; - length = sprintf(buf, "CIFS Version %s\n", CIFS_VERSION); - buf += length; - length = sprintf(buf, - "Active VFS Requests: %d\n", GlobalTotalActiveXid); - buf += length; - length = sprintf(buf, "Servers:"); - buf += length; + seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); + seq_printf(m, "Servers:"); i = 0; read_lock(&GlobalSMBSeslock); @@ -142,11 +130,10 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - buf += sprintf(buf, 
"\nentry for %s not fully " + seq_printf(m, "\nentry for %s not fully " "displayed\n\t", ses->serverName); } else { - length = - sprintf(buf, + seq_printf(m, "\n%d) Name: %s Domain: %s Mounts: %d OS:" " %s \n\tNOS: %s\tCapability: 0x%x\n\tSMB" " session status: %d\t", @@ -154,10 +141,9 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, atomic_read(&ses->inUse), ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); - buf += length; } if (ses->server) { - buf += sprintf(buf, "TCP status: %d\n\tLocal Users To " + seq_printf(m, "TCP status: %d\n\tLocal Users To " "Server: %d SecMode: 0x%x Req On Wire: %d", ses->server->tcpStatus, atomic_read(&ses->server->socketUseCount), @@ -165,13 +151,12 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, atomic_read(&ses->server->inFlight)); #ifdef CONFIG_CIFS_STATS2 - buf += sprintf(buf, " In Send: %d In MaxReq Wait: %d", + seq_printf(m, " In Send: %d In MaxReq Wait: %d", atomic_read(&ses->server->inSend), atomic_read(&ses->server->num_waiters)); #endif - length = sprintf(buf, "\nMIDs:\n"); - buf += length; + seq_puts(m, "\nMIDs:\n"); spin_lock(&GlobalMid_Lock); list_for_each(tmp1, &ses->server->pending_mid_q) { @@ -179,7 +164,7 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, mid_q_entry, qhead); if (mid_entry) { - length = sprintf(buf, + seq_printf(m, "State: %d com: %d pid:" " %d tsk: %p mid %d\n", mid_entry->midState, @@ -187,7 +172,6 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, mid_entry->pid, mid_entry->tsk, mid_entry->mid); - buf += length; } } spin_unlock(&GlobalMid_Lock); @@ -195,11 +179,9 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, } read_unlock(&GlobalSMBSeslock); - sprintf(buf, "\n"); - buf++; + seq_putc(m, '\n'); - length = sprintf(buf, "Shares:"); - buf += length; + seq_puts(m, "Shares:"); i = 0; read_lock(&GlobalSMBSeslock); @@ -208,62 +190,52 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, i++; tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - length = sprintf(buf, "\n%d) %s Uses: %d ", i, + seq_printf(m, "\n%d) %s Uses: %d ", i, tcon->treeName, atomic_read(&tcon->useCount)); - buf += length; if (tcon->nativeFileSystem) { - length = sprintf(buf, "Type: %s ", + seq_printf(m, "Type: %s ", tcon->nativeFileSystem); - buf += length; } - length = sprintf(buf, "DevInfo: 0x%x Attributes: 0x%x" + seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" "\nPathComponentMax: %d Status: %d", le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), tcon->tidStatus); - buf += length; if (dev_type == FILE_DEVICE_DISK) - length = sprintf(buf, " type: DISK "); + seq_puts(m, " type: DISK "); else if (dev_type == FILE_DEVICE_CD_ROM) - length = sprintf(buf, " type: CDROM "); + seq_puts(m, " type: CDROM "); else - length = - sprintf(buf, " type: %d ", dev_type); - buf += length; + seq_printf(m, " type: %d ", dev_type); if (tcon->tidStatus == CifsNeedReconnect) { - buf += sprintf(buf, "\tDISCONNECTED "); - length += 14; + seq_puts(m, "\tDISCONNECTED "); } } read_unlock(&GlobalSMBSeslock); - length = sprintf(buf, "\n"); - buf += length; + seq_putc(m, '\n'); /* BB add code to dump additional info such as TCP session info now */ - /* Now calculate total size of returned data */ - length = buf - original_buf; - - if (offset + count >= length) - *eof = 1; - if (length < offset) { - 
*eof = 1; - return 0; - } else { - length = length - offset; - } - if (length > count) - length = count; + return 0; +} - return length; +static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_debug_data_proc_show, NULL); } -#ifdef CONFIG_CIFS_STATS +static const struct file_operations cifs_debug_data_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_debug_data_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; -static int -cifs_stats_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +#ifdef CONFIG_CIFS_STATS +static ssize_t cifs_stats_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -307,236 +279,127 @@ cifs_stats_write(struct file *file, const char __user *buffer, return count; } -static int -cifs_stats_read(char *buf, char **beginBuffer, off_t offset, - int count, int *eof, void *data) +static int cifs_stats_proc_show(struct seq_file *m, void *v) { - int item_length, i, length; + int i; struct list_head *tmp; struct cifsTconInfo *tcon; - *beginBuffer = buf + offset; - - length = sprintf(buf, + seq_printf(m, "Resources in use\nCIFS Session: %d\n", sesInfoAllocCount.counter); - buf += length; - item_length = - sprintf(buf, "Share (unique mount targets): %d\n", + seq_printf(m, "Share (unique mount targets): %d\n", tconInfoAllocCount.counter); - length += item_length; - buf += item_length; - item_length = - sprintf(buf, "SMB Request/Response Buffer: %d Pool size: %d\n", + seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", bufAllocCount.counter, cifs_min_rcv + tcpSesAllocCount.counter); - length += item_length; - buf += item_length; - item_length = - sprintf(buf, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", + seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", smBufAllocCount.counter, cifs_min_small); - length += item_length; - buf += item_length; #ifdef CONFIG_CIFS_STATS2 - item_length = sprintf(buf, "Total Large %d Small %d Allocations\n", + seq_printf(m, "Total Large %d Small %d Allocations\n", atomic_read(&totBufAllocCount), atomic_read(&totSmBufAllocCount)); - length += item_length; - buf += item_length; #endif /* CONFIG_CIFS_STATS2 */ - item_length = - sprintf(buf, "Operations (MIDs): %d\n", - midCount.counter); - length += item_length; - buf += item_length; - item_length = sprintf(buf, + seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); + seq_printf(m, "\n%d session %d share reconnects\n", tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); - length += item_length; - buf += item_length; - item_length = sprintf(buf, + seq_printf(m, "Total vfs operations: %d maximum at one time: %d\n", GlobalCurrentXid, GlobalMaxActiveXid); - length += item_length; - buf += item_length; i = 0; read_lock(&GlobalSMBSeslock); list_for_each(tmp, &GlobalTreeConnectionList) { i++; tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); - item_length = sprintf(buf, "\n%d) %s", i, tcon->treeName); - buf += item_length; - length += item_length; - if (tcon->tidStatus == CifsNeedReconnect) { - buf += sprintf(buf, "\tDISCONNECTED "); - length += 14; - } - item_length = sprintf(buf, "\nSMBs: %d Oplock Breaks: %d", + seq_printf(m, "\n%d) %s", i, tcon->treeName); + if (tcon->tidStatus == CifsNeedReconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_printf(m, "\nSMBs: %d Oplock Breaks: %d", atomic_read(&tcon->num_smbs_sent), atomic_read(&tcon->num_oplock_brks)); - buf += 
item_length; - length += item_length; - item_length = sprintf(buf, "\nReads: %d Bytes: %lld", + seq_printf(m, "\nReads: %d Bytes: %lld", atomic_read(&tcon->num_reads), (long long)(tcon->bytes_read)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nWrites: %d Bytes: %lld", + seq_printf(m, "\nWrites: %d Bytes: %lld", atomic_read(&tcon->num_writes), (long long)(tcon->bytes_written)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, + seq_printf(m, "\nLocks: %d HardLinks: %d Symlinks: %d", atomic_read(&tcon->num_locks), atomic_read(&tcon->num_hardlinks), atomic_read(&tcon->num_symlinks)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nOpens: %d Closes: %d Deletes: %d", + seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d", atomic_read(&tcon->num_opens), atomic_read(&tcon->num_closes), atomic_read(&tcon->num_deletes)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nMkdirs: %d Rmdirs: %d", + seq_printf(m, "\nMkdirs: %d Rmdirs: %d", atomic_read(&tcon->num_mkdirs), atomic_read(&tcon->num_rmdirs)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nRenames: %d T2 Renames %d", + seq_printf(m, "\nRenames: %d T2 Renames %d", atomic_read(&tcon->num_renames), atomic_read(&tcon->num_t2renames)); - buf += item_length; - length += item_length; - item_length = sprintf(buf, "\nFindFirst: %d FNext %d FClose %d", + seq_printf(m, "\nFindFirst: %d FNext %d FClose %d", atomic_read(&tcon->num_ffirst), atomic_read(&tcon->num_fnext), atomic_read(&tcon->num_fclose)); - buf += item_length; - length += item_length; } read_unlock(&GlobalSMBSeslock); - buf += sprintf(buf, "\n"); - length++; - - if (offset + count >= length) - *eof = 1; - if (length < offset) { - *eof = 1; - return 0; - } else { - length = length - offset; - } - if (length > count) - length = count; + seq_putc(m, '\n'); + return 0; +} - return length; +static int cifs_stats_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_stats_proc_show, NULL); } + +static const struct file_operations cifs_stats_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_stats_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_stats_proc_write, +}; #endif /* STATS */ static struct proc_dir_entry *proc_fs_cifs; -read_proc_t cifs_txanchor_read; -static read_proc_t cifsFYI_read; -static write_proc_t cifsFYI_write; -static read_proc_t oplockEnabled_read; -static write_proc_t oplockEnabled_write; -static read_proc_t lookupFlag_read; -static write_proc_t lookupFlag_write; -static read_proc_t traceSMB_read; -static write_proc_t traceSMB_write; -static read_proc_t multiuser_mount_read; -static write_proc_t multiuser_mount_write; -static read_proc_t security_flags_read; -static write_proc_t security_flags_write; -/* static read_proc_t ntlmv2_enabled_read; -static write_proc_t ntlmv2_enabled_write; -static read_proc_t packet_signing_enabled_read; -static write_proc_t packet_signing_enabled_write;*/ -static read_proc_t experimEnabled_read; -static write_proc_t experimEnabled_write; -static read_proc_t linuxExtensionsEnabled_read; -static write_proc_t linuxExtensionsEnabled_write; +static const struct file_operations cifsFYI_proc_fops; +static const struct file_operations cifs_oplock_proc_fops; +static const struct file_operations cifs_lookup_cache_proc_fops; +static const struct file_operations traceSMB_proc_fops; +static const struct file_operations 
cifs_multiuser_mount_proc_fops; +static const struct file_operations cifs_security_flags_proc_fops; +static const struct file_operations cifs_experimental_proc_fops; +static const struct file_operations cifs_linux_ext_proc_fops; void cifs_proc_init(void) { - struct proc_dir_entry *pde; - proc_fs_cifs = proc_mkdir("fs/cifs", NULL); if (proc_fs_cifs == NULL) return; proc_fs_cifs->owner = THIS_MODULE; - create_proc_read_entry("DebugData", 0, proc_fs_cifs, - cifs_debug_data_read, NULL); + proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); #ifdef CONFIG_CIFS_STATS - pde = create_proc_read_entry("Stats", 0, proc_fs_cifs, - cifs_stats_read, NULL); - if (pde) - pde->write_proc = cifs_stats_write; + proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); #endif /* STATS */ - pde = create_proc_read_entry("cifsFYI", 0, proc_fs_cifs, - cifsFYI_read, NULL); - if (pde) - pde->write_proc = cifsFYI_write; - - pde = - create_proc_read_entry("traceSMB", 0, proc_fs_cifs, - traceSMB_read, NULL); - if (pde) - pde->write_proc = traceSMB_write; - - pde = create_proc_read_entry("OplockEnabled", 0, proc_fs_cifs, - oplockEnabled_read, NULL); - if (pde) - pde->write_proc = oplockEnabled_write; - - pde = create_proc_read_entry("Experimental", 0, proc_fs_cifs, - experimEnabled_read, NULL); - if (pde) - pde->write_proc = experimEnabled_write; - - pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs, - linuxExtensionsEnabled_read, NULL); - if (pde) - pde->write_proc = linuxExtensionsEnabled_write; - - pde = - create_proc_read_entry("MultiuserMount", 0, proc_fs_cifs, - multiuser_mount_read, NULL); - if (pde) - pde->write_proc = multiuser_mount_write; - - pde = - create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs, - security_flags_read, NULL); - if (pde) - pde->write_proc = security_flags_write; - - pde = - create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs, - lookupFlag_read, NULL); - if (pde) - pde->write_proc = lookupFlag_write; - -/* pde = - create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs, - ntlmv2_enabled_read, NULL); - if (pde) - pde->write_proc = ntlmv2_enabled_write; - - pde = - create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs, - packet_signing_enabled_read, NULL); - if (pde) - pde->write_proc = packet_signing_enabled_write;*/ + proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); + proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); + proc_create("Experimental", 0, proc_fs_cifs, &cifs_experimental_proc_fops); + proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, &cifs_linux_ext_proc_fops); + proc_create("MultiuserMount", 0, proc_fs_cifs, &cifs_multiuser_mount_proc_fops); + proc_create("SecurityFlags", 0, proc_fs_cifs, &cifs_security_flags_proc_fops); + proc_create("LookupCacheEnabled", 0, proc_fs_cifs, &cifs_lookup_cache_proc_fops); } void @@ -553,39 +416,26 @@ cifs_proc_clean(void) #endif remove_proc_entry("MultiuserMount", proc_fs_cifs); remove_proc_entry("OplockEnabled", proc_fs_cifs); -/* remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */ remove_proc_entry("SecurityFlags", proc_fs_cifs); -/* remove_proc_entry("PacketSigningEnabled", proc_fs_cifs); */ remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("Experimental", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); remove_proc_entry("fs/cifs", NULL); } -static int -cifsFYI_read(char *page, char **start, off_t off, int count, - int *eof, 
void *data) +static int cifsFYI_proc_show(struct seq_file *m, void *v) { - int len; - - len = sprintf(page, "%d\n", cifsFYI); - - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; + seq_printf(m, "%d\n", cifsFYI); + return 0; +} - return len; +static int cifsFYI_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifsFYI_proc_show, NULL); } -static int -cifsFYI_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char c; int rc; @@ -603,30 +453,28 @@ cifsFYI_write(struct file *file, const char __user *buffer, return count; } -static int -oplockEnabled_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", oplockEnabled); +static const struct file_operations cifsFYI_proc_fops = { + .owner = THIS_MODULE, + .open = cifsFYI_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifsFYI_proc_write, +}; - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int cifs_oplock_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", oplockEnabled); + return 0; +} - return len; +static int cifs_oplock_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_oplock_proc_show, NULL); } -static int -oplockEnabled_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_oplock_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -642,30 +490,28 @@ oplockEnabled_write(struct file *file, const char __user *buffer, return count; } -static int -experimEnabled_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", experimEnabled); - - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; +static const struct file_operations cifs_oplock_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_oplock_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_oplock_proc_write, +}; - if (len < 0) - len = 0; +static int cifs_experimental_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", experimEnabled); + return 0; +} - return len; +static int cifs_experimental_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_experimental_proc_show, NULL); } -static int -experimEnabled_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_experimental_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -683,29 +529,28 @@ experimEnabled_write(struct file *file, const char __user *buffer, return count; } -static int -linuxExtensionsEnabled_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", linuxExtEnabled); - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; +static const struct file_operations cifs_experimental_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_experimental_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + 
.write = cifs_experimental_proc_write, +}; - if (len < 0) - len = 0; +static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", linuxExtEnabled); + return 0; +} - return len; +static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_linux_ext_proc_show, NULL); } -static int -linuxExtensionsEnabled_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_linux_ext_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -721,31 +566,28 @@ linuxExtensionsEnabled_write(struct file *file, const char __user *buffer, return count; } +static const struct file_operations cifs_linux_ext_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_linux_ext_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_linux_ext_proc_write, +}; -static int -lookupFlag_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) { - int len; - - len = sprintf(page, "%d\n", lookupCacheEnabled); - - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; + seq_printf(m, "%d\n", lookupCacheEnabled); + return 0; +} - return len; +static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_lookup_cache_proc_show, NULL); } -static int -lookupFlag_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_lookup_cache_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -760,30 +602,29 @@ lookupFlag_write(struct file *file, const char __user *buffer, return count; } -static int -traceSMB_read(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - int len; - len = sprintf(page, "%d\n", traceSMB); +static const struct file_operations cifs_lookup_cache_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_lookup_cache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_lookup_cache_proc_write, +}; - len -= off; - *start = page + off; - - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int traceSMB_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", traceSMB); + return 0; +} - return len; +static int traceSMB_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, traceSMB_proc_show, NULL); } -static int -traceSMB_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char c; int rc; @@ -799,30 +640,28 @@ traceSMB_write(struct file *file, const char __user *buffer, return count; } -static int -multiuser_mount_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "%d\n", multiuser_mount); - - len -= off; - *start = page + off; +static const struct file_operations traceSMB_proc_fops = { + .owner = THIS_MODULE, + .open = traceSMB_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = traceSMB_proc_write, +}; - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int 
cifs_multiuser_mount_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", multiuser_mount); + return 0; +} - return len; +static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_multiuser_mount_proc_show, NULL); } -static int -multiuser_mount_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_multiuser_mount_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { char c; int rc; @@ -838,30 +677,28 @@ multiuser_mount_write(struct file *file, const char __user *buffer, return count; } -static int -security_flags_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - - len = sprintf(page, "0x%x\n", extended_security); - - len -= off; - *start = page + off; +static const struct file_operations cifs_multiuser_mount_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_multiuser_mount_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_multiuser_mount_proc_write, +}; - if (len > count) - len = count; - else - *eof = 1; - - if (len < 0) - len = 0; +static int cifs_security_flags_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x\n", extended_security); + return 0; +} - return len; +static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_security_flags_proc_show, NULL); } -static int -security_flags_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) + +static ssize_t cifs_security_flags_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { unsigned int flags; char flags_string[12]; @@ -917,6 +754,15 @@ security_flags_write(struct file *file, const char __user *buffer, /* BB should we turn on MAY flags for other MUST options? 
*/ return count; } + +static const struct file_operations cifs_security_flags_proc_fops = { + .owner = THIS_MODULE, + .open = cifs_security_flags_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = cifs_security_flags_proc_write, +}; #else inline void cifs_proc_init(void) { -- cgit v1.2.3-70-g09d2 From 99b1f5b2f6cd2f65cce02c5f63302df5878a5fbc Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 24 Jul 2008 02:37:45 +0000 Subject: [CIFS] remove checkpatch warning Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 73925a77375..c213aa40064 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -395,11 +395,16 @@ cifs_proc_init(void) proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); - proc_create("Experimental", 0, proc_fs_cifs, &cifs_experimental_proc_fops); - proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, &cifs_linux_ext_proc_fops); - proc_create("MultiuserMount", 0, proc_fs_cifs, &cifs_multiuser_mount_proc_fops); - proc_create("SecurityFlags", 0, proc_fs_cifs, &cifs_security_flags_proc_fops); - proc_create("LookupCacheEnabled", 0, proc_fs_cifs, &cifs_lookup_cache_proc_fops); + proc_create("Experimental", 0, proc_fs_cifs, + &cifs_experimental_proc_fops); + proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, + &cifs_linux_ext_proc_fops); + proc_create("MultiuserMount", 0, proc_fs_cifs, + &cifs_multiuser_mount_proc_fops); + proc_create("SecurityFlags", 0, proc_fs_cifs, + &cifs_security_flags_proc_fops); + proc_create("LookupCacheEnabled", 0, proc_fs_cifs, + &cifs_lookup_cache_proc_fops); } void @@ -655,9 +660,9 @@ static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v) return 0; } -static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *file) +static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh) { - return single_open(file, cifs_multiuser_mount_proc_show, NULL); + return single_open(fh, cifs_multiuser_mount_proc_show, NULL); } static ssize_t cifs_multiuser_mount_proc_write(struct file *file, -- cgit v1.2.3-70-g09d2 From 76c510ad2e7d56cfe8f2cc7b23783e5c687cf704 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 24 Jul 2008 14:48:33 +0000 Subject: [CIFS] Fix possible double free if search immediately after search rewind fails Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/readdir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 83f30695488..5f40ed3473f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -690,6 +690,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, else cifs_buf_release(cifsFile->srch_inf. 
ntwrk_buf_start); + cifsFile->srch_inf.ntwrk_buf_start = NULL; } rc = initiate_cifs_search(xid, file); if (rc) { -- cgit v1.2.3-70-g09d2 From b1910ad6222a705650dc991c003af43b94cdb3e1 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 24 Jul 2008 14:53:20 +0000 Subject: [CIFS] Fix improper endian conversion of ACL subauth field In mode_to_acl when converting a Unix mode to a Windows ACL the subauth fields of the SID in the ACL were translated incorrectly on bigendian architectures Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 0e9fc2ba90e..18faf65b911 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -169,8 +169,7 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, for (i = 0; i < 6; i++) ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; for (i = 0; i < 5; i++) - ngroup_sid_ptr->sub_auth[i] = - cpu_to_le32(group_sid_ptr->sub_auth[i]); + ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; return; } -- cgit v1.2.3-70-g09d2 From ef571cadd516e7ffcdeac6bb8054e5908fcccfcf Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 24 Jul 2008 15:56:05 +0000 Subject: [CIFS] Fix warnings from checkpatch Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 4 ++-- fs/cifs/cifsacl.c | 38 +++++++++++++++++++------------------- fs/cifs/cifsencrypt.c | 3 +-- fs/cifs/cifsglob.h | 6 +++--- fs/cifs/cifspdu.h | 8 ++++---- fs/cifs/cifssmb.c | 5 ++--- 6 files changed, 31 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index c213aa40064..688a2d42153 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -208,9 +208,9 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); - if (tcon->tidStatus == CifsNeedReconnect) { + + if (tcon->tidStatus == CifsNeedReconnect) seq_puts(m, "\tDISCONNECTED "); - } } read_unlock(&GlobalSMBSeslock); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 18faf65b911..57ecdc83c26 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -56,7 +56,7 @@ int match_sid(struct cifs_sid *ctsid) struct cifs_sid *cwsid; if (!ctsid) - return (-1); + return -1; for (i = 0; i < NUM_WK_SIDS; ++i) { cwsid = &(wksidarr[i].cifssid); @@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid) } cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); - return (0); /* sids compare/match */ + return 0; /* sids compare/match */ } cFYI(1, ("No matching sid")); - return (-1); + return -1; } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are @@ -102,16 +102,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) int num_subauth, num_sat, num_saw; if ((!ctsid) || (!cwsid)) - return (0); + return 0; /* compare the revision */ if (ctsid->revision != cwsid->revision) - return (0); + return 0; /* compare all of the six auth values */ for (i = 0; i < 6; ++i) { if (ctsid->authority[i] != cwsid->authority[i]) - return (0); + return 0; } /* compare all of the subauth values if any */ @@ -121,11 +121,11 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) if (num_subauth) { for (i = 0; i < num_subauth; ++i) { if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) - return (0); + return 0; } } - return (1); /* sids compare/match */ + return 1; /* sids compare/match */ } @@ -284,7 +284,7 @@ static __u16 
fill_ace_for_sid(struct cifs_ace *pntace, size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); pntace->size = cpu_to_le16(size); - return (size); + return size; } @@ -425,7 +425,7 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); pndacl->num_aces = cpu_to_le32(3); - return (0); + return 0; } @@ -509,7 +509,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, sizeof(struct cifs_sid)); */ - return (0); + return 0; } @@ -526,7 +526,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) - return (-EIO); + return -EIO; owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); @@ -549,7 +549,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, /* copy security descriptor control portion and owner and group sid */ copy_sec_desc(pntsd, pnntsd, sidsoffset); - return (rc); + return rc; } @@ -628,11 +628,11 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); if (!inode) - return (rc); + return rc; sb = inode->i_sb; if (sb == NULL) - return (rc); + return rc; cifs_sb = CIFS_SB(sb); xid = GetXid(); @@ -651,7 +651,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, if (rc != 0) { cERROR(1, ("Unable to open file to set ACL")); FreeXid(xid); - return (rc); + return rc; } } @@ -664,7 +664,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, FreeXid(xid); - return (rc); + return rc; } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ @@ -714,7 +714,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) if (!pnntsd) { cERROR(1, ("Unable to allocate security descriptor")); kfree(pntsd); - return (-ENOMEM); + return -ENOMEM; } rc = build_sec_desc(pntsd, pnntsd, inode, nmode); @@ -731,6 +731,6 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) kfree(pntsd); } - return (rc); + return rc; } #endif /* CONFIG_CIFS_EXPERIMENTAL */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4ff8939c6cc..83fd40dc1ef 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -310,9 +310,8 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) utf8 and other multibyte codepages each need their own strupper function since a byte at a time will ont work. 
*/ - for (i = 0; i < CIFS_ENCPWD_SIZE; i++) { + for (i = 0; i < CIFS_ENCPWD_SIZE; i++) password_with_pad[i] = toupper(password_with_pad[i]); - } SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key); /* clear password before we return/free memory */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9cfcf326ead..7e1cf262eff 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -27,7 +27,7 @@ #define MAX_SES_INFO 2 #define MAX_TCON_INFO 4 -#define MAX_TREE_SIZE 2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1 +#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) #define MAX_SERVER_SIZE 15 #define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ #define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null @@ -537,8 +537,8 @@ require use of the stronger protocol */ #endif /* WEAK_PW_HASH */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ -#define CIFSSEC_DEF CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 -#define CIFSSEC_MAX CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2 +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) +#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) /* ***************************************************************** diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 0f327c224da..409abce1273 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -31,7 +31,7 @@ #else #define CIFS_PROT 0 #endif -#define POSIX_PROT CIFS_PROT+1 +#define POSIX_PROT (CIFS_PROT+1) #define BAD_PROT 0xFFFF /* SMB command codes */ @@ -341,7 +341,7 @@ #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ #define CREATE_NO_EA_KNOWLEDGE 0x00000200 #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete - "open for recovery" flag - should + "open for recovery" flag should be zero in any case */ #define CREATE_OPEN_FOR_RECOVERY 0x00000400 #define CREATE_RANDOM_ACCESS 0x00000800 @@ -414,8 +414,8 @@ struct smb_hdr { __u8 WordCount; } __attribute__((packed)); /* given a pointer to an smb_hdr retrieve the value of byte count */ -#define BCC(smb_var) ( *(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) -#define BCC_LE(smb_var) ( *(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) +#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) +#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ #define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7e175e1399d..c621ffa2ca9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -686,11 +686,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) SecurityBlob, count - 16, &server->secType); - if (rc == 1) { + if (rc == 1) rc = 0; - } else { + else rc = -EINVAL; - } } } else server->capabilities &= ~CAP_EXTENDED_SECURITY; -- cgit v1.2.3-70-g09d2 From c748e1340e0de3fa7fed86f8bdf499be9242afff Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:27:03 -0700 Subject: mm/vmstat.c: proper externs This patch adds proper extern declarations for five variables in include/linux/vmstat.h Signed-off-by: Adrian Bunk 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 4 ---- include/linux/vmstat.h | 6 ++++++ kernel/sysctl.c | 2 +- mm/vmstat.c | 1 + 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index c652d469dc0..b14f43d25e9 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -232,7 +232,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off, #undef K } -extern const struct seq_operations fragmentation_op; static int fragmentation_open(struct inode *inode, struct file *file) { (void)inode; @@ -246,7 +245,6 @@ static const struct file_operations fragmentation_file_operations = { .release = seq_release, }; -extern const struct seq_operations pagetypeinfo_op; static int pagetypeinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &pagetypeinfo_op); @@ -259,7 +257,6 @@ static const struct file_operations pagetypeinfo_file_ops = { .release = seq_release, }; -extern const struct seq_operations zoneinfo_op; static int zoneinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &zoneinfo_op); @@ -356,7 +353,6 @@ static const struct file_operations proc_devinfo_operations = { .release = seq_release, }; -extern const struct seq_operations vmstat_op; static int vmstat_open(struct inode *inode, struct file *file) { return seq_open(file, &vmstat_op); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e83b69346d2..58334d43951 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -44,6 +44,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_VM_EVENT_ITEMS }; +extern const struct seq_operations fragmentation_op; +extern const struct seq_operations pagetypeinfo_op; +extern const struct seq_operations zoneinfo_op; +extern const struct seq_operations vmstat_op; +extern int sysctl_stat_interval; + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2a7b9d88706..1f7b3b76a16 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -80,7 +81,6 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; -extern int sysctl_stat_interval; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST diff --git a/mm/vmstat.c b/mm/vmstat.c index c3d4a781802..b0d08e667ec 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #ifdef CONFIG_VM_EVENT_COUNTERS -- cgit v1.2.3-70-g09d2 From 42b7772812d15b86543a23b82bd6070eef9a08b1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 23 Jul 2008 21:27:10 -0700 Subject: mm: remove double indirection on tlb parameter to free_pgd_range() & Co The double indirection here is not needed anywhere and hence (at least) confusing. Signed-off-by: Jan Beulich Cc: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Luck, Tony" Cc: Paul Mundt Cc: "David S. 
Miller" Acked-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 8 ++++---- fs/exec.c | 4 ++-- include/asm-ia64/hugetlb.h | 2 +- include/asm-powerpc/hugetlb.h | 2 +- include/asm-sh/hugetlb.h | 2 +- include/asm-sparc/hugetlb.h | 2 +- include/asm-x86/hugetlb.h | 2 +- include/linux/mm.h | 4 +--- mm/internal.h | 3 +++ mm/memory.c | 10 ++++++---- mm/mmap.c | 6 ++++-- 12 files changed, 26 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index d3ce8f3bcaa..cd49e2860ee 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -112,7 +112,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri return NULL; } -void hugetlb_free_pgd_range(struct mmu_gather **tlb, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0d12fba31bc..1a96cc891cf 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -255,7 +255,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. */ -void hugetlb_free_pgd_range(struct mmu_gather **tlb, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -315,13 +315,13 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { - BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize); + BUG_ON(get_slice_psize(tlb->mm, addr) != mmu_huge_psize); next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } diff --git a/fs/exec.c b/fs/exec.c index fd9234379e8..190ed1f9277 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -541,7 +541,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(&tlb, new_end, old_end, new_end, + free_pgd_range(tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -550,7 +550,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(&tlb, old_start, old_end, new_end, + free_pgd_range(tlb, old_start, old_end, new_end, vma->vm_next ? 
vma->vm_next->vm_start : 0); } tlb_finish_mmu(tlb, new_end, old_end); diff --git a/include/asm-ia64/hugetlb.h b/include/asm-ia64/hugetlb.h index f28a9701f1c..e9d1e5e2382 100644 --- a/include/asm-ia64/hugetlb.h +++ b/include/asm-ia64/hugetlb.h @@ -4,7 +4,7 @@ #include -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); diff --git a/include/asm-powerpc/hugetlb.h b/include/asm-powerpc/hugetlb.h index be32ff02f4a..0a37aa5ecaa 100644 --- a/include/asm-powerpc/hugetlb.h +++ b/include/asm-powerpc/hugetlb.h @@ -7,7 +7,7 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len); -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, +void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); diff --git a/include/asm-sh/hugetlb.h b/include/asm-sh/hugetlb.h index 02402303d89..fb30018938c 100644 --- a/include/asm-sh/hugetlb.h +++ b/include/asm-sh/hugetlb.h @@ -26,7 +26,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { } -static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb, +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) diff --git a/include/asm-sparc/hugetlb.h b/include/asm-sparc/hugetlb.h index 412af58926a..aeb92374ca3 100644 --- a/include/asm-sparc/hugetlb.h +++ b/include/asm-sparc/hugetlb.h @@ -31,7 +31,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) return 0; } -static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb, +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) diff --git a/include/asm-x86/hugetlb.h b/include/asm-x86/hugetlb.h index 14171a4924f..7eed6e0883b 100644 --- a/include/asm-x86/hugetlb.h +++ b/include/asm-x86/hugetlb.h @@ -26,7 +26,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { } -static inline void hugetlb_free_pgd_range(struct mmu_gather **tlb, +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c7f8f64f70..f8071097302 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -769,10 +769,8 @@ struct mm_walk { int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); -void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, diff --git a/mm/internal.h b/mm/internal.h index 50807e12490..858ad01864d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,9 @@ #include +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 
+ unsigned long floor, unsigned long ceiling); + static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); diff --git a/mm/memory.c b/mm/memory.c index 87350321e66..82f3f1c5cf1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,8 @@ #include #include +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. */ -void free_pgd_range(struct mmu_gather **tlb, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { diff --git a/mm/mmap.c b/mm/mmap.c index 1d102b956fd..75e0d0673d7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -32,6 +32,8 @@ #include #include +#include "internal.h" + #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif @@ -1763,7 +1765,7 @@ static void unmap_region(struct mm_struct *mm, update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); tlb_finish_mmu(tlb, start, end); } @@ -2063,7 +2065,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); /* -- cgit v1.2.3-70-g09d2 From da3bbdd4632c0171406b2677e31494afa5bde2f8 Mon Sep 17 00:00:00 2001 From: Kentaro Makita Date: Wed, 23 Jul 2008 21:27:13 -0700 Subject: fix soft lock up at NFS mount via per-SB LRU-list of unused dentries [Summary] Split LRU-list of unused dentries to one per superblock to avoid soft lock up during NFS mounts and remounting of any filesystem. Previously I posted here: http://lkml.org/lkml/2008/3/5/590 [Descriptions] - background dentry_unused is a list of dentries which are not referenced. dentry_unused grows up when references on directories or files are released. This list can be very long if there is huge free memory. - the problem When shrink_dcache_sb() is called, it scans all dentry_unused linearly under spin_lock(), and if dentry->d_sb is differnt from given superblock, scan next dentry. This scan costs very much if there are many entries, and very ineffective if there are many superblocks. IOW, When we need to shrink unused dentries on one dentry, but scans unused dentries on all superblocks in the system. For example, we scan 500 dentries to unmount a filesystem, but scans 1,000,000 or more unused dentries on other superblocks. 
In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink unused dentries on NFS, but scans 100,000,000 unused dentries on superblocks in the system such as local ext3 filesystems. I hear NFS mounting took 1 min on some system in use. * : NFS uses virtual filesystem in rpc layer, so NFS is affected by this problem. 100,000,000 is possible number on large systems. Per-superblock LRU of unused dentried can reduce the cost in reasonable manner. - How to fix I found this problem is solved by David Chinner's "Per-superblock unused dentry LRU lists V3"(1), so I rebase it and add some fix to reclaim with fairness, which is in Andrew Morton's comments(2). 1) http://lkml.org/lkml/2006/5/25/318 2) http://lkml.org/lkml/2006/5/25/320 Split LRU-list of unused dentries to each superblocks. Then, NFS mounting will check dentries under a superblock instead of all. But this spliting will break LRU of dentry-unused. So, I've attempted to make reclaim unused dentrins with fairness by calculate number of dentries to scan on this sb based on following way number of dentries to scan on this sb = count * (number of dentries on this sb / number of dentries in the machine) - ToDo - I have to measuring performance number and do stress tests. - When unmount occurs during prune_dcache(), scanning on same superblock, It is unable to reach next superblock because it is gone away. We restart scannig superblock from first one, it causes unfairness of reclaim unused dentries on first superblock. But I think this happens very rarely. - Test Results Result on 6GB boxes with excessive unused dentries. Without patch: $ cat /proc/sys/fs/dentry-state 10181835 10180203 45 0 0 0 # mount -t nfs 10.124.60.70:/work/kernel-src nfs real 0m1.830s user 0m0.001s sys 0m1.653s With this patch: $ cat /proc/sys/fs/dentry-state 10236610 10234751 45 0 0 0 # mount -t nfs 10.124.60.70:/work/kernel-src nfs real 0m0.106s user 0m0.002s sys 0m0.032s [akpm@linux-foundation.org: fix comments] Signed-off-by: Kentaro Makita Cc: Neil Brown Cc: Trond Myklebust Cc: David Chinner Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 335 ++++++++++++++++++++++++++++------------------------- fs/super.c | 1 + include/linux/fs.h | 4 + 3 files changed, 185 insertions(+), 155 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 6068c25b393..3818d6ab76c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -61,7 +61,6 @@ static struct kmem_cache *dentry_cache __read_mostly; static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; static struct hlist_head *dentry_hashtable __read_mostly; -static LIST_HEAD(dentry_unused); /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { @@ -96,14 +95,6 @@ static void d_free(struct dentry *dentry) call_rcu(&dentry->d_u.d_rcu, d_callback); } -static void dentry_lru_remove(struct dentry *dentry) -{ - if (!list_empty(&dentry->d_lru)) { - list_del_init(&dentry->d_lru); - dentry_stat.nr_unused--; - } -} - /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. @@ -130,6 +121,41 @@ static void dentry_iput(struct dentry * dentry) } } +/* + * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. 
+ */ +static void dentry_lru_add(struct dentry *dentry) +{ + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; +} + +static void dentry_lru_add_tail(struct dentry *dentry) +{ + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; +} + +static void dentry_lru_del(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; + } +} + +static void dentry_lru_del_init(struct dentry *dentry) +{ + if (likely(!list_empty(&dentry->d_lru))) { + list_del_init(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; + } +} + /** * d_kill - kill dentry and return parent * @dentry: dentry to kill @@ -212,8 +238,7 @@ repeat: goto kill_it; if (list_empty(&dentry->d_lru)) { dentry->d_flags |= DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + dentry_lru_add(dentry); } spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); @@ -222,7 +247,8 @@ repeat: unhash_it: __d_drop(dentry); kill_it: - dentry_lru_remove(dentry); + /* if dentry was on the d_lru list delete it from there */ + dentry_lru_del(dentry); dentry = d_kill(dentry); if (dentry) goto repeat; @@ -290,7 +316,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); return dentry; } @@ -406,133 +432,167 @@ static void prune_one_dentry(struct dentry * dentry) if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); __d_drop(dentry); dentry = d_kill(dentry); spin_lock(&dcache_lock); } } -/** - * prune_dcache - shrink the dcache - * @count: number of entries to try and free - * @sb: if given, ignore dentries for other superblocks - * which are being unmounted. - * - * Shrink the dcache. This is done when we need - * more memory, or simply when we need to unmount - * something (at which point we need to unuse - * all dentries). - * - * This function may fail to free any resources if - * all the dentries are in use. +/* + * Shrink the dentry LRU on a given superblock. + * @sb : superblock to shrink dentry LRU. + * @count: If count is NULL, we prune all dentries on superblock. + * @flags: If flags is non-zero, we need to do special processing based on + * which flags are set. This means we don't need to maintain multiple + * similar copies of this loop. 
*/ - -static void prune_dcache(int count, struct super_block *sb) +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) { - spin_lock(&dcache_lock); - for (; count ; count--) { - struct dentry *dentry; - struct list_head *tmp; - struct rw_semaphore *s_umount; - - cond_resched_lock(&dcache_lock); + LIST_HEAD(referenced); + LIST_HEAD(tmp); + struct dentry *dentry; + int cnt = 0; - tmp = dentry_unused.prev; - if (sb) { - /* Try to find a dentry for this sb, but don't try - * too hard, if they aren't near the tail they will - * be moved down again soon + BUG_ON(!sb); + BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); + spin_lock(&dcache_lock); + if (count != NULL) + /* called from prune_dcache() and shrink_dcache_parent() */ + cnt = *count; +restart: + if (count == NULL) + list_splice_init(&sb->s_dentry_lru, &tmp); + else { + while (!list_empty(&sb->s_dentry_lru)) { + dentry = list_entry(sb->s_dentry_lru.prev, + struct dentry, d_lru); + BUG_ON(dentry->d_sb != sb); + + spin_lock(&dentry->d_lock); + /* + * If we are honouring the DCACHE_REFERENCED flag and + * the dentry has this flag set, don't free it. Clear + * the flag and put it back on the LRU. */ - int skip = count; - while (skip && tmp != &dentry_unused && - list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { - skip--; - tmp = tmp->prev; + if ((flags & DCACHE_REFERENCED) + && (dentry->d_flags & DCACHE_REFERENCED)) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move_tail(&dentry->d_lru, &referenced); + spin_unlock(&dentry->d_lock); + } else { + list_move_tail(&dentry->d_lru, &tmp); + spin_unlock(&dentry->d_lock); + cnt--; + if (!cnt) + break; } } - if (tmp == &dentry_unused) - break; - list_del_init(tmp); - prefetch(dentry_unused.prev); - dentry_stat.nr_unused--; - dentry = list_entry(tmp, struct dentry, d_lru); - - spin_lock(&dentry->d_lock); + } + while (!list_empty(&tmp)) { + dentry = list_entry(tmp.prev, struct dentry, d_lru); + dentry_lru_del_init(dentry); + spin_lock(&dentry->d_lock); /* * We found an inuse dentry which was not removed from - * dentry_unused because of laziness during lookup. Do not free - * it - just keep it off the dentry_unused list. + * the LRU because of laziness during lookup. Do not free + * it - just keep it off the LRU list. */ - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); continue; } - /* If the dentry was recently referenced, don't free it. */ - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - spin_unlock(&dentry->d_lock); + prune_one_dentry(dentry); + /* dentry->d_lock was dropped in prune_one_dentry() */ + cond_resched_lock(&dcache_lock); + } + if (count == NULL && !list_empty(&sb->s_dentry_lru)) + goto restart; + if (count != NULL) + *count = cnt; + if (!list_empty(&referenced)) + list_splice(&referenced, &sb->s_dentry_lru); + spin_unlock(&dcache_lock); +} + +/** + * prune_dcache - shrink the dcache + * @count: number of entries to try to free + * + * Shrink the dcache. This is done when we need more memory, or simply when we + * need to unmount something (at which point we need to unuse all dentries). + * + * This function may fail to free any resources if all the dentries are in use. 
+ */ +static void prune_dcache(int count) +{ + struct super_block *sb; + int w_count; + int unused = dentry_stat.nr_unused; + int prune_ratio; + int pruned; + + if (unused == 0 || count == 0) + return; + spin_lock(&dcache_lock); +restart: + if (count >= unused) + prune_ratio = 1; + else + prune_ratio = unused / count; + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_nr_dentry_unused == 0) continue; - } - /* - * If the dentry is not DCACHED_REFERENCED, it is time - * to remove it from the dcache, provided the super block is - * NULL (which means we are trying to reclaim memory) - * or this dentry belongs to the same super block that - * we want to shrink. - */ - /* - * If this dentry is for "my" filesystem, then I can prune it - * without taking the s_umount lock (I already hold it). + sb->s_count++; + /* Now, we reclaim unused dentrins with fairness. + * We reclaim them same percentage from each superblock. + * We calculate number of dentries to scan on this sb + * as follows, but the implementation is arranged to avoid + * overflows: + * number of dentries to scan on this sb = + * count * (number of dentries on this sb / + * number of dentries in the machine) */ - if (sb && dentry->d_sb == sb) { - prune_one_dentry(dentry); - continue; - } + spin_unlock(&sb_lock); + if (prune_ratio != 1) + w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; + else + w_count = sb->s_nr_dentry_unused; + pruned = w_count; /* - * ...otherwise we need to be sure this filesystem isn't being - * unmounted, otherwise we could race with - * generic_shutdown_super(), and end up holding a reference to - * an inode while the filesystem is unmounted. - * So we try to get s_umount, and make sure s_root isn't NULL. - * (Take a local copy of s_umount to avoid a use-after-free of - * `dentry'). + * We need to be sure this filesystem isn't being unmounted, + * otherwise we could race with generic_shutdown_super(), and + * end up holding a reference to an inode while the filesystem + * is unmounted. So we try to get s_umount, and make sure + * s_root isn't NULL. */ - s_umount = &dentry->d_sb->s_umount; - if (down_read_trylock(s_umount)) { - if (dentry->d_sb->s_root != NULL) { - prune_one_dentry(dentry); - up_read(s_umount); - continue; + if (down_read_trylock(&sb->s_umount)) { + if ((sb->s_root != NULL) && + (!list_empty(&sb->s_dentry_lru))) { + spin_unlock(&dcache_lock); + __shrink_dcache_sb(sb, &w_count, + DCACHE_REFERENCED); + pruned -= w_count; + spin_lock(&dcache_lock); } - up_read(s_umount); + up_read(&sb->s_umount); } - spin_unlock(&dentry->d_lock); + spin_lock(&sb_lock); + count -= pruned; /* - * Insert dentry at the head of the list as inserting at the - * tail leads to a cycle. + * restart only when sb is no longer on the list and + * we have more work to do. */ - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + if (__put_super_and_need_restart(sb) && count > 0) { + spin_unlock(&sb_lock); + goto restart; + } } + spin_unlock(&sb_lock); spin_unlock(&dcache_lock); } -/* - * Shrink the dcache for the specified super block. - * This allows us to unmount a device without disturbing - * the dcache for the other devices. - * - * This implementation makes just two traversals of the - * unused list. On the first pass we move the selected - * dentries to the most recent end, and on the second - * pass we free them. The second pass must restart after - * each dput(), but since the target dentries are all at - * the end, it's really just a single traversal. 
- */ - /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -541,44 +601,9 @@ static void prune_dcache(int count, struct super_block *sb) * is used to free the dcache before unmounting a file * system */ - void shrink_dcache_sb(struct super_block * sb) { - struct list_head *tmp, *next; - struct dentry *dentry; - - /* - * Pass one ... move the dentries for the specified - * superblock to the most recent end of the unused list. - */ - spin_lock(&dcache_lock); - list_for_each_prev_safe(tmp, next, &dentry_unused) { - dentry = list_entry(tmp, struct dentry, d_lru); - if (dentry->d_sb != sb) - continue; - list_move_tail(tmp, &dentry_unused); - } - - /* - * Pass two ... free the dentries for this superblock. - */ -repeat: - list_for_each_prev_safe(tmp, next, &dentry_unused) { - dentry = list_entry(tmp, struct dentry, d_lru); - if (dentry->d_sb != sb) - continue; - dentry_stat.nr_unused--; - list_del_init(tmp); - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); - continue; - } - prune_one_dentry(dentry); - cond_resched_lock(&dcache_lock); - goto repeat; - } - spin_unlock(&dcache_lock); + __shrink_dcache_sb(sb, NULL, 0); } /* @@ -595,7 +620,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* detach this root from the system */ spin_lock(&dcache_lock); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); __d_drop(dentry); spin_unlock(&dcache_lock); @@ -609,7 +634,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_lock(&dcache_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { - dentry_lru_remove(loop); + dentry_lru_del_init(loop); __d_drop(loop); cond_resched_lock(&dcache_lock); } @@ -791,14 +816,13 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add_tail(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + dentry_lru_add_tail(dentry); found++; } @@ -840,10 +864,11 @@ out: void shrink_dcache_parent(struct dentry * parent) { + struct super_block *sb = parent->d_sb; int found; while ((found = select_parent(parent)) != 0) - prune_dcache(found, parent->d_sb); + __shrink_dcache_sb(sb, &found, 0); } /* @@ -863,7 +888,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; - prune_dcache(nr, NULL); + prune_dcache(nr); } return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } @@ -1215,7 +1240,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while * lookup is going on. * - * dentry_unused list is not updated even if lookup finds the required dentry + * The dentry unused LRU is not updated even if lookup finds the required dentry * in there. It is updated in places such as prune_dcache, shrink_dcache_sb, * select_parent and __dget_locked. This laziness saves lookup from dcache_lock * acquisition. 
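As a worked example of the fairness calculation in prune_dcache() above (illustrative numbers only, not taken from the patch): if the shrinker asks for count = 1,000 dentries while dentry_stat.nr_unused = 10,000,000, then prune_ratio = 10,000,000 / 1,000 = 10,000, so a superblock holding 100,000 unused dentries is asked to scan 100,000 / 10,000 + 1 = 11 of them (roughly its 1% share), while a superblock holding only 50 scans 50 / 10,000 + 1 = 1; every superblock makes some progress and none is scanned far beyond its proportional share.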
diff --git a/fs/super.c b/fs/super.c index 453877c5697..e931ae9511f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -70,6 +70,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); + INIT_LIST_HEAD(&s->s_dentry_lru); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); diff --git a/include/linux/fs.h b/include/linux/fs.h index ff54ae4933f..e5e6a244096 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1025,6 +1025,7 @@ extern int send_sigurg(struct fown_struct *fown); extern struct list_head super_blocks; extern spinlock_t sb_lock; +#define sb_entry(list) list_entry((list), struct super_block, s_list) #define S_BIAS (1<<30) struct super_block { struct list_head s_list; /* Keep this first */ @@ -1058,6 +1059,9 @@ struct super_block { struct list_head s_more_io; /* parked for more writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_files; + /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ + struct list_head s_dentry_lru; /* unused dentry lru */ + int s_nr_dentry_unused; /* # of dentry on lru */ struct block_device *s_bdev; struct mtd_info *s_mtd; -- cgit v1.2.3-70-g09d2 From a1e78772d72b2616ed20e54896e68e0e7044854e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:23 -0700 Subject: hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork() This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in a similar manner to the reservations taken for MAP_SHARED mappings. The reserve count is accounted both globally and on a per-VMA basis for private mappings. This guarantees that a process that successfully calls mmap() will successfully fault all pages in the future unless fork() is called. The characteristics of private mappings of hugetlbfs files behaviour after this patch are; 1. The process calling mmap() is guaranteed to succeed all future faults until it forks(). 2. On fork(), the parent may die due to SIGKILL on writes to the private mapping if enough pages are not available for the COW. For reasonably reliable behaviour in the face of a small huge page pool, children of hugepage-aware processes should not reference the mappings; such as might occur when fork()ing to exec(). 3. On fork(), the child VMAs inherit no reserves. Reads on pages already faulted by the parent will succeed. Successful writes will depend on enough huge pages being free in the pool. 4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper and at fault time otherwise. Before this patch, all reads or writes in the child potentially needs page allocations that can later lead to the death of the parent. This applies to reads and writes of uninstantiated pages as well as COW. After the patch it is only a write to an instantiated page that causes problems. 
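For illustration of the semantics listed above, a minimal userspace sketch (not part of the patch); it assumes a hugetlbfs mount at /mnt/huge, 2 MB huge pages and a pool large enough for the initial reservation:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define HPAGE_SZ (2UL * 1024 * 1024)    /* assumed huge page size */
#define LEN      (4 * HPAGE_SZ)

int main(void)
{
	int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);  /* assumed mount point */
	if (fd < 0) { perror("open"); return 1; }

	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }

	/* (1) reserves were taken at mmap() time, so the parent's faults
	 *     are guaranteed to succeed until it forks */
	memset(p, 0xaa, LEN);

	if (fork() == 0) {
		char c = p[0];   /* (3) reads of pages the parent faulted succeed */
		p[0] = c + 1;    /*     but a COW write needs a free huge page in
		                  *     the pool, since the child has no reserves */
		_exit(0);
	}

	p[0] = 1;        /* (2) a parent write after fork() also needs a page
	                  *     for COW and can be fatal if the pool is empty */
	wait(NULL);

	munmap(p, LEN);
	close(fd);
	unlink("/mnt/huge/demo");
	return 0;
}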
Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 8 +-- include/linux/hugetlb.h | 9 ++- kernel/fork.c | 9 +++ mm/hugetlb.c | 158 ++++++++++++++++++++++++++++++++++++------------ 4 files changed, 140 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index aeabf80f81a..1576bbecd08 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -103,9 +103,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (vma->vm_flags & VM_MAYSHARE && - hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), - len >> HPAGE_SHIFT)) + if (hugetlb_reserve_pages(inode, + vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), + len >> HPAGE_SHIFT, vma)) goto out; ret = 0; @@ -942,7 +942,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size) goto out_dentry; error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) + if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a79e80b689d..185b14c9f02 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -17,6 +17,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) return vma->vm_flags & VM_HUGETLB; } +void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); @@ -30,7 +31,8 @@ int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); -int hugetlb_reserve_pages(struct inode *inode, long from, long to); +int hugetlb_reserve_pages(struct inode *inode, long from, long to, + struct vm_area_struct *vma); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); extern unsigned long max_huge_pages; @@ -58,6 +60,11 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return 0; } + +static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ +} + static inline unsigned long hugetlb_total_pages(void) { return 0; diff --git a/kernel/fork.c b/kernel/fork.c index adefc1131f2..552c8d8e77a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -306,6 +307,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) spin_unlock(&file->f_mapping->i_mmap_lock); } + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + /* * Link in the new vma and copy the page table entries. 
*/ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a4dbba8965f..0af500db363 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,69 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +/* + * These helpers are used to track how many pages are reserved for + * faults in a MAP_PRIVATE mapping. Only the process that called mmap() + * is guaranteed to have their future faults succeed. + * + * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * the reserve counters are updated with the hugetlb_lock held. It is safe + * to reset the VMA at fork() time as it is not in use yet and there is no + * chance of the global counters getting corrupted as a result of the values. + */ +static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + return (unsigned long)vma->vm_private_data; + return 0; +} + +static void set_vma_resv_huge_pages(struct vm_area_struct *vma, + unsigned long reserve) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + vma->vm_private_data = (void *)reserve; +} + +/* Decrement the reserved pages in the hugepage pool by one */ +static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) { + /* Shared mappings always use reserves */ + resv_huge_pages--; + } else { + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + if (vma_resv_huge_pages(vma)) { + resv_huge_pages--; + reserve = (unsigned long)vma->vm_private_data - 1; + vma->vm_private_data = (void *)reserve; + } + } +} + +void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + vma->vm_private_data = (void *)0; +} + +/* Returns true if the VMA has associated reserve pages */ +static int vma_has_private_reserves(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return 0; + if (!vma_resv_huge_pages(vma)) + return 0; + return 1; +} + static void clear_huge_page(struct page *page, unsigned long addr) { int i; @@ -101,6 +164,15 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct zone *zone; struct zoneref *z; + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_private_reserves(vma) && + free_huge_pages - resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); @@ -111,8 +183,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, list_del(&page->lru); free_huge_pages--; free_huge_pages_node[nid]--; - if (vma && vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; + decrement_hugepage_resv_vma(vma); + break; } } @@ -461,55 +533,40 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } } - -static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, - unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned int chg = 0; + + /* + * Processes that did not create the mapping will have no reserves and + * will not have accounted against quota. 
Check that the quota can be + * made before satisfying the allocation + */ + if (!vma_has_private_reserves(vma)) { + chg = 1; + if (hugetlb_get_quota(inode->i_mapping, chg)) + return ERR_PTR(-ENOSPC); + } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(vma, addr); spin_unlock(&hugetlb_lock); - return page ? page : ERR_PTR(-VM_FAULT_OOM); -} -static struct page *alloc_huge_page_private(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page = NULL; - - if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) - return ERR_PTR(-VM_FAULT_SIGBUS); - - spin_lock(&hugetlb_lock); - if (free_huge_pages > resv_huge_pages) - page = dequeue_huge_page_vma(vma, addr); - spin_unlock(&hugetlb_lock); if (!page) { page = alloc_buddy_huge_page(vma, addr); if (!page) { - hugetlb_put_quota(vma->vm_file->f_mapping, 1); + hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); } } - return page; -} -static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; - - if (vma->vm_flags & VM_MAYSHARE) - page = alloc_huge_page_shared(vma, addr); - else - page = alloc_huge_page_private(vma, addr); + set_page_refcounted(page); + set_page_private(page, (unsigned long) mapping); - if (!IS_ERR(page)) { - set_page_refcounted(page); - set_page_private(page, (unsigned long) mapping); - } return page; } @@ -757,6 +814,13 @@ out: return ret; } +static void hugetlb_vm_op_close(struct vm_area_struct *vma) +{ + unsigned long reserve = vma_resv_huge_pages(vma); + if (reserve) + hugetlb_acct_memory(-reserve); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -771,6 +835,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, + .close = hugetlb_vm_op_close, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -1289,11 +1354,25 @@ static long region_truncate(struct list_head *head, long end) return chg; } -int hugetlb_reserve_pages(struct inode *inode, long from, long to) +int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma) { long ret, chg; - chg = region_chg(&inode->i_mapping->private_list, from, to); + /* + * Shared mappings base their reservation on the number of pages that + * are already allocated on behalf of the file. Private mappings need + * to reserve the full area even if read-only as mprotect() may be + * called to make the mapping read-write. 
Assume !vma is a shm mapping + */ + if (!vma || vma->vm_flags & VM_SHARED) + chg = region_chg(&inode->i_mapping->private_list, from, to); + else { + chg = to - from; + set_vma_resv_huge_pages(vma, chg); + } + if (chg < 0) return chg; @@ -1304,7 +1383,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) hugetlb_put_quota(inode->i_mapping, chg); return ret; } - region_add(&inode->i_mapping->private_list, from, to); + if (!vma || vma->vm_flags & VM_SHARED) + region_add(&inode->i_mapping->private_list, from, to); return 0; } -- cgit v1.2.3-70-g09d2 From 04f2cbe35699d22dbf428373682ead85ca1240f5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:25 -0700 Subject: hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed After patch 2 in this series, a process that successfully calls mmap() for a MAP_PRIVATE mapping will be guaranteed to successfully fault until a process calls fork(). At that point, the next write fault from the parent could fail due to COW if the child still has a reference. We only reserve pages for the parent but a copy must be made to avoid leaking data from the parent to the child after fork(). Reserves could be taken for both parent and child at fork time to guarantee faults but if the mapping is large it is highly likely we will not have sufficient pages for the reservation, and it is common to fork only to exec() immediately after. A failure here would be very undesirable. Note that the current behaviour of mainline with MAP_PRIVATE pages is pretty bad. The following situation is allowed to occur today. 1. Process calls mmap(MAP_PRIVATE) 2. Process calls mlock() to fault all pages and makes sure it succeeds 3. Process forks() 4. Process writes to MAP_PRIVATE mapping while child still exists 5. If the COW fails at this point, the process gets SIGKILLed even though it had taken care to ensure the pages existed. This patch improves the situation by guaranteeing the reliability of the process that successfully calls mmap(). When the parent performs COW, it will try to satisfy the allocation without using reserves. If that fails, the parent will steal the page leaving any children without a page. Faults from the child after that point will result in failure. If the child COW happens first, an attempt will be made to allocate the page without reserves and the child will get SIGKILLed on failure. To summarise the new behaviour: 1. If the original mapper performs COW on a private mapping with multiple references, it will attempt to allocate a hugepage from the pool or the buddy allocator without using the existing reserves. On failure, VMAs mapping the same area are traversed and the page being COW'd is unmapped where found. It will then steal the original page as the last mapper in the normal way. 2. The VMAs the pages were unmapped from are flagged to note that pages with data no longer exist. Future no-page faults on those VMAs will terminate the process as otherwise it would appear that data was corrupted. A warning is printed to the console that this situation occurred. 3. If the child performs COW first, it will attempt to satisfy the COW from the pool if there are enough pages or via the buddy allocator if overcommit is allowed and the buddy allocator can satisfy the request. If it fails, the child will be killed. If the pool is large enough, existing applications will not notice that the reserves were a factor.
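To make the new COW path easier to follow before reading the diff, here is a condensed sketch of the behaviour summarised above. It reuses the names this patch introduces (alloc_huge_page() with an avoid_reserve argument, unmap_ref_private(), is_vma_resv_set(), HPAGE_RESV_OWNER) plus the pre-existing mm/hugetlb.c helpers for the no-copy and copy cases, but the function itself is illustrative only: locking, the pagecache lookup and the error paths of the real hugetlb_cow() are omitted.

/*
 * Condensed illustration of the owner-priority COW path; the full
 * version in the diff below also takes the page table lock, looks up
 * the pagecache page and handles the OOM/SIGBUS cases.
 */
static int hugetlb_cow_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *ptep, pte_t pte,
		struct page *pagecache_page)
{
	struct page *old_page = pte_page(pte);
	struct page *new_page;
	int outside_reserve = 0;

retry:
	/* Sole user of the page: just make it writable, no copy needed */
	if (page_count(old_page) == 1) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	/* Only the original mapper may allocate outside its reserves */
	if (!(vma->vm_flags & VM_SHARED) &&
	    is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
	    old_page != pagecache_page)
		outside_reserve = 1;

	new_page = alloc_huge_page(vma, address, outside_reserve);
	if (IS_ERR(new_page)) {
		if (outside_reserve) {
			/*
			 * Steal the page: unmap old_page from every other VMA
			 * so the retry finds it with a single reference. The
			 * VMAs it was unmapped from get HPAGE_RESV_UNMAPPED
			 * set and their owners are killed if they fault on
			 * this range later.
			 */
			unmap_ref_private(mm, vma, old_page, address);
			goto retry;
		}
		/* child path: no page available, propagate VM_FAULT_OOM */
		return -PTR_ERR(new_page);
	}

	copy_huge_page(new_page, old_page, address, vma);
	/* ... install new_page in the page table in place of old_page ... */
	return 0;
}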
Existing applications depending on the no-reserves been set are unlikely to exist as for much of the history of hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that point or failing the mmap(). [npiggin@suse.de: fix CONFIG_HUGETLB=n build] Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 8 +- mm/hugetlb.c | 201 +++++++++++++++++++++++++++++++++++++++++++----- mm/memory.c | 2 +- 4 files changed, 190 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1576bbecd08..428eff5b73f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -441,7 +441,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) v_offset = 0; __unmap_hugepage_range(vma, - vma->vm_start + v_offset, vma->vm_end); + vma->vm_start + v_offset, vma->vm_end, NULL); } } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 185b14c9f02..abbc187193a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -23,8 +23,10 @@ int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __us int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int, int); -void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); -void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); +void unmap_hugepage_range(struct vm_area_struct *, + unsigned long, unsigned long, struct page *); +void __unmap_hugepage_range(struct vm_area_struct *, + unsigned long, unsigned long, struct page *); int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); @@ -74,7 +76,7 @@ static inline unsigned long hugetlb_total_pages(void) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) -#define unmap_hugepage_range(vma, start, end) BUG() +#define unmap_hugepage_range(vma, start, end, page) BUG() #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0af500db363..a2d29b84501 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,9 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) +#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) +#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. 
Only the process that called mmap() @@ -54,17 +57,32 @@ static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return (unsigned long)vma->vm_private_data; + return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK; return 0; } static void set_vma_resv_huge_pages(struct vm_area_struct *vma, unsigned long reserve) { + unsigned long flags; VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - vma->vm_private_data = (void *)reserve; + flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK; + vma->vm_private_data = (void *)(reserve | flags); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + unsigned long reserveflags = (unsigned long)vma->vm_private_data; + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + vma->vm_private_data = (void *)(reserveflags | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + return ((unsigned long)vma->vm_private_data & flag) != 0; } /* Decrement the reserved pages in the hugepage pool by one */ @@ -78,14 +96,18 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) * Only the process that called mmap() has reserves for * private mappings. */ - if (vma_resv_huge_pages(vma)) { + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + unsigned long flags, reserve; resv_huge_pages--; + flags = (unsigned long)vma->vm_private_data & + HPAGE_RESV_MASK; reserve = (unsigned long)vma->vm_private_data - 1; - vma->vm_private_data = (void *)reserve; + vma->vm_private_data = (void *)(reserve | flags); } } } +/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); @@ -153,7 +175,7 @@ static struct page *dequeue_huge_page(void) } static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, - unsigned long address) + unsigned long address, int avoid_reserve) { int nid; struct page *page = NULL; @@ -173,6 +195,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, free_huge_pages - resv_huge_pages == 0) return NULL; + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); @@ -183,7 +209,9 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, list_del(&page->lru); free_huge_pages--; free_huge_pages_node[nid]--; - decrement_hugepage_resv_vma(vma); + + if (!avoid_reserve) + decrement_hugepage_resv_vma(vma); break; } @@ -534,7 +562,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int avoid_reserve) { struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; @@ -546,14 +574,15 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, * will not have accounted against quota. 
Check that the quota can be * made before satisfying the allocation */ - if (!vma_has_private_reserves(vma)) { + if (!(vma->vm_flags & VM_SHARED) && + !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { chg = 1; if (hugetlb_get_quota(inode->i_mapping, chg)) return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr); + page = dequeue_huge_page_vma(vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); if (!page) { @@ -909,7 +938,7 @@ nomem: } void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -937,6 +966,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pmd_unshare(mm, &address, ptep)) continue; + /* + * If a reference page is supplied, it is because a specific + * page is being unmapped, not a range. Ensure the page we + * are about to unmap is the actual page of interest. + */ + if (ref_page) { + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) + continue; + page = pte_page(pte); + if (page != ref_page) + continue; + + /* + * Mark the VMA as having unmapped its page so that + * future faults in this VMA will fail rather than + * looking like data was lost + */ + set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } + pte = huge_ptep_get_and_clear(mm, address, ptep); if (huge_pte_none(pte)) continue; @@ -955,7 +1005,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { /* * It is undesirable to test vma->vm_file as it should be non-null @@ -967,19 +1017,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, */ if (vma->vm_file) { spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end); + __unmap_hugepage_range(vma, start, end, ref_page); spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); } } +/* + * This is called when the original mapper is failing to COW a MAP_PRIVATE + * mappping it owns the reserve page for. The intention is to unmap the page + * from other VMAs and let the children be SIGKILLed if they are faulting the + * same region. + */ +int unmap_ref_private(struct mm_struct *mm, + struct vm_area_struct *vma, + struct page *page, + unsigned long address) +{ + struct vm_area_struct *iter_vma; + struct address_space *mapping; + struct prio_tree_iter iter; + pgoff_t pgoff; + + /* + * vm_pgoff is in PAGE_SIZE units, hence the different calculation + * from page cache lookup which is in HPAGE_SIZE units. + */ + address = address & huge_page_mask(hstate_vma(vma)); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); + mapping = (struct address_space *)page_private(page); + + vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + /* Do not unmap the current VMA */ + if (iter_vma == vma) + continue; + + /* + * Unmap the page from other VMAs without their own reserves. + * They get marked to be SIGKILLed if they fault in these + * areas. This is because a future no-page fault on this VMA + * could insert a zeroed page instead of the data existing + * from the time of fork. 
This would look like data corruption + */ + if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) + unmap_hugepage_range(iter_vma, + address, address + HPAGE_SIZE, + page); + } + + return 1; +} + static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte) + unsigned long address, pte_t *ptep, pte_t pte, + struct page *pagecache_page) { struct page *old_page, *new_page; int avoidcopy; + int outside_reserve = 0; old_page = pte_page(pte); +retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ avoidcopy = (page_count(old_page) == 1); @@ -988,11 +1087,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } + /* + * If the process that created a MAP_PRIVATE mapping is about to + * perform a COW due to a shared page count, attempt to satisfy + * the allocation without using the existing reserves. The pagecache + * page is used to determine if the reserve at this address was + * consumed or not. If reserves were used, a partial faulted mapping + * at the time of fork() could consume its reserves on COW instead + * of the full address range. + */ + if (!(vma->vm_flags & VM_SHARED) && + is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + old_page != pagecache_page) + outside_reserve = 1; + page_cache_get(old_page); - new_page = alloc_huge_page(vma, address); + new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { page_cache_release(old_page); + + /* + * If a process owning a MAP_PRIVATE mapping fails to COW, + * it is due to references held by a child and an insufficient + * huge page pool. To guarantee the original mappers + * reliability, unmap the page from child processes. The child + * may get SIGKILLed if it later faults. + */ + if (outside_reserve) { + BUG_ON(huge_pte_none(pte)); + if (unmap_ref_private(mm, vma, old_page, address)) { + BUG_ON(page_count(old_page) != 1); + BUG_ON(huge_pte_none(pte)); + goto retry_avoidcopy; + } + WARN_ON_ONCE(1); + } + return -PTR_ERR(new_page); } @@ -1015,6 +1146,20 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } +/* Return the pagecache page at a given address within a VMA */ +static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, + unsigned long address) +{ + struct address_space *mapping; + unsigned long idx; + + mapping = vma->vm_file->f_mapping; + idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + return find_lock_page(mapping, idx); +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { @@ -1025,6 +1170,18 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping; pte_t new_pte; + /* + * Currently, we are forced to kill the process in the event the + * original mapper has unmapped pages from the child due to a failed + * COW. 
Warn that such a situation has occured as it may not be obvious + */ + if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { + printk(KERN_WARNING + "PID %d killed due to inadequate hugepage pool\n", + current->pid); + return ret; + } + mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -1039,7 +1196,7 @@ retry: size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) goto out; - page = alloc_huge_page(vma, address); + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = -PTR_ERR(page); goto out; @@ -1081,7 +1238,7 @@ retry: if (write_access && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } spin_unlock(&mm->page_table_lock); @@ -1126,8 +1283,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) - ret = hugetlb_cow(mm, vma, address, ptep, entry); + if (write_access && !pte_write(entry)) { + struct page *page; + page = hugetlbfs_pagecache_page(vma, address); + ret = hugetlb_cow(mm, vma, address, ptep, entry, page); + if (page) { + unlock_page(page); + put_page(page); + } + } spin_unlock(&mm->page_table_lock); mutex_unlock(&hugetlb_instantiation_mutex); @@ -1371,6 +1535,7 @@ int hugetlb_reserve_pages(struct inode *inode, else { chg = to - from; set_vma_resv_huge_pages(vma, chg); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) diff --git a/mm/memory.c b/mm/memory.c index 82f3f1c5cf1..72932489a08 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -901,7 +901,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end); + unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / (HPAGE_SIZE / PAGE_SIZE); start = end; -- cgit v1.2.3-70-g09d2 From cce770815869e9209171d819dfde89bcc738ab62 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 23 Jul 2008 21:27:36 -0700 Subject: SYNC_FILE_RANGE_WRITE may and will block. Document that. [akpm@linux-foundation.org: fix comment text] Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sync.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 228e17b5e9e..2967562d416 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -139,7 +139,8 @@ asmlinkage long sys_fdatasync(unsigned int fd) * before performing the write. * * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the - * range which are not presently under writeback. + * range which are not presently under writeback. Note that this may block for + * significant periods due to exhaustion of disk request structures. * * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range * after performing the write. -- cgit v1.2.3-70-g09d2 From a47a126ad5ea072aca3e611ed8f8dc6adad24bab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Jul 2008 21:27:38 -0700 Subject: vmallocinfo: add NUMA information Christoph recently added /proc/vmallocinfo file to get information about vmalloc allocations. This patch adds NUMA specific information, giving number of pages allocated on each memory node. 
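As a usage note rather than part of the patch: the new per-node "N<nid>=<pages>" annotations are easy to post-process from userspace. The program below is a hypothetical sketch that totals pages per node across /proc/vmallocinfo; the 64-node ceiling and the parsing approach are assumptions made for brevity, and only the field format comes from this patch.

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/vmallocinfo", "r");
	char line[1024];
	unsigned long pages[64] = { 0 };	/* assumed upper bound on node ids */
	unsigned long cnt;
	int nid, n;

	if (!f) {
		perror("/proc/vmallocinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		char *p = line;
		/* each area may carry several " N<nid>=<pages>" fields */
		while ((p = strstr(p, " N")) != NULL) {
			if (sscanf(p, " N%d=%lu%n", &nid, &cnt, &n) == 2 &&
			    nid >= 0 && nid < 64) {
				pages[nid] += cnt;
				p += n;
			} else {
				p += 2;
			}
		}
	}
	fclose(f);
	for (nid = 0; nid < 64; nid++)
		if (pages[nid])
			printf("node %d: %lu pages\n", nid, pages[nid]);
	return 0;
}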
This should help to check that vmalloc() is able to respect NUMA policies. Example of output on a four nodes machine (one cpu per node) 1) network hash tables are evenly spreaded on four nodes (OK) (Same point for inodes and dentries hash tables) 2) iptables tables (x_tables) are correctly allocated on each cpu node (OK). 3) sys_swapon() allocates its memory from one node only. 4) each loaded module is using memory on one node. Sysadmins could tune their setup to change points 3) and 4) if necessary. grep "pages=" /proc/vmallocinfo 0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204/0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128 0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204/0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64 0xffffc2000031a000-0xffffc2000031d000 12288 alloc_large_system_hash+0x204/0x2c0 pages=2 vmalloc N1=1 N2=1 0xffffc2000031f000-0xffffc2000032b000 49152 cramfs_uncompress_init+0x2e/0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3 0xffffc2000033e000-0xffffc20000341000 12288 sys_swapon+0x640/0xac0 pages=2 vmalloc N0=2 0xffffc20000341000-0xffffc20000344000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N0=2 0xffffc20000344000-0xffffc20000347000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N1=2 0xffffc20000347000-0xffffc2000034a000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N2=2 0xffffc2000034a000-0xffffc2000034d000 12288 xt_alloc_table_info+0xfe/0x130 [x_tables] pages=2 vmalloc N3=2 0xffffc20004381000-0xffffc20004402000 528384 alloc_large_system_hash+0x204/0x2c0 pages=128 vmalloc N0=32 N1=32 N2=32 N3=32 0xffffc20004402000-0xffffc20004803000 4198400 alloc_large_system_hash+0x204/0x2c0 pages=1024 vmalloc vpages N0=256 N1=256 N2=256 N3=256 0xffffc20004803000-0xffffc20004904000 1052672 alloc_large_system_hash+0x204/0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64 0xffffc20004904000-0xffffc20004bec000 3047424 sys_swapon+0x640/0xac0 pages=743 vmalloc vpages N0=743 0xffffffffa0000000-0xffffffffa000f000 61440 sys_init_module+0xc27/0x1d00 pages=14 vmalloc N1=14 0xffffffffa000f000-0xffffffffa0014000 20480 sys_init_module+0xc27/0x1d00 pages=4 vmalloc N0=4 0xffffffffa0014000-0xffffffffa0017000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N0=2 0xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 pages=10 vmalloc N1=10 0xffffffffa0022000-0xffffffffa0028000 24576 sys_init_module+0xc27/0x1d00 pages=5 vmalloc N3=5 0xffffffffa0028000-0xffffffffa0050000 163840 sys_init_module+0xc27/0x1d00 pages=39 vmalloc N1=39 0xffffffffa0050000-0xffffffffa0052000 8192 sys_init_module+0xc27/0x1d00 pages=1 vmalloc N1=1 0xffffffffa0052000-0xffffffffa0056000 16384 sys_init_module+0xc27/0x1d00 pages=3 vmalloc N1=3 0xffffffffa0056000-0xffffffffa0081000 176128 sys_init_module+0xc27/0x1d00 pages=42 vmalloc N3=42 0xffffffffa0081000-0xffffffffa00ae000 184320 sys_init_module+0xc27/0x1d00 pages=44 vmalloc N3=44 0xffffffffa00ae000-0xffffffffa00b1000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2 0xffffffffa00b1000-0xffffffffa00b9000 32768 sys_init_module+0xc27/0x1d00 pages=7 vmalloc N0=7 0xffffffffa00b9000-0xffffffffa00c4000 45056 sys_init_module+0xc27/0x1d00 pages=10 vmalloc N3=10 0xffffffffa00c6000-0xffffffffa00e0000 106496 sys_init_module+0xc27/0x1d00 pages=25 vmalloc N2=25 0xffffffffa00e0000-0xffffffffa00f1000 69632 sys_init_module+0xc27/0x1d00 pages=16 vmalloc N2=16 0xffffffffa00f1000-0xffffffffa00f4000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2 
0xffffffffa00f4000-0xffffffffa00f7000 12288 sys_init_module+0xc27/0x1d00 pages=2 vmalloc N3=2 [akpm@linux-foundation.org: fix comment] Signed-off-by: Eric Dumazet Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 44 ++++++++++++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 15 +++++++++++-- mm/vmalloc.c | 20 +++++++++++++++++ 3 files changed, 77 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 7f268f327d7..8c6384bdfed 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -296,6 +296,7 @@ Table 1-4: Kernel info in /proc uptime System uptime version Kernel version video bttv info of video resources (2.4) + vmallocinfo Show vmalloced areas .............................................................................. You can, for example, check which interrupts are currently in use and what @@ -557,6 +558,49 @@ VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contigious block of vmalloc area which is free +.............................................................................. + +vmallocinfo: + +Provides information about vmalloced/vmaped areas. One line per area, +containing the virtual address range of the area, size in bytes, +caller information of the creator, and optional information depending +on the kind of area : + + pages=nr number of pages + phys=addr if a physical address was specified + ioremap I/O mapping (ioremap() and friends) + vmalloc vmalloc() area + vmap vmap()ed pages + user VM_USERMAP area + vpages buffer for pages pointers was vmalloced (huge area) + N=nr (Only on NUMA kernels) + Number of pages allocated on memory node + +> cat /proc/vmallocinfo +0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204 ... + /0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128 +0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204 ... + /0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64 +0xffffc20000302000-0xffffc20000304000 8192 acpi_tb_verify_table+0x21/0x4f... + phys=7fee8000 ioremap +0xffffc20000304000-0xffffc20000307000 12288 acpi_tb_verify_table+0x21/0x4f... + phys=7fee7000 ioremap +0xffffc2000031d000-0xffffc2000031f000 8192 init_vdso_vars+0x112/0x210 +0xffffc2000031f000-0xffffc2000032b000 49152 cramfs_uncompress_init+0x2e ... + /0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3 +0xffffc2000033a000-0xffffc2000033d000 12288 sys_swapon+0x640/0xac0 ... + pages=2 vmalloc N1=2 +0xffffc20000347000-0xffffc2000034c000 20480 xt_alloc_table_info+0xfe ... + /0x130 [x_tables] pages=4 vmalloc N0=4 +0xffffffffa0000000-0xffffffffa000f000 61440 sys_init_module+0xc27/0x1d00 ... + pages=14 vmalloc N2=14 +0xffffffffa000f000-0xffffffffa0014000 20480 sys_init_module+0xc27/0x1d00 ... + pages=4 vmalloc N1=4 +0xffffffffa0014000-0xffffffffa0017000 12288 sys_init_module+0xc27/0x1d00 ... + pages=2 vmalloc N1=2 +0xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ... 
+ pages=10 vmalloc N0=10 1.3 IDE devices in /proc/ide ---------------------------- diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index b14f43d25e9..ded96986296 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -464,14 +464,25 @@ static const struct file_operations proc_slabstats_operations = { #ifdef CONFIG_MMU static int vmalloc_open(struct inode *inode, struct file *file) { - return seq_open(file, &vmalloc_op); + unsigned int *ptr = NULL; + int ret; + + if (NUMA_BUILD) + ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + ret = seq_open(file, &vmalloc_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ptr; + } else + kfree(ptr); + return ret; } static const struct file_operations proc_vmalloc_operations = { .open = vmalloc_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d12..35f29381629 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -931,6 +931,25 @@ static void s_stop(struct seq_file *m, void *p) read_unlock(&vmlist_lock); } +static void show_numa_info(struct seq_file *m, struct vm_struct *v) +{ + if (NUMA_BUILD) { + unsigned int nr, *counters = m->private; + + if (!counters) + return; + + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); + + for (nr = 0; nr < v->nr_pages; nr++) + counters[page_to_nid(v->pages[nr])]++; + + for_each_node_state(nr, N_HIGH_MEMORY) + if (counters[nr]) + seq_printf(m, " N%u=%u", nr, counters[nr]); + } +} + static int s_show(struct seq_file *m, void *p) { struct vm_struct *v = p; @@ -967,6 +986,7 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_VPAGES) seq_printf(m, " vpages"); + show_numa_info(m, v); seq_putc(m, '\n'); return 0; } -- cgit v1.2.3-70-g09d2 From a5516438959d90b071ff0a484ce4f3f523dc3152 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:41 -0700 Subject: hugetlb: modular state for hugetlb page size The goal of this patchset is to support multiple hugetlb page sizes. This is achieved by introducing a new struct hstate structure, which encapsulates the important hugetlb state and constants (eg. huge page size, number of huge pages currently allocated, etc). The hstate structure is then passed around the code which requires these fields, they will do the right thing regardless of the exact hstate they are operating on. This patch adds the hstate structure, with a single global instance of it (default_hstate), and does the basic work of converting hugetlb to use the hstate. Future patches will add more hstate structures to allow for different hugetlbfs mounts to have different page sizes. 
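The conversion pattern this implies at call sites can be shown with a small made-up helper. hstate_file(), huge_page_size() and huge_page_shift() are the accessors added by this patch; sketch_bytes_to_hpages() itself is hypothetical and only illustrates replacing the fixed HPAGE_* constants with per-hstate lookups.

/* Hypothetical call-site example, not code from the patch */
static unsigned long sketch_bytes_to_hpages(struct file *file, loff_t len)
{
	struct hstate *h = hstate_file(file);	/* always &default_hstate for now */

	/* was: (len + HPAGE_SIZE - 1) >> HPAGE_SHIFT */
	return (len + huge_page_size(h) - 1) >> huge_page_shift(h);
}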
[akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/hugetlbpage.c | 7 +- arch/powerpc/mm/hugetlbpage.c | 3 +- arch/s390/mm/hugetlbpage.c | 3 +- arch/sh/mm/hugetlbpage.c | 3 +- arch/sparc64/mm/hugetlbpage.c | 5 +- arch/x86/mm/hugetlbpage.c | 5 +- fs/hugetlbfs/inode.c | 52 +++--- include/asm-ia64/hugetlb.h | 3 +- include/asm-powerpc/hugetlb.h | 3 +- include/asm-s390/hugetlb.h | 3 +- include/asm-sh/hugetlb.h | 3 +- include/asm-sparc/hugetlb.h | 3 +- include/asm-x86/hugetlb.h | 8 +- include/linux/hugetlb.h | 88 +++++++++- ipc/shm.c | 3 +- mm/hugetlb.c | 368 +++++++++++++++++++++++------------------- mm/memory.c | 2 +- mm/mempolicy.c | 9 +- mm/mmap.c | 3 +- 19 files changed, 356 insertions(+), 218 deletions(-) (limited to 'fs') diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index cd49e2860ee..6170f097d25 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -24,7 +24,7 @@ unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT; pte_t * -huge_pte_alloc (struct mm_struct *mm, unsigned long addr) +huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; @@ -75,7 +75,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) * Don't actually need to do any preparation, but need to make sure * the address is in the right region. */ -int prepare_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; @@ -149,7 +150,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u /* Handle MAP_FIXED */ if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1a96cc891cf..c94dc71af98 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -128,7 +128,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return NULL; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pg; pud_t *pu; diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index f4b6124fdb7..9162dc84f77 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -72,7 +72,8 @@ void arch_release_hugepage(struct page *page) page[1].index = 0; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; pud_t *pudp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index ae8c321d6e2..2f9dbe0ef4a 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -22,7 +22,8 @@ #include #include -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index ebefd2a1437..1307b23f6a7 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -175,7 +175,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long 
addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } @@ -195,7 +195,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, pgoff, flags); } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 0b3d567e686..52476fde899 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 1; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; @@ -368,7 +369,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 428eff5b73f..516c581b537 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -80,6 +80,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_path.dentry->d_inode; loff_t len, vma_len; int ret; + struct hstate *h = hstate_file(file); /* * vma address alignment (but not the pgoff alignment) has @@ -92,7 +93,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; - if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) + if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); @@ -104,8 +105,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); if (hugetlb_reserve_pages(inode, - vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), - len >> HPAGE_SHIFT, vma)) + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma)) goto out; ret = 0; @@ -130,20 +131,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; + struct hstate *h = hstate_file(file); - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -156,7 +158,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, start_addr = TASK_UNMAPPED_BASE; full_search: - addr = ALIGN(start_addr, HPAGE_SIZE); + addr = ALIGN(start_addr, huge_page_size(h)); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). 
*/ @@ -174,7 +176,7 @@ full_search: if (!vma || addr + len <= vma->vm_start) return addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } #endif @@ -225,10 +227,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { + struct hstate *h = hstate_file(filp); struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - unsigned long index = *ppos >> HPAGE_SHIFT; - unsigned long offset = *ppos & ~HPAGE_MASK; + unsigned long index = *ppos >> huge_page_shift(h); + unsigned long offset = *ppos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; @@ -243,17 +246,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, if (!isize) goto out; - end_index = (isize - 1) >> HPAGE_SHIFT; + end_index = (isize - 1) >> huge_page_shift(h); for (;;) { struct page *page; - int nr, ret; + unsigned long nr, ret; /* nr is the maximum number of bytes to copy from this page */ - nr = HPAGE_SIZE; + nr = huge_page_size(h); if (index >= end_index) { if (index > end_index) goto out; - nr = ((isize - 1) & ~HPAGE_MASK) + 1; + nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) { goto out; } @@ -287,8 +290,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, offset += ret; retval += ret; len -= ret; - index += offset >> HPAGE_SHIFT; - offset &= ~HPAGE_MASK; + index += offset >> huge_page_shift(h); + offset &= ~huge_page_mask(h); if (page) page_cache_release(page); @@ -298,7 +301,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, break; } out: - *ppos = ((loff_t)index << HPAGE_SHIFT) + offset; + *ppos = ((loff_t)index << huge_page_shift(h)) + offset; mutex_unlock(&inode->i_mutex); return retval; } @@ -339,8 +342,9 @@ static void truncate_huge_page(struct page *page) static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> HPAGE_SHIFT; + const pgoff_t start = lstart >> huge_page_shift(h); struct pagevec pvec; pgoff_t next; int i, freed = 0; @@ -449,8 +453,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); - BUG_ON(offset & ~HPAGE_MASK); + BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); @@ -465,6 +470,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -476,7 +482,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) { error = -EINVAL; - if (!(attr->ia_size & ~HPAGE_MASK)) + if (!(attr->ia_size & ~huge_page_mask(h))) error = hugetlb_vmtruncate(inode, attr->ia_size); if (error) goto out; @@ -610,9 +616,10 @@ static int hugetlbfs_set_page_dirty(struct page *page) static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); + struct hstate *h = hstate_inode(dentry->d_inode); buf->f_type = HUGETLBFS_MAGIC; - buf->f_bsize = HPAGE_SIZE; + buf->f_bsize = huge_page_size(h); if (sbinfo) { 
spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 for max/free/used @@ -942,7 +949,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) goto out_dentry; error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/asm-ia64/hugetlb.h b/include/asm-ia64/hugetlb.h index e9d1e5e2382..da55c63728e 100644 --- a/include/asm-ia64/hugetlb.h +++ b/include/asm-ia64/hugetlb.h @@ -8,7 +8,8 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int prepare_hugepage_range(unsigned long addr, unsigned long len); +int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len); static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/include/asm-powerpc/hugetlb.h b/include/asm-powerpc/hugetlb.h index 0a37aa5ecaa..ca37c4af27b 100644 --- a/include/asm-powerpc/hugetlb.h +++ b/include/asm-powerpc/hugetlb.h @@ -21,7 +21,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-s390/hugetlb.h b/include/asm-s390/hugetlb.h index 600a776f8f7..670a1d1745d 100644 --- a/include/asm-s390/hugetlb.h +++ b/include/asm-s390/hugetlb.h @@ -22,7 +22,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-sh/hugetlb.h b/include/asm-sh/hugetlb.h index fb30018938c..967068fb79a 100644 --- a/include/asm-sh/hugetlb.h +++ b/include/asm-sh/hugetlb.h @@ -14,7 +14,8 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-sparc/hugetlb.h b/include/asm-sparc/hugetlb.h index aeb92374ca3..177061064ee 100644 --- a/include/asm-sparc/hugetlb.h +++ b/include/asm-sparc/hugetlb.h @@ -22,7 +22,8 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
*/ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-x86/hugetlb.h b/include/asm-x86/hugetlb.h index 7eed6e0883b..439a9acc132 100644 --- a/include/asm-x86/hugetlb.h +++ b/include/asm-x86/hugetlb.h @@ -14,11 +14,13 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { - if (len & ~HPAGE_MASK) + struct hstate *h = hstate_file(file); + if (len & ~huge_page_mask(h)) return -EINVAL; - if (addr & ~HPAGE_MASK) + if (addr & ~huge_page_mask(h)) return -EINVAL; return 0; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index abbc187193a..ad2271e11f9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -8,7 +8,6 @@ #include #include #include -#include struct ctl_table; @@ -45,7 +44,8 @@ extern int sysctl_hugetlb_shm_group; /* arch callbacks */ -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, @@ -80,7 +80,7 @@ static inline unsigned long hugetlb_total_pages(void) #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define prepare_hugepage_range(addr,len) (-EINVAL) +#define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) @@ -134,8 +134,6 @@ struct file *hugetlb_file_setup(const char *name, size_t); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); -#define BLOCKS_PER_HUGEPAGE (HPAGE_SIZE / 512) - static inline int is_file_hugepages(struct file *file) { if (file->f_op == &hugetlbfs_file_operations) @@ -164,4 +162,84 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +#ifdef CONFIG_HUGETLB_PAGE + +/* Defines one hugetlb page size */ +struct hstate { + int hugetlb_next_nid; + unsigned int order; + unsigned long mask; + unsigned long max_huge_pages; + unsigned long nr_huge_pages; + unsigned long free_huge_pages; + unsigned long resv_huge_pages; + unsigned long surplus_huge_pages; + unsigned long nr_overcommit_huge_pages; + struct list_head hugepage_freelists[MAX_NUMNODES]; + unsigned int nr_huge_pages_node[MAX_NUMNODES]; + unsigned int free_huge_pages_node[MAX_NUMNODES]; + unsigned int surplus_huge_pages_node[MAX_NUMNODES]; +}; + +extern struct hstate default_hstate; + +static inline struct hstate *hstate_vma(struct vm_area_struct *vma) +{ + return &default_hstate; +} + +static inline struct hstate *hstate_file(struct file *f) +{ + return &default_hstate; +} + +static inline struct hstate *hstate_inode(struct inode *i) +{ + return 
&default_hstate; +} + +static inline unsigned long huge_page_size(struct hstate *h) +{ + return (unsigned long)PAGE_SIZE << h->order; +} + +static inline unsigned long huge_page_mask(struct hstate *h) +{ + return h->mask; +} + +static inline unsigned int huge_page_order(struct hstate *h) +{ + return h->order; +} + +static inline unsigned huge_page_shift(struct hstate *h) +{ + return h->order + PAGE_SHIFT; +} + +static inline unsigned int pages_per_huge_page(struct hstate *h) +{ + return 1 << h->order; +} + +static inline unsigned int blocks_per_huge_page(struct hstate *h) +{ + return huge_page_size(h) / 512; +} + +#include + +#else +struct hstate {}; +#define hstate_file(f) NULL +#define hstate_vma(v) NULL +#define hstate_inode(i) NULL +#define huge_page_size(h) PAGE_SIZE +#define huge_page_mask(h) PAGE_MASK +#define huge_page_order(h) 0 +#define huge_page_shift(h) PAGE_SHIFT +#define pages_per_huge_page(h) 1 +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/ipc/shm.c b/ipc/shm.c index 790240cd067..a726aebce7d 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -577,7 +577,8 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; + struct hstate *h = hstate_file(shp->shm_file); + *rss += pages_per_huge_page(h) * mapping->nrpages; } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 32dff4290c6..0d8153e25f0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,18 +22,12 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; -static unsigned long surplus_huge_pages; -static unsigned long nr_overcommit_huge_pages; unsigned long max_huge_pages; unsigned long sysctl_overcommit_huge_pages; -static struct list_head hugepage_freelists[MAX_NUMNODES]; -static unsigned int nr_huge_pages_node[MAX_NUMNODES]; -static unsigned int free_huge_pages_node[MAX_NUMNODES]; -static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_next_nid; + +struct hstate default_hstate; /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t) * Convert the address within this vma to the page offset within * the mapping, in pagecache page units; huge pages here. 
*/ -static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, - unsigned long address) +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { - return ((address - vma->vm_start) >> HPAGE_SHIFT) + - (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); } /* @@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) } /* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) +static void decrement_hugepage_resv_vma(struct hstate *h, + struct vm_area_struct *vma) { if (vma->vm_flags & VM_NORESERVE) return; if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ - resv_huge_pages--; + h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Only the process that called mmap() has reserves for * private mappings. */ - resv_huge_pages--; + h->resv_huge_pages--; } } @@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) return 1; } -static void clear_huge_page(struct page *page, unsigned long addr) +static void clear_huge_page(struct page *page, + unsigned long addr, unsigned long sz) { int i; might_sleep(); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); clear_user_highpage(page + i, addr + i * PAGE_SIZE); } @@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; + struct hstate *h = hstate_vma(vma); might_sleep(); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } -static void enqueue_huge_page(struct page *page) +static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &hugepage_freelists[nid]); - free_huge_pages++; - free_huge_pages_node[nid]++; + list_add(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct hstate *h) { int nid; struct page *page = NULL; for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; break; } } return page; } -static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { int nid; @@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, * not "stolen". 
The child may still get SIGKILLed */ if (!vma_has_private_reserves(vma) && - free_huge_pages - resv_huge_pages == 0) + h->free_huge_pages - h->resv_huge_pages == 0) return NULL; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) return NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + !list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; if (!avoid_reserve) - decrement_hugepage_resv_vma(vma); + decrement_hugepage_resv_vma(h, vma); break; } @@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, return page; } -static void update_and_free_page(struct page *page) +static void update_and_free_page(struct hstate *h, struct page *page) { int i; - nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); @@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page) set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); } static void free_huge_page(struct page *page) { + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = &default_hstate; int nid = page_to_nid(page); struct address_space *mapping; @@ -466,12 +470,12 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (surplus_huge_pages_node[nid]) { - update_and_free_page(page); - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + if (h->surplus_huge_pages_node[nid]) { + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; } else { - enqueue_huge_page(page); + enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); if (mapping) @@ -483,7 +487,7 @@ static void free_huge_page(struct page *page) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. 
*/ -static int adjust_pool_surplus(int delta) +static int adjust_pool_surplus(struct hstate *h, int delta) { static int prev_nid; int nid = prev_nid; @@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta) nid = first_node(node_online_map); /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !surplus_huge_pages_node[nid]) + if (delta < 0 && !h->surplus_huge_pages_node[nid]) continue; /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && surplus_huge_pages_node[nid] >= - nr_huge_pages_node[nid]) + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) continue; - surplus_huge_pages += delta; - surplus_huge_pages_node[nid] += delta; + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; ret = 1; break; } while (nid != prev_nid); @@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta) return ret; } -static void prep_new_huge_page(struct page *page, int nid) +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); put_page(page); /* free it into the hugepage allocator */ } -static struct page *alloc_fresh_huge_page_node(int nid) +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, HUGETLB_PAGE_ORDER); return NULL; } - prep_new_huge_page(page, nid); + prep_new_huge_page(h, page, nid); } return page; } -static int alloc_fresh_huge_page(void) +static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = hugetlb_next_nid; + start_nid = h->hugetlb_next_nid; do { - page = alloc_fresh_huge_page_node(hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; /* @@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ - next_nid = next_node(hugetlb_next_nid, node_online_map); + next_nid = next_node(h->hugetlb_next_nid, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - hugetlb_next_nid = next_nid; - } while (!page && hugetlb_next_nid != start_nid); + h->hugetlb_next_nid = next_nid; + } while (!page && h->hugetlb_next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void) return ret; } -static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct page *page; unsigned int nid; @@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * per-node value is checked there. 
*/ spin_lock(&hugetlb_lock); - if (surplus_huge_pages >= nr_overcommit_huge_pages) { + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { - nr_huge_pages++; - surplus_huge_pages++; + h->nr_huge_pages++; + h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); spin_lock(&hugetlb_lock); if (page) { @@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, /* * We incremented the global counters already */ - nr_huge_pages_node[nid]++; - surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[nid]++; + h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { - nr_huge_pages--; - surplus_huge_pages--; + h->nr_huge_pages--; + h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ -static int gather_surplus_pages(int delta) +static int gather_surplus_pages(struct hstate *h, int delta) { struct list_head surplus_list; struct page *page, *tmp; int ret, i; int needed, allocated; - needed = (resv_huge_pages + delta) - free_huge_pages; + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { - resv_huge_pages += delta; + h->resv_huge_pages += delta; return 0; } @@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(NULL, 0); + page = alloc_buddy_huge_page(h, NULL, 0); if (!page) { /* * We were not able to allocate enough pages to @@ -694,7 +698,8 @@ retry: * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock(&hugetlb_lock); - needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); if (needed > 0) goto retry; @@ -707,7 +712,7 @@ retry: * before they are reserved. */ needed += allocated; - resv_huge_pages += delta; + h->resv_huge_pages += delta; ret = 0; free: /* Free the needed pages to the hugetlb pool */ @@ -715,7 +720,7 @@ free: if ((--needed) < 0) break; list_del(&page->lru); - enqueue_huge_page(page); + enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ @@ -743,7 +748,8 @@ free: * allocated to satisfy the reservation must be explicitly freed if they were * never used. 
*/ -static void return_unused_surplus_pages(unsigned long unused_resv_pages) +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) { static int nid = -1; struct page *page; @@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) unsigned long remaining_iterations = num_online_nodes(); /* Uncommit the reservation */ - resv_huge_pages -= unused_resv_pages; + h->resv_huge_pages -= unused_resv_pages; - nr_pages = min(unused_resv_pages, surplus_huge_pages); + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); - if (!surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) continue; - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[nid]--; - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; nr_pages--; remaining_iterations = num_online_nodes(); } @@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) +static int vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) } else { int err; - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); err = region_chg(&reservations->regions, idx, idx + 1); @@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) return 0; } } -static void vma_commit_reservation(struct vm_area_struct *vma, - unsigned long addr) +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); /* Mark this page used in the map. 
*/ @@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma, static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hstate *h = hstate_vma(vma); struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, * MAP_NORESERVE mappings may also need pages and quota allocated * if no reserve mapping overlaps. */ - chg = vma_needs_reservation(vma, addr); + chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(chg); if (chg) @@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr, avoid_reserve); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(vma, addr); + page = alloc_buddy_huge_page(h, vma, addr); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); @@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); - vma_commit_reservation(vma, addr); + vma_commit_reservation(h, vma, addr); return page; } @@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, static int __init hugetlb_init(void) { unsigned long i; + struct hstate *h = &default_hstate; if (HPAGE_SHIFT == 0) return 0; + if (!h->order) { + h->order = HPAGE_SHIFT - PAGE_SHIFT; + h->mask = HPAGE_MASK; + } + for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_freelists[i]); - hugetlb_next_nid = first_node(node_online_map); + h->hugetlb_next_nid = first_node(node_online_map); for (i = 0; i < max_huge_pages; ++i) { - if (!alloc_fresh_huge_page()) + if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = free_huge_pages = nr_huge_pages = i; - printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); + max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; + printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", + h->free_huge_pages); return 0; } module_init(hugetlb_init); @@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM -static void try_to_free_low(unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count) { int i; for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; - list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { - if (count >= nr_huge_pages) + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) return; if (PageHighMem(page)) continue; list_del(&page->lru); update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[page_to_nid(page)]--; + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; } } } #else -static inline void try_to_free_low(unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count) { } #endif -#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(unsigned long count) { unsigned long min_count, ret; + struct hstate *h = &default_hstate; /* * Increase 
the pool size @@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count) * within all the constraints specified by the sysctls. */ spin_lock(&hugetlb_lock); - while (surplus_huge_pages && count > persistent_huge_pages) { - if (!adjust_pool_surplus(-1)) + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, -1)) break; } - while (count > persistent_huge_pages) { + while (count > persistent_huge_pages(h)) { /* * If this allocation races such that we no longer need the * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(); + ret = alloc_fresh_huge_page(h); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count) * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. */ - min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(min_count); - while (min_count < persistent_huge_pages) { - struct page *page = dequeue_huge_page(); + try_to_free_low(h, min_count); + while (min_count < persistent_huge_pages(h)) { + struct page *page = dequeue_huge_page(h); if (!page) break; - update_and_free_page(page); + update_and_free_page(h, page); } - while (count < persistent_huge_pages) { - if (!adjust_pool_surplus(1)) + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, 1)) break; } out: - ret = persistent_huge_pages; + ret = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); return ret; } @@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; proc_doulongvec_minmax(table, write, file, buffer, length, ppos); spin_lock(&hugetlb_lock); - nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; + h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; spin_unlock(&hugetlb_lock); return 0; } @@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int hugetlb_report_meminfo(char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %5lu kB\n", - nr_huge_pages, - free_huge_pages, - resv_huge_pages, - surplus_huge_pages, - HPAGE_SIZE/1024); + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } int hugetlb_report_node_meminfo(int nid, char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", - nid, nr_huge_pages_node[nid], - nid, free_huge_pages_node[nid], - nid, surplus_huge_pages_node[nid]); + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ unsigned long hugetlb_total_pages(void) { - return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); + struct hstate *h = &default_hstate; + return h->nr_huge_pages * pages_per_huge_page(h); } -static int hugetlb_acct_memory(long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; @@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta) * semantics that cpuset has. */ if (delta > 0) { - if (gather_surplus_pages(delta) < 0) + if (gather_surplus_pages(h, delta) < 0) goto out; - if (delta > cpuset_mems_nr(free_huge_pages_node)) { - return_unused_surplus_pages(delta); + if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) - return_unused_surplus_pages((unsigned long) -delta); + return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock(&hugetlb_lock); @@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) static void hugetlb_vm_op_close(struct vm_area_struct *vma) { + struct hstate *h = hstate_vma(vma); struct resv_map *reservations = vma_resv_map(vma); unsigned long reserve; unsigned long start; unsigned long end; if (reservations) { - start = vma_hugecache_offset(vma, vma->vm_start); - end = vma_hugecache_offset(vma, vma->vm_end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(&reservations->regions, start, end); @@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); if (reserve) - hugetlb_acct_memory(-reserve); + hugetlb_acct_memory(h, -reserve); } } @@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr); + dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) goto nomem; @@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t pte; struct page *page; struct page *tmp; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + /* * A page gathering list, protected by per file i_mmap_lock. 
The * lock is used to avoid list corruption from multiple unmapping @@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON(start & ~HPAGE_MASK); - BUG_ON(end & ~HPAGE_MASK); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); spin_lock(&mm->page_table_lock); - for (address = start; address < end; address += HPAGE_SIZE) { + for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) { + struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; @@ -1443,7 +1471,7 @@ retry_avoidcopy: __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); - ptep = huge_pte_offset(mm, address & HPAGE_MASK); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); @@ -1458,14 +1486,14 @@ retry_avoidcopy: } /* Return the pagecache page at a given address within a VMA */ -static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *hugetlbfs_pagecache_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping; pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); return find_lock_page(mapping, idx); } @@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { + struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; pgoff_t idx; unsigned long size; @@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); /* * Use page lock to guard against racing truncation @@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; page = alloc_huge_page(vma, address, 0); @@ -1510,7 +1539,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address); + clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_SHARED) { @@ -1526,14 +1555,14 @@ retry: } spin_lock(&inode->i_lock); - inode->i_blocks += BLOCKS_PER_HUGEPAGE; + inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); } else lock_page(page); } spin_lock(&mm->page_table_lock); - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret; static DEFINE_MUTEX(hugetlb_instantiation_mutex); + struct hstate *h = hstate_vma(vma); - ptep = huge_pte_alloc(mm, address); + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return 
VM_FAULT_OOM; @@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(entry, huge_ptep_get(ptep)))) if (write_access && !pte_write(entry)) { struct page *page; - page = hugetlbfs_pagecache_page(vma, address); + page = hugetlbfs_pagecache_page(h, vma, address); ret = hugetlb_cow(mm, vma, address, ptep, entry, page); if (page) { unlock_page(page); @@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; + struct hstate *h = hstate_vma(vma); spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * each hugepage. We have to make * sure we get the * first, for the page indexing below to work. */ - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); if (!pte || huge_pte_none(huge_ptep_get(pte)) || (write && !pte_write(huge_ptep_get(pte)))) { @@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { @@ -1660,7 +1691,7 @@ same_page: --remainder; ++i; if (vaddr < vma->vm_end && remainder && - pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + pfn_offset < pages_per_huge_page(h)) { /* * We use pfn_offset to avoid touching the pageframes * of this compound page. @@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long start = address; pte_t *ptep; pte_t pte; + struct hstate *h = hstate_vma(vma); BUG_ON(address >= end); flush_cache_range(vma, address, end); spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); spin_lock(&mm->page_table_lock); - for (; address < end; address += HPAGE_SIZE) { + for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma) { long ret, chg; + struct hstate *h = hstate_inode(inode); if (vma && vma->vm_flags & VM_NORESERVE) return 0; @@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode, if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; - ret = hugetlb_acct_memory(chg); + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; @@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode, void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { + struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; + inode->i_blocks -= blocks_per_huge_page(h); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); - hugetlb_acct_memory(-(chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); } diff --git a/mm/memory.c b/mm/memory.c index 72932489a08..c1c1d6d8c22 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -903,7 +903,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (unlikely(is_vm_hugetlb_page(vma))) { unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / - (HPAGE_SIZE / PAGE_SIZE); + pages_per_huge_page(hstate_vma(vma)); start = end; } else start = unmap_page_range(*tlbp, vma, 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c94e58b192c..e550bec2058 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { zl = node_zonelist(interleave_nid(*mpol, vma, addr, - HPAGE_SHIFT), gfp_flags); + huge_page_shift(hstate_vma(vma))), gfp_flags); } else { zl = policy_zonelist(gfp_flags, *mpol); if ((*mpol)->mode == MPOL_BIND) @@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma, { unsigned long addr; struct page *page; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); - for (addr = start; addr < end; addr += HPAGE_SIZE) { - pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + for (addr = start; addr < end; addr += sz) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, + addr & huge_page_mask(h)); pte_t pte; if (!ptep) diff --git a/mm/mmap.c b/mm/mmap.c index 57d3b6097de..5e0cc99e9cd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1812,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, struct mempolicy *pol; struct vm_area_struct *new; - if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) + if (is_vm_hugetlb_page(vma) && (addr & + ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; if (mm->map_count >= sysctl_max_map_count) -- cgit v1.2.3-70-g09d2 From a137e1cc6d6e7d315fef03962a2a5a113348b13b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:43 -0700 Subject: hugetlbfs: per mount huge page sizes Add the ability to configure the hugetlb hstate used on a per mount basis. - Add a new pagesize= option to the hugetlbfs mount that allows setting the page size - This option causes the mount code to find the hstate corresponding to the specified size, and sets up a pointer to the hstate in the mount's superblock. - Change the hstate accessors to use this information rather than the global_hstate they were using (requires a slight change in mm/memory.c so we don't NULL deref in the error-unmap path -- see comments). 
[np: take hstate out of hugetlbfs inode and vma->vm_private_data] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 45 ++++++++++++++++++++++++++++++++++++--------- include/linux/hugetlb.h | 14 +++++++++----- mm/hugetlb.c | 16 +++------------- mm/memory.c | 18 ++++++++++++++++-- 4 files changed, 64 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 516c581b537..dbd01d262ca 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -53,6 +53,7 @@ int sysctl_hugetlb_shm_group; enum { Opt_size, Opt_nr_inodes, Opt_mode, Opt_uid, Opt_gid, + Opt_pagesize, Opt_err, }; @@ -62,6 +63,7 @@ static match_table_t tokens = { {Opt_mode, "mode=%o"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, + {Opt_pagesize, "pagesize=%s"}, {Opt_err, NULL}, }; @@ -750,6 +752,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) char *p, *rest; substring_t args[MAX_OPT_ARGS]; int option; + unsigned long long size = 0; + enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; if (!options) return 0; @@ -780,17 +784,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; case Opt_size: { - unsigned long long size; /* memparse() will accept a K/M/G without a digit */ if (!isdigit(*args[0].from)) goto bad_val; size = memparse(args[0].from, &rest); - if (*rest == '%') { - size <<= HPAGE_SHIFT; - size *= max_huge_pages; - do_div(size, 100); - } - pconfig->nr_blocks = (size >> HPAGE_SHIFT); + setsize = SIZE_STD; + if (*rest == '%') + setsize = SIZE_PERCENT; break; } @@ -801,6 +801,19 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) pconfig->nr_inodes = memparse(args[0].from, &rest); break; + case Opt_pagesize: { + unsigned long ps; + ps = memparse(args[0].from, &rest); + pconfig->hstate = size_to_hstate(ps); + if (!pconfig->hstate) { + printk(KERN_ERR + "hugetlbfs: Unsupported page size %lu MB\n", + ps >> 20); + return -EINVAL; + } + break; + } + default: printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", p); @@ -808,6 +821,18 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; } } + + /* Do size after hstate is set up */ + if (setsize > NO_SIZE) { + struct hstate *h = pconfig->hstate; + if (setsize == SIZE_PERCENT) { + size <<= huge_page_shift(h); + size *= h->max_huge_pages; + do_div(size, 100); + } + pconfig->nr_blocks = (size >> huge_page_shift(h)); + } + return 0; bad_val: @@ -832,6 +857,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) config.uid = current->fsuid; config.gid = current->fsgid; config.mode = 0755; + config.hstate = &default_hstate; ret = hugetlbfs_parse_options(data, &config); if (ret) return ret; @@ -840,14 +866,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); sbinfo->max_blocks = config.nr_blocks; sbinfo->free_blocks = config.nr_blocks; sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = HPAGE_SIZE; - sb->s_blocksize_bits = HPAGE_SHIFT; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; diff --git 
a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b75bdb4deba..ba9263e631b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -100,6 +100,7 @@ struct hugetlbfs_config { umode_t mode; long nr_blocks; long nr_inodes; + struct hstate *hstate; }; struct hugetlbfs_sb_info { @@ -108,6 +109,7 @@ struct hugetlbfs_sb_info { long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; + struct hstate *hstate; }; @@ -191,19 +193,21 @@ extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) -static inline struct hstate *hstate_vma(struct vm_area_struct *vma) +static inline struct hstate *hstate_inode(struct inode *i) { - return &default_hstate; + struct hugetlbfs_sb_info *hsb; + hsb = HUGETLBFS_SB(i->i_sb); + return hsb->hstate; } static inline struct hstate *hstate_file(struct file *f) { - return &default_hstate; + return hstate_inode(f->f_dentry->d_inode); } -static inline struct hstate *hstate_inode(struct inode *i) +static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { - return &default_hstate; + return hstate_file(vma->vm_file); } static inline unsigned long huge_page_size(struct hstate *h) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 82378d44a0c..4cf7a90e914 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1439,19 +1439,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - /* - * It is undesirable to test vma->vm_file as it should be non-null - * for valid hugetlb area. However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file before calling this function - * to clean up. Since no pte has actually been setup, it is safe to - * do nothing in this case. - */ - if (vma->vm_file) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end, ref_page); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); - } + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + __unmap_hugepage_range(vma, start, end, ref_page); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); } /* diff --git a/mm/memory.c b/mm/memory.c index c1c1d6d8c22..02fc6b1047b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -901,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end, NULL); - zap_work -= (end - start) / + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. 
+ */ + if (vma->vm_file) { + unmap_hugepage_range(vma, start, end, NULL); + zap_work -= (end - start) / pages_per_huge_page(hstate_vma(vma)); + } + start = end; } else start = unmap_page_range(*tlbp, vma, -- cgit v1.2.3-70-g09d2 From f4a67cceee4a6f5ed38011a698c9e34747270ae5 Mon Sep 17 00:00:00 2001 From: Jon Tollefson Date: Wed, 23 Jul 2008 21:27:55 -0700 Subject: fs: check for statfs overflow Adds a check for an overflow in the filesystem size so if someone is checking with statfs() on a 16G blocksize hugetlbfs in a 32bit binary that it will report back EOVERFLOW instead of a size of 0. Acked-by: Nishanth Aravamudan Signed-off-by: Jon Tollefson Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 8 ++++---- fs/open.c | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index ed43e17a5dc..b4660428176 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -197,8 +197,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * { if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & - 0xffffffff00000000ULL) + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ @@ -271,8 +271,8 @@ out: static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & - 0xffffffff00000000ULL) + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ diff --git a/fs/open.c b/fs/open.c index a99ad09c319..bb98d2fe809 100644 --- a/fs/open.c +++ b/fs/open.c @@ -64,7 +64,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) memcpy(buf, &st, sizeof(st)); else { if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail) & + if ((st.f_blocks | st.f_bfree | st.f_bavail | + st.f_bsize | st.f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* -- cgit v1.2.3-70-g09d2 From 6e2c10a12a2170856f5582d62d583cbcd1cb5eaf Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Jul 2008 21:29:15 -0700 Subject: binfmt_misc: use simple_read_from_buffer() Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 7191306367c..756205314c2 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -535,31 +536,16 @@ static ssize_t bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) { Node *e = file->f_path.dentry->d_inode->i_private; - loff_t pos = *ppos; ssize_t res; char *page; - int len; if (!(page = (char*) __get_free_page(GFP_KERNEL))) return -ENOMEM; entry_status(e, page); - len = strlen(page); - res = -EINVAL; - if (pos < 0) - goto out; - res = 0; - if (pos >= len) - goto out; - if (len < pos + nbytes) - nbytes = len - pos; - res = -EFAULT; - if (copy_to_user(buf, page + pos, nbytes)) - goto out; - *ppos = pos + nbytes; - res = nbytes; -out: + res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); + 
free_page((unsigned long) page); return res; } -- cgit v1.2.3-70-g09d2 From 7d9dbca34240ebb6ff88d8a29c6c7bffd098f0c1 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:22 -0700 Subject: flag parameters: anon_inode_getfd extension This patch just extends the anon_inode_getfd interface to take an additional parameter with a flag value. The flag value is passed on to get_unused_fd_flags in anticipation for a use with the O_CLOEXEC flag. No actual semantic changes here, the changed callers all pass 0 for now. [akpm@linux-foundation.org: KVM fix] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/anon_inodes.c | 9 +++++---- fs/eventfd.c | 2 +- fs/eventpoll.c | 2 +- fs/signalfd.c | 3 ++- fs/timerfd.c | 2 +- include/linux/anon_inodes.h | 2 +- virt/kvm/kvm_main.c | 4 ++-- 7 files changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 977ef208c05..1a4eee620b0 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -58,8 +58,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * of the file * * @name: [in] name of the "class" of the new file - * @fops [in] file operations for the new file - * @priv [in] private data for the new file (will be file's private_data) + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. @@ -68,7 +69,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * setup. Returns new descriptor or -error. */ int anon_inode_getfd(const char *name, const struct file_operations *fops, - void *priv) + void *priv, int flags) { struct qstr this; struct dentry *dentry; @@ -78,7 +79,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, if (IS_ERR(anon_inode_inode)) return -ENODEV; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) return error; fd = error; diff --git a/fs/eventfd.c b/fs/eventfd.c index 343942deeec..6094265ca40 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -214,7 +214,7 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx); + fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 0); if (fd < 0) kfree(ctx); return fd; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 990c01d2d66..9392dd96812 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1068,7 +1068,7 @@ asmlinkage long sys_epoll_create(int size) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep); + fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 0); if (fd < 0) ep_free(ep); diff --git a/fs/signalfd.c b/fs/signalfd.c index 619725644c7..ddb328b74bd 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -227,7 +227,8 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. 
*/ - ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx); + ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, + 0); if (ufd < 0) kfree(ctx); } else { diff --git a/fs/timerfd.c b/fs/timerfd.c index d87d354ec42..77c2bc92cbe 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -198,7 +198,7 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); - ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx); + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 0); if (ufd < 0) kfree(ctx); diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 6129e58ca7c..e0a0cdc2da4 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -9,7 +9,7 @@ #define _LINUX_ANON_INODES_H int anon_inode_getfd(const char *name, const struct file_operations *fops, - void *priv); + void *priv, int flags); #endif /* _LINUX_ANON_INODES_H */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 904d7b7bd78..a845890b680 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -902,7 +902,7 @@ static const struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu); + int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); if (fd < 0) kvm_put_kvm(vcpu->kvm); return fd; @@ -1261,7 +1261,7 @@ static int kvm_dev_ioctl_create_vm(void) kvm = kvm_create_vm(); if (IS_ERR(kvm)) return PTR_ERR(kvm); - fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm); + fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); if (fd < 0) kvm_put_kvm(kvm); -- cgit v1.2.3-70-g09d2 From 9deb27baedb79759c3ab9435a7d8b841842d56e9 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:24 -0700 Subject: flag parameters: signalfd This patch adds the new signalfd4 syscall. It extends the old signalfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is SFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name SFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_signalfd4 # ifdef __x86_64__ # define __NR_signalfd4 289 # elif defined __i386__ # define __NR_signalfd4 327 # else # error "need __NR_signalfd4" # endif #endif #define SFD_CLOEXEC O_CLOEXEC int main (void) { sigset_t ss; sigemptyset (&ss); sigaddset (&ss, SIGUSR1); int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0); if (fd == -1) { puts ("signalfd4(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("signalfd4(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC); if (fd == -1) { puts ("signalfd4(SFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + fs/compat.c | 14 ++++++++++---- fs/signalfd.c | 14 ++++++++++++-- include/asm-x86/unistd_32.h | 1 + include/asm-x86/unistd_64.h | 2 ++ include/linux/signalfd.h | 5 +++++ include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + 9 files changed, 34 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 021d71bc69b..c308128b925 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -826,4 +826,5 @@ ia32_sys_call_table: .quad sys32_fallocate .quad compat_sys_timerfd_settime /* 325 */ .quad compat_sys_timerfd_gettime + .quad compat_sys_signalfd4 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff5562f5f..c12a36c9fd5 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -326,3 +326,4 @@ ENTRY(sys_call_table) .long sys_fallocate .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime + .long sys_signalfd4 diff --git a/fs/compat.c b/fs/compat.c index b4660428176..106eba28ec5 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -2131,9 +2131,9 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, #ifdef CONFIG_SIGNALFD -asmlinkage long compat_sys_signalfd(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) +asmlinkage long compat_sys_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags) { compat_sigset_t ss32; sigset_t tmp; @@ -2148,9 +2148,15 @@ asmlinkage long compat_sys_signalfd(int ufd, if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; - return sys_signalfd(ufd, ksigmask, sizeof(sigset_t)); + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); } +asmlinkage long compat_sys_signalfd(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} #endif /* CONFIG_SIGNALFD */ #ifdef CONFIG_TIMERFD diff --git a/fs/signalfd.c b/fs/signalfd.c index ddb328b74bd..c8609fa51a1 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -205,11 +205,15 @@ static const struct file_operations signalfd_fops = { .read = 
signalfd_read, }; -asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask) +asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, + size_t sizemask, int flags) { sigset_t sigmask; struct signalfd_ctx *ctx; + if (flags & ~SFD_CLOEXEC) + return -EINVAL; + if (sizemask != sizeof(sigset_t) || copy_from_user(&sigmask, user_mask, sizeof(sigmask))) return -EINVAL; @@ -228,7 +232,7 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * anon_inode_getfd() will install the fd. */ ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, - 0); + flags & O_CLOEXEC); if (ufd < 0) kfree(ctx); } else { @@ -250,3 +254,9 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas return ufd; } + +asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, + size_t sizemask) +{ + return sys_signalfd4(ufd, user_mask, sizemask, 0); +} diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index 8317d94771d..c310371f561 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -332,6 +332,7 @@ #define __NR_fallocate 324 #define __NR_timerfd_settime 325 #define __NR_timerfd_gettime 326 +#define __NR_signalfd4 327 #ifdef __KERNEL__ diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index e323994a370..e0a9b45b234 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -641,6 +641,8 @@ __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) #define __NR_paccept 288 __SYSCALL(__NR_paccept, sys_paccept) +#define __NR_signalfd4 289 +__SYSCALL(__NR_signalfd4, sys_signalfd4) #ifndef __NO_STUBS diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index ea037f28df9..8b3f7b7420a 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -8,6 +8,11 @@ #ifndef _LINUX_SIGNALFD_H #define _LINUX_SIGNALFD_H +/* For O_CLOEXEC */ +#include + +/* Flags for signalfd4. */ +#define SFD_CLOEXEC O_CLOEXEC struct signalfd_siginfo { __u32 ssi_signo; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2a2a40af6b2..1c270779784 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -610,6 +610,7 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask); +asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags); asmlinkage long sys_timerfd_create(int clockid, int flags); asmlinkage long sys_timerfd_settime(int ufd, int flags, const struct itimerspec __user *utmr, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2f0b8a2e600..8627c89ae9e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -156,6 +156,7 @@ cond_syscall(sys_ioprio_get); /* New file descriptors */ cond_syscall(sys_signalfd); +cond_syscall(sys_signalfd4); cond_syscall(compat_sys_signalfd); cond_syscall(sys_timerfd_create); cond_syscall(sys_timerfd_settime); -- cgit v1.2.3-70-g09d2 From b087498eb5605673b0f260a7620d91818cd72304 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:25 -0700 Subject: flag parameters: eventfd This patch adds the new eventfd2 syscall. It extends the old eventfd syscall by one parameter which is meant to hold a flag value. 
In this patch the only flag support is EFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_eventfd2 # ifdef __x86_64__ # define __NR_eventfd2 290 # elif defined __i386__ # define __NR_eventfd2 328 # else # error "need __NR_eventfd2" # endif #endif #define EFD_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_eventfd2, 1, 0); if (fd == -1) { puts ("eventfd2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("eventfd2(0) sets close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_eventfd2, 1, EFD_CLOEXEC); if (fd == -1) { puts ("eventfd2(EFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("eventfd2(EFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + fs/eventfd.c | 13 +++++++++++-- include/asm-x86/unistd_32.h | 1 + include/asm-x86/unistd_64.h | 2 ++ include/linux/eventfd.h | 6 ++++++ include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + 8 files changed, 24 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index c308128b925..cf0eb31745c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -827,4 +827,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd_settime /* 325 */ .quad compat_sys_timerfd_gettime .quad compat_sys_signalfd4 + .quad sys_eventfd2 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index c12a36c9fd5..cf112cb11c3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -327,3 +327,4 @@ ENTRY(sys_call_table) .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime .long sys_signalfd4 + .long sys_eventfd2 diff --git a/fs/eventfd.c b/fs/eventfd.c index 6094265ca40..bd420e6478a 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -198,11 +198,14 @@ struct file *eventfd_fget(int fd) return file; } -asmlinkage long sys_eventfd(unsigned int count) +asmlinkage long sys_eventfd2(unsigned int count, int flags) { int fd; struct eventfd_ctx *ctx; + if (flags & ~EFD_CLOEXEC) + return -EINVAL; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; @@ -214,9 +217,15 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. 
*/ - fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 0); + fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, + flags & O_CLOEXEC); if (fd < 0) kfree(ctx); return fd; } +asmlinkage long sys_eventfd(unsigned int count) +{ + return sys_eventfd2(count, 0); +} + diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index c310371f561..edbd8723c93 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -333,6 +333,7 @@ #define __NR_timerfd_settime 325 #define __NR_timerfd_gettime 326 #define __NR_signalfd4 327 +#define __NR_eventfd2 328 #ifdef __KERNEL__ diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index e0a9b45b234..fb059a6feeb 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -643,6 +643,8 @@ __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) __SYSCALL(__NR_paccept, sys_paccept) #define __NR_signalfd4 289 __SYSCALL(__NR_signalfd4, sys_signalfd4) +#define __NR_eventfd2 290 +__SYSCALL(__NR_eventfd2, sys_eventfd2) #ifndef __NO_STUBS diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index a701399b7fe..a6c0eaedb1b 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -10,6 +10,12 @@ #ifdef CONFIG_EVENTFD +/* For O_CLOEXEC */ +#include + +/* Flags for eventfd2. */ +#define EFD_CLOEXEC O_CLOEXEC + struct file *eventfd_fget(int fd); int eventfd_signal(struct file *file, int n); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1c270779784..9ab09926a7f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -617,6 +617,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, struct itimerspec __user *otmr); asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); +asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 8627c89ae9e..2a361ccdc7c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -164,3 +164,4 @@ cond_syscall(sys_timerfd_gettime); cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); +cond_syscall(sys_eventfd2); -- cgit v1.2.3-70-g09d2 From 11fcb6c14676023d0bd437841f5dcd670e7990a0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:26 -0700 Subject: flag parameters: timerfd_create The timerfd_create syscall already has a flags parameter. It just is unused so far. This patch changes this by introducing the TFD_CLOEXEC flag to set the close-on-exec flag for the returned file descriptor. A new name TFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_timerfd_create # ifdef __x86_64__ # define __NR_timerfd_create 283 # elif defined __i386__ # define __NR_timerfd_create 322 # else # error "need __NR_timerfd_create" # endif #endif #define TFD_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, 0); if (fd == -1) { puts ("timerfd_create(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("timerfd_create(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, TFD_CLOEXEC); if (fd == -1) { puts ("timerfd_create(TFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("timerfd_create(TFD_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/timerfd.c | 5 +++-- include/linux/timerfd.h | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index 77c2bc92cbe..c6ef5e33cb3 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -184,7 +184,7 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) int ufd; struct timerfd_ctx *ctx; - if (flags) + if (flags & ~TFD_CLOEXEC) return -EINVAL; if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) @@ -198,7 +198,8 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); - ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 0); + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + flags & O_CLOEXEC); if (ufd < 0) kfree(ctx); diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index cf2b10d7573..96ed97dff00 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -8,9 +8,14 @@ #ifndef _LINUX_TIMERFD_H #define _LINUX_TIMERFD_H +/* For O_CLOEXEC */ +#include +/* Flags for timerfd_settime. */ #define TFD_TIMER_ABSTIME (1 << 0) +/* Flags for timerfd_create. */ +#define TFD_CLOEXEC O_CLOEXEC #endif /* _LINUX_TIMERFD_H */ -- cgit v1.2.3-70-g09d2 From a0998b50c3f0b8fdd265c63e0032f86ebe377dbf Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:27 -0700 Subject: flag parameters: epoll_create This patch adds the new epoll_create2 syscall. It extends the old epoll_create syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is EPOLL_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EPOLL_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_epoll_create2 # ifdef __x86_64__ # define __NR_epoll_create2 291 # elif defined __i386__ # define __NR_epoll_create2 329 # else # error "need __NR_epoll_create2" # endif #endif #define EPOLL_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_epoll_create2, 1, 0); if (fd == -1) { puts ("epoll_create2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("epoll_create2(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_epoll_create2, 1, EPOLL_CLOEXEC); if (fd == -1) { puts ("epoll_create2(EPOLL_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + fs/eventpoll.c | 13 +++++++++++-- include/asm-x86/unistd_32.h | 1 + include/asm-x86/unistd_64.h | 2 ++ include/linux/eventpoll.h | 4 ++++ include/linux/syscalls.h | 1 + 7 files changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index cf0eb31745c..04366f08f42 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -828,4 +828,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd_gettime .quad compat_sys_signalfd4 .quad sys_eventfd2 + .quad sys_epoll_create2 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index cf112cb11c3..4d7007ca263 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -328,3 +328,4 @@ ENTRY(sys_call_table) .long sys_timerfd_gettime .long sys_signalfd4 .long sys_eventfd2 + .long sys_epoll_create2 diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9392dd96812..3fd4014f3c5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1046,11 +1046,14 @@ retry: * RB tree. With the current implementation, the "size" parameter is ignored * (besides sanity checks). */ -asmlinkage long sys_epoll_create(int size) +asmlinkage long sys_epoll_create2(int size, int flags) { int error, fd = -1; struct eventpoll *ep; + if (flags & ~EPOLL_CLOEXEC) + return -EINVAL; + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", current, size)); @@ -1068,7 +1071,8 @@ asmlinkage long sys_epoll_create(int size) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. 
*/ - fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 0); + fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + flags & O_CLOEXEC); if (fd < 0) ep_free(ep); @@ -1079,6 +1083,11 @@ error_return: return fd; } +asmlinkage long sys_epoll_create(int size) +{ + return sys_epoll_create2(size, 0); +} + /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index edbd8723c93..a37d6b0c4e1 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -334,6 +334,7 @@ #define __NR_timerfd_gettime 326 #define __NR_signalfd4 327 #define __NR_eventfd2 328 +#define __NR_epoll_create2 329 #ifdef __KERNEL__ diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index fb059a6feeb..a1a4a5b6e5e 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -645,6 +645,8 @@ __SYSCALL(__NR_paccept, sys_paccept) __SYSCALL(__NR_signalfd4, sys_signalfd4) #define __NR_eventfd2 290 __SYSCALL(__NR_eventfd2, sys_eventfd2) +#define __NR_epoll_create2 291 +__SYSCALL(__NR_epoll_create2, sys_epoll_create2) #ifndef __NO_STUBS diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index cf79853967f..1cfaa40059c 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -14,8 +14,12 @@ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H +/* For O_CLOEXEC */ +#include #include +/* Flags for epoll_create2. */ +#define EPOLL_CLOEXEC O_CLOEXEC /* Valid opcodes to issue to sys_epoll_ctl() */ #define EPOLL_CTL_ADD 1 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 9ab09926a7f..85953240f28 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -430,6 +430,7 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp); asmlinkage long sys_epoll_create(int size); +asmlinkage long sys_epoll_create2(int size, int flags); asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event); asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, -- cgit v1.2.3-70-g09d2 From 336dd1f70ff62d7dd8655228caed4c5bfc818c56 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:29 -0700 Subject: flag parameters: dup2 This patch adds the new dup3 syscall. It extends the old dup2 syscall by one parameter which is meant to hold a flag value. Support for the O_CLOEXEC flag is added in this patch. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
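Before the test program itself, one design point worth spelling out: the new syscall rejects every flag bit it does not understand with -EINVAL (see the fs/fcntl.c hunk below), which is what lets user space probe at run time both for the syscall and for individual flags. A hedged sketch of such a probe, illustrative only, using the syscall numbers quoted in this series:
~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_dup3
# ifdef __x86_64__
#  define __NR_dup3 292
# elif defined __i386__
#  define __NR_dup3 330
# else
#  error "need __NR_dup3"
# endif
#endif

int main (void)
{
  /* ENOSYS: the running kernel predates dup3.
     EINVAL: dup3 exists but does not understand the requested flag. */
  int fd = syscall (__NR_dup3, STDOUT_FILENO, 10, O_CLOEXEC);
  if (fd == -1)
    {
      if (errno == ENOSYS)
        puts ("kernel has no dup3");
      else if (errno == EINVAL)
        puts ("dup3 present but flag not supported");
      else
        perror ("dup3");
      return 1;
    }
  puts ("dup3 with O_CLOEXEC works");
  close (fd);
  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~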
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_dup3 # ifdef __x86_64__ # define __NR_dup3 292 # elif defined __i386__ # define __NR_dup3 330 # else # error "need __NR_dup3" # endif #endif int main (void) { int fd = syscall (__NR_dup3, 1, 4, 0); if (fd == -1) { puts ("dup3(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("dup3(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_dup3, 1, 4, O_CLOEXEC); if (fd == -1) { puts ("dup3(O_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("dup3(O_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + fs/fcntl.c | 15 +++++++++++++-- include/asm-x86/unistd_32.h | 1 + include/asm-x86/unistd_64.h | 2 ++ include/linux/syscalls.h | 1 + 6 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 04366f08f42..5614a8f7bed 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -829,4 +829,5 @@ ia32_sys_call_table: .quad compat_sys_signalfd4 .quad sys_eventfd2 .quad sys_epoll_create2 + .quad sys_dup3 /* 330 */ ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4d7007ca263..24a3f1ea6a0 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -329,3 +329,4 @@ ENTRY(sys_call_table) .long sys_signalfd4 .long sys_eventfd2 .long sys_epoll_create2 + .long sys_dup3 /* 330 */ diff --git a/fs/fcntl.c b/fs/fcntl.c index 330a7d78259..9679fcbdeaa 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -125,13 +125,16 @@ static int dupfd(struct file *file, unsigned int start, int cloexec) return fd; } -asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) +asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file * file, *tofree; struct files_struct * files = current->files; struct fdtable *fdt; + if ((flags & ~O_CLOEXEC) != 0) + return -EINVAL; + spin_lock(&files->file_lock); if (!(file = fcheck(oldfd))) goto out_unlock; @@ -163,7 +166,10 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) rcu_assign_pointer(fdt->fd[newfd], file); FD_SET(newfd, fdt->open_fds); - FD_CLR(newfd, fdt->close_on_exec); + if (flags & O_CLOEXEC) + FD_SET(newfd, fdt->close_on_exec); + else + FD_CLR(newfd, fdt->close_on_exec); spin_unlock(&files->file_lock); if (tofree) @@ -181,6 +187,11 @@ out_fput: goto out; } +asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) +{ + return sys_dup3(oldfd, newfd, 0); +} + asmlinkage long sys_dup(unsigned int fildes) { int ret = -EBADF; diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index a37d6b0c4e1..a1f6383bf69 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -335,6 +335,7 @@ #define __NR_signalfd4 327 #define __NR_eventfd2 328 #define __NR_epoll_create2 329 +#define __NR_dup3 330 #ifdef __KERNEL__ diff --git a/include/asm-x86/unistd_64.h 
b/include/asm-x86/unistd_64.h index a1a4a5b6e5e..f0fb2bd40cd 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -647,6 +647,8 @@ __SYSCALL(__NR_signalfd4, sys_signalfd4) __SYSCALL(__NR_eventfd2, sys_eventfd2) #define __NR_epoll_create2 291 __SYSCALL(__NR_epoll_create2, sys_epoll_create2) +#define __NR_dup3 292 +__SYSCALL(__NR_dup3, sys_dup3) #ifndef __NO_STUBS diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 85953240f28..034d3358549 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -305,6 +305,7 @@ asmlinkage long sys_fcntl64(unsigned int fd, #endif asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); +asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags); asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); -- cgit v1.2.3-70-g09d2 From ed8cae8ba01348bfd83333f4648dd807b04d7f08 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:30 -0700 Subject: flag parameters: pipe This patch introduces the new syscall pipe2 which is like pipe but it also takes an additional parameter which takes a flag value. This patch implements the handling of O_CLOEXEC for the flag. I did not add support for the new syscall for the architectures which have a special sys_pipe implementation. I think the maintainers of those archs have the chance to go with the unified implementation but that's up to them. The implementation introduces do_pipe_flags. I did that instead of changing all callers of do_pipe because some of the callers are written in assembler. I would probably screw up changing the assembly code. To avoid breaking code do_pipe is now a small wrapper around do_pipe_flags. Once all callers are changed over to do_pipe_flags the old do_pipe function can be removed. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
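Before the test program itself, a sketch of how a C library or application can adopt pipe2() while still running on older kernels: try the new syscall first and fall back to pipe() plus fcntl() on ENOSYS. The fallback is unavoidably non-atomic with respect to a concurrent fork()/exec(), which is exactly the gap the new syscall closes. Illustrative only; syscall numbers as quoted in this series.
~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pipe2
# ifdef __x86_64__
#  define __NR_pipe2 293
# elif defined __i386__
#  define __NR_pipe2 331
# else
#  error "need __NR_pipe2"
# endif
#endif

/* pipe2 with fallback: atomic where the kernel supports it, best-effort
   (and racy) emulation via fcntl on older kernels. */
static int
my_pipe2 (int fds[2], int flags)
{
  int rc = syscall (__NR_pipe2, fds, flags);
  if (rc == 0 || errno != ENOSYS)
    return rc;

  if (pipe (fds) == -1)
    return -1;
  if (flags & O_CLOEXEC)
    {
      fcntl (fds[0], F_SETFD, FD_CLOEXEC);
      fcntl (fds[1], F_SETFD, FD_CLOEXEC);
    }
  return 0;
}

int main (void)
{
  int fds[2];
  if (my_pipe2 (fds, O_CLOEXEC) == -1)
    {
      perror ("pipe2");
      return 1;
    }
  puts ("pipe created with close-on-exec");
  close (fds[0]);
  close (fds[1]);
  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~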
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fd[2]; if (syscall (__NR_pipe2, fd, 0) != 0) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { printf ("pipe2(0) set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); if (syscall (__NR_pipe2, fd, O_CLOEXEC) != 0) { puts ("pipe2(O_CLOEXEC) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { printf ("pipe2(O_CLOEXEC) does not set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/ia32/sys_ia32.c | 2 +- arch/ia64/kernel/sys_ia64.c | 2 +- arch/mips/kernel/syscall.c | 2 +- arch/parisc/hpux/sys_hpux.c | 2 +- arch/sh/kernel/sys_sh32.c | 2 +- arch/sparc/kernel/sys_sparc.c | 2 +- arch/sparc64/kernel/sys_sparc.c | 2 +- arch/x86/ia32/ia32entry.S | 1 + arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/kernel/syscall_table_32.S | 1 + arch/xtensa/kernel/syscall.c | 2 +- fs/pipe.c | 23 ++++++++++++++++++----- include/asm-x86/unistd_32.h | 1 + include/asm-x86/unistd_64.h | 2 ++ include/linux/fs.h | 1 + 15 files changed, 33 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 7e028ceb93b..465116aecb8 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1139,7 +1139,7 @@ sys32_pipe (int __user *fd) int retval; int fds[2]; - retval = do_pipe(fds); + retval = do_pipe_flags(fds, 0); if (retval) goto out; if (copy_to_user(fd, fds, sizeof(fds))) diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 1eda194b955..bcbb6d8792d 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -160,7 +160,7 @@ sys_pipe (void) int fd[2]; int retval; - retval = do_pipe(fd); + retval = do_pipe_flags(fd, 0); if (retval) goto out; retval = fd[0]; diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 3523c8d12ed..343015a2f41 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -52,7 +52,7 @@ asmlinkage int sysm_pipe(nabi_no_regargs volatile struct pt_regs regs) int fd[2]; int error, res; - error = do_pipe(fd); + error = do_pipe_flags(fd, 0); if (error) { res = error; goto out; diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index 0c5b9dabb47..be255ebb609 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -448,7 +448,7 @@ int hpux_pipe(int *kstack_fildes) int error; lock_kernel(); - error = do_pipe(kstack_fildes); + error = do_pipe_flags(kstack_fildes, 0); unlock_kernel(); return error; } diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index 125e493ead8..f0aa5c39865 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -29,7 +29,7 @@ asmlinkage int sys_pipe(unsigned long r4, unsigned long r5, int fd[2]; int error; - error = do_pipe(fd); + error = 
do_pipe_flags(fd, 0); if (!error) { regs->regs[1] = fd[1]; return fd[0]; diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c index 3c6b49a53ae..4d73421559c 100644 --- a/arch/sparc/kernel/sys_sparc.c +++ b/arch/sparc/kernel/sys_sparc.c @@ -97,7 +97,7 @@ asmlinkage int sparc_pipe(struct pt_regs *regs) int fd[2]; int error; - error = do_pipe(fd); + error = do_pipe_flags(fd, 0); if (error) goto out; regs->u_regs[UREG_I1] = fd[1]; diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c index e1f4eba2e57..39749e32dc7 100644 --- a/arch/sparc64/kernel/sys_sparc.c +++ b/arch/sparc64/kernel/sys_sparc.c @@ -418,7 +418,7 @@ asmlinkage long sparc_pipe(struct pt_regs *regs) int fd[2]; int error; - error = do_pipe(fd); + error = do_pipe_flags(fd, 0); if (error) goto out; regs->u_regs[UREG_I1] = fd[1]; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 5614a8f7bed..18808b16457 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -830,4 +830,5 @@ ia32_sys_call_table: .quad sys_eventfd2 .quad sys_epoll_create2 .quad sys_dup3 /* 330 */ + .quad sys_pipe2 ia32_syscall_end: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index f00afdf61e6..d3c64088b98 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -238,7 +238,7 @@ asmlinkage long sys32_pipe(int __user *fd) int retval; int fds[2]; - retval = do_pipe(fds); + retval = do_pipe_flags(fds, 0); if (retval) goto out; if (copy_to_user(fd, fds, sizeof(fds))) diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 24a3f1ea6a0..66154769d52 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -330,3 +330,4 @@ ENTRY(sys_call_table) .long sys_eventfd2 .long sys_epoll_create2 .long sys_dup3 /* 330 */ + .long sys_pipe2 diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index f3e16efcd47..ac15ecbdf91 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -49,7 +49,7 @@ asmlinkage long xtensa_pipe(int __user *userfds) int fd[2]; int error; - error = do_pipe(fd); + error = do_pipe_flags(fd, 0); if (!error) { if (copy_to_user(userfds, fd, 2 * sizeof(int))) error = -EFAULT; diff --git a/fs/pipe.c b/fs/pipe.c index 700f4e0d957..68e82061070 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1027,12 +1027,15 @@ struct file *create_read_pipe(struct file *wrf) return f; } -int do_pipe(int *fd) +int do_pipe_flags(int *fd, int flags) { struct file *fw, *fr; int error; int fdw, fdr; + if (flags & ~O_CLOEXEC) + return -EINVAL; + fw = create_write_pipe(); if (IS_ERR(fw)) return PTR_ERR(fw); @@ -1041,12 +1044,12 @@ int do_pipe(int *fd) if (IS_ERR(fr)) goto err_write_pipe; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; @@ -1074,16 +1077,21 @@ int do_pipe(int *fd) return error; } +int do_pipe(int *fd) +{ + return do_pipe_flags(fd, 0); +} + /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. 
*/ -asmlinkage long __weak sys_pipe(int __user *fildes) +asmlinkage long __weak sys_pipe2(int __user *fildes, int flags) { int fd[2]; int error; - error = do_pipe(fd); + error = do_pipe_flags(fd, flags); if (!error) { if (copy_to_user(fildes, fd, sizeof(fd))) { sys_close(fd[0]); @@ -1094,6 +1102,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes) return error; } +asmlinkage long __weak sys_pipe(int __user *fildes) +{ + return sys_pipe2(fildes, 0); +} + /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index a1f6383bf69..748a05c77da 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -336,6 +336,7 @@ #define __NR_eventfd2 328 #define __NR_epoll_create2 329 #define __NR_dup3 330 +#define __NR_pipe2 331 #ifdef __KERNEL__ diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index f0fb2bd40cd..d2284b43ad5 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -649,6 +649,8 @@ __SYSCALL(__NR_eventfd2, sys_eventfd2) __SYSCALL(__NR_epoll_create2, sys_epoll_create2) #define __NR_dup3 292 __SYSCALL(__NR_dup3, sys_dup3) +#define __NR_pipe2 293 +__SYSCALL(__NR_pipe2, sys_pipe2) #ifndef __NO_STUBS diff --git a/include/linux/fs.h b/include/linux/fs.h index e5e6a244096..0e80cd717d3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1777,6 +1777,7 @@ static inline void allow_write_access(struct file *file) atomic_inc(&file->f_path.dentry->d_inode->i_writecount); } extern int do_pipe(int *); +extern int do_pipe_flags(int *, int); extern struct file *create_read_pipe(struct file *f); extern struct file *create_write_pipe(void); extern void free_write_pipe(struct file *); -- cgit v1.2.3-70-g09d2 From 4006553b06306b34054529477b06b68a1c66249b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:32 -0700 Subject: flag parameters: inotify_init This patch introduces the new syscall inotify_init1 (note: the 1 stands for the one parameter the syscall takes, as opposed to no parameter before). The values accepted for this parameter are function-specific and defined in the inotify.h header. Here the values must match the O_* flags, though. In this patch CLOEXEC support is introduced. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
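Before the test program itself, a brief usage sketch showing the new flag next to the unchanged parts of the inotify interface (inotify_add_watch() and the read()-based event stream are untouched by this patch). Illustrative only; the syscall numbers are the ones quoted in this series and IN_CLOEXEC is derived from O_CLOEXEC as in the patch.
~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_inotify_init1
# ifdef __x86_64__
#  define __NR_inotify_init1 294
# elif defined __i386__
#  define __NR_inotify_init1 332
# else
#  error "need __NR_inotify_init1"
# endif
#endif
#ifndef IN_CLOEXEC
# define IN_CLOEXEC O_CLOEXEC
#endif

int main (void)
{
  /* The descriptor is close-on-exec from the moment it exists. */
  int fd = syscall (__NR_inotify_init1, IN_CLOEXEC);
  if (fd == -1)
    {
      perror ("inotify_init1");
      return 1;
    }
  if (inotify_add_watch (fd, "/tmp", IN_CREATE | IN_DELETE) == -1)
    {
      perror ("inotify_add_watch");
      return 1;
    }
  /* Events are then consumed with read() exactly as before. */
  close (fd);
  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~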
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_inotify_init1 # ifdef __x86_64__ # define __NR_inotify_init1 294 # elif defined __i386__ # define __NR_inotify_init1 332 # else # error "need __NR_inotify_init1" # endif #endif #define IN_CLOEXEC O_CLOEXEC int main (void) { int fd; fd = syscall (__NR_inotify_init1, 0); if (fd == -1) { puts ("inotify_init1(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("inotify_init1(0) set close-on-exit"); return 1; } close (fd); fd = syscall (__NR_inotify_init1, IN_CLOEXEC); if (fd == -1) { puts ("inotify_init1(IN_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("inotify_init1(O_CLOEXEC) does not set close-on-exit"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + fs/inotify_user.c | 12 ++++++++++-- include/asm-x86/unistd_32.h | 1 + include/asm-x86/unistd_64.h | 2 ++ include/linux/inotify.h | 5 +++++ include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + 8 files changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 18808b16457..4541073dd83 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -831,4 +831,5 @@ ia32_sys_call_table: .quad sys_epoll_create2 .quad sys_dup3 /* 330 */ .quad sys_pipe2 + .quad sys_inotify_init1 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 66154769d52..f59aba5ff0f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -331,3 +331,4 @@ ENTRY(sys_call_table) .long sys_epoll_create2 .long sys_dup3 /* 330 */ .long sys_pipe2 + .long sys_inotify_init1 diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 6676c06bb7c..851005998cd 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -566,7 +566,7 @@ static const struct inotify_operations inotify_user_ops = { .destroy_watch = free_inotify_user_watch, }; -asmlinkage long sys_inotify_init(void) +asmlinkage long sys_inotify_init1(int flags) { struct inotify_device *dev; struct inotify_handle *ih; @@ -574,7 +574,10 @@ asmlinkage long sys_inotify_init(void) struct file *filp; int fd, ret; - fd = get_unused_fd(); + if (flags & ~IN_CLOEXEC) + return -EINVAL; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; @@ -638,6 +641,11 @@ out_put_fd: return ret; } +asmlinkage long sys_inotify_init(void) +{ + return sys_inotify_init1(0); +} + asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) { struct inode *inode; diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index 748a05c77da..b3daf503ab9 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -337,6 +337,7 @@ #define __NR_epoll_create2 329 #define __NR_dup3 330 #define __NR_pipe2 331 +#define __NR_inotify_init1 332 #ifdef __KERNEL__ diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index d2284b43ad5..c8cb88d70c6 100644 --- a/include/asm-x86/unistd_64.h +++ 
b/include/asm-x86/unistd_64.h @@ -651,6 +651,8 @@ __SYSCALL(__NR_epoll_create2, sys_epoll_create2) __SYSCALL(__NR_dup3, sys_dup3) #define __NR_pipe2 293 __SYSCALL(__NR_pipe2, sys_pipe2) +#define __NR_inotify_init1 294 +__SYSCALL(__NR_inotify_init1, sys_inotify_init1) #ifndef __NO_STUBS diff --git a/include/linux/inotify.h b/include/linux/inotify.h index 742b917e7d1..72ef8212051 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -7,6 +7,8 @@ #ifndef _LINUX_INOTIFY_H #define _LINUX_INOTIFY_H +/* For O_CLOEXEC */ +#include #include /* @@ -63,6 +65,9 @@ struct inotify_event { IN_MOVED_TO | IN_DELETE | IN_CREATE | IN_DELETE_SELF | \ IN_MOVE_SELF) +/* Flags for sys_inotify_init1. */ +#define IN_CLOEXEC O_CLOEXEC + #ifdef __KERNEL__ #include diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 034d3358549..93a7e7f017a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -547,6 +547,7 @@ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long addr, unsigned long flags); asmlinkage long sys_inotify_init(void); +asmlinkage long sys_inotify_init1(int flags); asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask); asmlinkage long sys_inotify_rm_watch(int fd, u32 wd); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2a361ccdc7c..bd66ac5406f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -96,6 +96,7 @@ cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); +cond_syscall(sys_inotify_init1); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); -- cgit v1.2.3-70-g09d2 From 99829b832997d907c30669bfd17da32151e18f04 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:33 -0700 Subject: flag parameters: NONBLOCK in anon_inode_getfd Building on the previous change to anon_inode_getfd, this patch introduces support for handling of O_NONBLOCK in addition to the already supported O_CLOEXEC. Following patches will take advantage of this support. As can be seen, the additional support for supporting this functionality is minimal. Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/anon_inodes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 1a4eee620b0..3662dd44896 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -116,7 +116,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; - file->f_flags = O_RDWR; + file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_version = 0; file->private_data = priv; -- cgit v1.2.3-70-g09d2 From 5fb5e04926a54bc1c22bba7ca166840f4476196f Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:37 -0700 Subject: flag parameters: NONBLOCK in signalfd This patch adds support for the SFD_NONBLOCK flag to signalfd4. The additional changes needed are minimal. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
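Before the test program itself, a sketch of why requesting non-blocking mode at creation time is useful in practice: an event-loop consumer drains a signalfd by reading until EAGAIN, which previously required an extra fcntl(F_SETFL, O_NONBLOCK) after creating the descriptor. Illustrative only; the raw syscall number is the one quoted in this series, and the sigset size of 8 matches the kernel-side sigset_t used by the test program that follows.
~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_signalfd4
# ifdef __x86_64__
#  define __NR_signalfd4 289
# elif defined __i386__
#  define __NR_signalfd4 327
# else
#  error "need __NR_signalfd4"
# endif
#endif
#define SFD_NONBLOCK O_NONBLOCK

int main (void)
{
  sigset_t ss;
  sigemptyset (&ss);
  sigaddset (&ss, SIGUSR1);
  sigprocmask (SIG_BLOCK, &ss, NULL);   /* signals must be blocked */

  /* 8 == size of the kernel-side sigset, as in the test program. */
  int fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_NONBLOCK);
  if (fd == -1)
    {
      perror ("signalfd4");
      return 1;
    }

  raise (SIGUSR1);

  /* Drain pending signals; each signalfd_siginfo record is 128 bytes. */
  char buf[128];
  ssize_t n;
  while ((n = read (fd, buf, sizeof (buf))) > 0)
    puts ("got a signal");
  if (n == -1 && errno == EAGAIN)
    puts ("queue drained without blocking");
  close (fd);
  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~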
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_signalfd4 # ifdef __x86_64__ # define __NR_signalfd4 289 # elif defined __i386__ # define __NR_signalfd4 327 # else # error "need __NR_signalfd4" # endif #endif #define SFD_NONBLOCK O_NONBLOCK int main (void) { sigset_t ss; sigemptyset (&ss); sigaddset (&ss, SIGUSR1); int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0); if (fd == -1) { puts ("signalfd4(0) failed"); return 1; } int fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { puts ("signalfd4(0) set non-blocking mode"); return 1; } close (fd); fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_NONBLOCK); if (fd == -1) { puts ("signalfd4(SFD_NONBLOCK) failed"); return 1; } fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { puts ("signalfd4(SFD_NONBLOCK) does not set non-blocking mode"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/signalfd.c | 4 ++-- include/linux/signalfd.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/signalfd.c b/fs/signalfd.c index c8609fa51a1..5441a4bca77 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -211,7 +211,7 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, sigset_t sigmask; struct signalfd_ctx *ctx; - if (flags & ~SFD_CLOEXEC) + if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK)) return -EINVAL; if (sizemask != sizeof(sigset_t) || @@ -232,7 +232,7 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, * anon_inode_getfd() will install the fd. */ ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, - flags & O_CLOEXEC); + flags & (O_CLOEXEC | O_NONBLOCK)); if (ufd < 0) kfree(ctx); } else { diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index 8b3f7b7420a..bef0c46d471 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -8,11 +8,12 @@ #ifndef _LINUX_SIGNALFD_H #define _LINUX_SIGNALFD_H -/* For O_CLOEXEC */ +/* For O_CLOEXEC and O_NONBLOCK */ #include /* Flags for signalfd4. */ #define SFD_CLOEXEC O_CLOEXEC +#define SFD_NONBLOCK O_NONBLOCK struct signalfd_siginfo { __u32 ssi_signo; -- cgit v1.2.3-70-g09d2 From e7d476dfdf0bcfed478a207aecfdc84f81efecaf Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:38 -0700 Subject: flag parameters: NONBLOCK in eventfd This patch adds support for the EFD_NONBLOCK flag to eventfd2. The additional changes needed are minimal. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_eventfd2 # ifdef __x86_64__ # define __NR_eventfd2 290 # elif defined __i386__ # define __NR_eventfd2 328 # else # error "need __NR_eventfd2" # endif #endif #define EFD_NONBLOCK O_NONBLOCK int main (void) { int fd = syscall (__NR_eventfd2, 1, 0); if (fd == -1) { puts ("eventfd2(0) failed"); return 1; } int fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { puts ("eventfd2(0) sets non-blocking mode"); return 1; } close (fd); fd = syscall (__NR_eventfd2, 1, EFD_NONBLOCK); if (fd == -1) { puts ("eventfd2(EFD_NONBLOCK) failed"); return 1; } fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { puts ("eventfd2(EFD_NONBLOCK) does not set non-blocking mode"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 4 ++-- include/linux/eventfd.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/eventfd.c b/fs/eventfd.c index bd420e6478a..3ed4466177a 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -203,7 +203,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags) int fd; struct eventfd_ctx *ctx; - if (flags & ~EFD_CLOEXEC) + if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); @@ -218,7 +218,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags) * anon_inode_getfd() will install the fd. */ fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, - flags & O_CLOEXEC); + flags & (O_CLOEXEC | O_NONBLOCK)); if (fd < 0) kfree(ctx); return fd; diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index a6c0eaedb1b..a667637b54e 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -10,11 +10,12 @@ #ifdef CONFIG_EVENTFD -/* For O_CLOEXEC */ +/* For O_CLOEXEC and O_NONBLOCK */ #include /* Flags for eventfd2. */ #define EFD_CLOEXEC O_CLOEXEC +#define EFD_NONBLOCK O_NONBLOCK struct file *eventfd_fget(int fd); int eventfd_signal(struct file *file, int n); -- cgit v1.2.3-70-g09d2 From 6b1ef0e60d42f2fdaec26baee8327eb156347b4f Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:39 -0700 Subject: flag parameters: NONBLOCK in timerfd_create This patch adds support for the TFD_NONBLOCK flag to timerfd_create. The additional changes needed are minimal. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_timerfd_create # ifdef __x86_64__ # define __NR_timerfd_create 283 # elif defined __i386__ # define __NR_timerfd_create 322 # else # error "need __NR_timerfd_create" # endif #endif #define TFD_NONBLOCK O_NONBLOCK int main (void) { int fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, 0); if (fd == -1) { puts ("timerfd_create(0) failed"); return 1; } int fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { puts ("timerfd_create(0) set non-blocking mode"); return 1; } close (fd); fd = syscall (__NR_timerfd_create, CLOCK_REALTIME, TFD_NONBLOCK); if (fd == -1) { puts ("timerfd_create(TFD_NONBLOCK) failed"); return 1; } fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { puts ("timerfd_create(TFD_NONBLOCK) set non-blocking mode"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/timerfd.c | 4 ++-- include/linux/timerfd.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index c6ef5e33cb3..75d44efe346 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -184,7 +184,7 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) int ufd; struct timerfd_ctx *ctx; - if (flags & ~TFD_CLOEXEC) + if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) return -EINVAL; if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) @@ -199,7 +199,7 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & O_CLOEXEC); + flags & (O_CLOEXEC | O_NONBLOCK)); if (ufd < 0) kfree(ctx); diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index 96ed97dff00..86cb0501d3e 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -8,7 +8,7 @@ #ifndef _LINUX_TIMERFD_H #define _LINUX_TIMERFD_H -/* For O_CLOEXEC */ +/* For O_CLOEXEC and O_NONBLOCK */ #include /* Flags for timerfd_settime. */ @@ -16,6 +16,7 @@ /* Flags for timerfd_create. */ #define TFD_CLOEXEC O_CLOEXEC +#define TFD_NONBLOCK O_NONBLOCK #endif /* _LINUX_TIMERFD_H */ -- cgit v1.2.3-70-g09d2 From be61a86d7237dd80510615f38ae21d6e1e98660c Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:40 -0700 Subject: flag parameters: NONBLOCK in pipe This patch adds O_NONBLOCK support to pipe2. It is minimally more involved than the patches for eventfd et.al but still trivial. The interfaces of the create_write_pipe and create_read_pipe helper functions were changed and the one other caller as well. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. 
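Before the test program itself, one concrete beneficiary of getting both O_NONBLOCK and O_CLOEXEC applied at pipe creation time is the classic self-pipe trick, where a signal handler writes a byte into a pipe that a poll() loop watches; both ends want to be non-blocking and close-on-exec from the start. A sketch, illustrative only, using the syscall number quoted in this series:
~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pipe2
# ifdef __x86_64__
#  define __NR_pipe2 293
# elif defined __i386__
#  define __NR_pipe2 331
# else
#  error "need __NR_pipe2"
# endif
#endif

static int wakeup_fd = -1;

static void
on_sigint (int sig)
{
  char c = 0;
  (void) sig;
  /* write is async-signal-safe; because the pipe is non-blocking a full
     pipe simply drops the extra wakeup instead of blocking the handler. */
  (void) write (wakeup_fd, &c, 1);
}

int main (void)
{
  int fds[2];
  if (syscall (__NR_pipe2, fds, O_NONBLOCK | O_CLOEXEC) == -1)
    {
      perror ("pipe2");
      return 1;
    }
  wakeup_fd = fds[1];
  signal (SIGINT, on_sigint);

  struct pollfd pfd = { .fd = fds[0], .events = POLLIN };
  puts ("press Ctrl-C to wake the loop");
  int n;
  do
    /* poll is never restarted after a signal, so retry on EINTR;
       the wakeup byte is already in the pipe by then. */
    n = poll (&pfd, 1, -1);
  while (n == -1 && errno == EINTR);

  if (n == 1 && (pfd.revents & POLLIN))
    {
      char c;
      while (read (fds[0], &c, 1) == 1)
        ;                       /* drain all queued wakeups */
      puts ("woken by a signal");
    }
  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~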
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fds[2]; if (syscall (__NR_pipe2, fds, 0) == -1) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { printf ("pipe2(0) set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } if (syscall (__NR_pipe2, fds, O_NONBLOCK) == -1) { puts ("pipe2(O_NONBLOCK) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { printf ("pipe2(O_NONBLOCK) does not set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 14 +++++++------- include/linux/fs.h | 4 ++-- kernel/kmod.c | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 68e82061070..10c4e9aa5c4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -950,7 +950,7 @@ fail_inode: return NULL; } -struct file *create_write_pipe(void) +struct file *create_write_pipe(int flags) { int err; struct inode *inode; @@ -983,7 +983,7 @@ struct file *create_write_pipe(void) goto err_dentry; f->f_mapping = inode->i_mapping; - f->f_flags = O_WRONLY; + f->f_flags = O_WRONLY | (flags & O_NONBLOCK); f->f_version = 0; return f; @@ -1007,7 +1007,7 @@ void free_write_pipe(struct file *f) put_filp(f); } -struct file *create_read_pipe(struct file *wrf) +struct file *create_read_pipe(struct file *wrf, int flags) { struct file *f = get_empty_filp(); if (!f) @@ -1019,7 +1019,7 @@ struct file *create_read_pipe(struct file *wrf) f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; f->f_pos = 0; - f->f_flags = O_RDONLY; + f->f_flags = O_RDONLY | (flags & O_NONBLOCK); f->f_op = &read_pipe_fops; f->f_mode = FMODE_READ; f->f_version = 0; @@ -1033,13 +1033,13 @@ int do_pipe_flags(int *fd, int flags) int error; int fdw, fdr; - if (flags & ~O_CLOEXEC) + if (flags & ~(O_CLOEXEC | O_NONBLOCK)) return -EINVAL; - fw = create_write_pipe(); + fw = create_write_pipe(flags); if (IS_ERR(fw)) return PTR_ERR(fw); - fr = create_read_pipe(fw); + fr = create_read_pipe(fw, flags); error = PTR_ERR(fr); if (IS_ERR(fr)) goto err_write_pipe; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e80cd717d3..4b86f806014 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1778,8 +1778,8 @@ static inline void allow_write_access(struct file *file) } extern int do_pipe(int *); extern int do_pipe_flags(int *, int); -extern struct file *create_read_pipe(struct file *f); -extern struct file *create_write_pipe(void); +extern struct file *create_read_pipe(struct file *f, int flags); +extern struct file *create_write_pipe(int flags); extern void free_write_pipe(struct file *); extern struct file *do_filp_open(int dfd, const char *pathname, diff --git a/kernel/kmod.c b/kernel/kmod.c index 90d7af1c165..2989f67c444 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -417,12 +417,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, { struct file *f; 
- f = create_write_pipe(); + f = create_write_pipe(0); if (IS_ERR(f)) return PTR_ERR(f); *filp = f; - f = create_read_pipe(f); + f = create_read_pipe(f, 0); if (IS_ERR(f)) { free_write_pipe(*filp); return PTR_ERR(f); -- cgit v1.2.3-70-g09d2 From 510df2dd482496083e1c3b1a8c9b6afd5fa4c7d7 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:41 -0700 Subject: flag parameters: NONBLOCK in inotify_init This patch adds non-blocking support for inotify_init1. The additional changes needed are minimal. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_inotify_init1 # ifdef __x86_64__ # define __NR_inotify_init1 294 # elif defined __i386__ # define __NR_inotify_init1 332 # else # error "need __NR_inotify_init1" # endif #endif #define IN_NONBLOCK O_NONBLOCK int main (void) { int fd = syscall (__NR_inotify_init1, 0); if (fd == -1) { puts ("inotify_init1(0) failed"); return 1; } int fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { puts ("inotify_init1(0) set non-blocking mode"); return 1; } close (fd); fd = syscall (__NR_inotify_init1, IN_NONBLOCK); if (fd == -1) { puts ("inotify_init1(IN_NONBLOCK) failed"); return 1; } fl = fcntl (fd, F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { puts ("inotify_init1(IN_NONBLOCK) set non-blocking mode"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify_user.c | 4 ++-- include/linux/inotify.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 851005998cd..dc7e1f61974 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -574,7 +574,7 @@ asmlinkage long sys_inotify_init1(int flags) struct file *filp; int fd, ret; - if (flags & ~IN_CLOEXEC) + if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) return -EINVAL; fd = get_unused_fd_flags(flags & O_CLOEXEC); @@ -613,7 +613,7 @@ asmlinkage long sys_inotify_init1(int flags) filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; - filp->f_flags = O_RDONLY; + filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); filp->private_data = dev; INIT_LIST_HEAD(&dev->events); diff --git a/include/linux/inotify.h b/include/linux/inotify.h index 72ef8212051..bd578578a8b 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -7,7 +7,7 @@ #ifndef _LINUX_INOTIFY_H #define _LINUX_INOTIFY_H -/* For O_CLOEXEC */ +/* For O_CLOEXEC and O_NONBLOCK */ #include #include @@ -67,6 +67,7 @@ struct inotify_event { /* Flags for sys_inotify_init1. */ #define IN_CLOEXEC O_CLOEXEC +#define IN_NONBLOCK O_NONBLOCK #ifdef __KERNEL__ -- cgit v1.2.3-70-g09d2 From e38b36f325153eaadd1c2a7abc5762079233e540 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:42 -0700 Subject: flag parameters: check magic constants This patch adds test that ensure the boundary conditions for the various constants introduced in the previous patches is met. No code is generated. 
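The reason no code is generated: BUILD_BUG_ON() folds its condition into a sizeof over an array type, so a true condition produces a negatively sized array and breaks the compile, while a false condition reduces to a constant the compiler discards. The definition used at the time was essentially the following sketch (roughly as in include/linux/kernel.h of that era):
~~~~~~~~~~~~~~~~~~~~~~~
/* A true condition yields the invalid type char[-1] and stops the
   build; a false condition reduces to sizeof(char[1]), cast to void,
   which the compiler throws away. */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))

/* Hence a check such as
       BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
   either fails at compile time or costs nothing at run time. */
~~~~~~~~~~~~~~~~~~~~~~~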
[akpm@linux-foundation.org: fix alpha] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 4 ++++ fs/eventpoll.c | 3 +++ fs/inotify_user.c | 4 ++++ fs/signalfd.c | 4 ++++ fs/timerfd.c | 4 ++++ net/socket.c | 6 ++++++ 6 files changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/eventfd.c b/fs/eventfd.c index 3ed4466177a..08bf558d040 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -203,6 +203,10 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags) int fd; struct eventfd_ctx *ctx; + /* Check the EFD_* constants for consistency. */ + BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) return -EINVAL; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3fd4014f3c5..2fdad420404 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1051,6 +1051,9 @@ asmlinkage long sys_epoll_create2(int size, int flags) int error, fd = -1; struct eventpoll *ep; + /* Check the EPOLL_* constant for consistency. */ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); + if (flags & ~EPOLL_CLOEXEC) return -EINVAL; diff --git a/fs/inotify_user.c b/fs/inotify_user.c index dc7e1f61974..fe79c25d95d 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -574,6 +574,10 @@ asmlinkage long sys_inotify_init1(int flags) struct file *filp; int fd, ret; + /* Check the IN_* constants for consistency. */ + BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); + if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) return -EINVAL; diff --git a/fs/signalfd.c b/fs/signalfd.c index 5441a4bca77..9c39bc7f843 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -211,6 +211,10 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, sigset_t sigmask; struct signalfd_ctx *ctx; + /* Check the SFD_* constants for consistency. */ + BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); + if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK)) return -EINVAL; diff --git a/fs/timerfd.c b/fs/timerfd.c index 75d44efe346..c502c60e4f5 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -184,6 +184,10 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) int ufd; struct timerfd_ctx *ctx; + /* Check the TFD_* constants for consistency. */ + BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); + if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) return -EINVAL; if (clockid != CLOCK_MONOTONIC && diff --git a/net/socket.c b/net/socket.c index 31105f9048a..1310a82cbba 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1222,6 +1222,12 @@ asmlinkage long sys_socket(int family, int type, int protocol) struct socket *sock; int flags; + /* Check the SOCK_* constants for consistency. */ + BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK); + BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK); + BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK); + flags = type & ~SOCK_TYPE_MASK; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 9fe5ad9c8cef9ad5873d8ee55d1cf00d9b607df0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:43 -0700 Subject: flag parameters add-on: remove epoll_create size param Remove the size parameter from the new epoll_create syscall and renames the syscall itself. The updated test program follows. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_epoll_create2 # ifdef __x86_64__ # define __NR_epoll_create2 291 # elif defined __i386__ # define __NR_epoll_create2 329 # else # error "need __NR_epoll_create2" # endif #endif #define EPOLL_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_epoll_create2, 0); if (fd == -1) { puts ("epoll_create2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("epoll_create2(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_epoll_create2, EPOLL_CLOEXEC); if (fd == -1) { puts ("epoll_create2(EPOLL_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- fs/eventpoll.c | 18 ++++++++++-------- include/asm-x86/unistd_32.h | 2 +- include/asm-x86/unistd_64.h | 4 ++-- include/linux/eventpoll.h | 2 +- include/linux/syscalls.h | 2 +- 7 files changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4541073dd83..e4bd1793a5e 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -828,7 +828,7 @@ ia32_sys_call_table: .quad compat_sys_timerfd_gettime .quad compat_sys_signalfd4 .quad sys_eventfd2 - .quad sys_epoll_create2 + .quad sys_epoll_create1 .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index f59aba5ff0f..d44395ff34c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -328,7 +328,7 @@ ENTRY(sys_call_table) .long sys_timerfd_gettime .long sys_signalfd4 .long sys_eventfd2 - .long sys_epoll_create2 + .long sys_epoll_create1 .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fdad420404..0c87474f791 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1046,7 +1046,7 @@ retry: * RB tree. With the current implementation, the "size" parameter is ignored * (besides sanity checks). */ -asmlinkage long sys_epoll_create2(int size, int flags) +asmlinkage long sys_epoll_create1(int flags) { int error, fd = -1; struct eventpoll *ep; @@ -1058,14 +1058,13 @@ asmlinkage long sys_epoll_create2(int size, int flags) return -EINVAL; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", - current, size)); + current, flags)); /* - * Sanity check on the size parameter, and create the internal data - * structure ( "struct eventpoll" ). + * Create the internal data structure ( "struct eventpoll" ). 
*/ - error = -EINVAL; - if (size <= 0 || (error = ep_alloc(&ep)) < 0) { + error = ep_alloc(&ep); + if (error < 0) { fd = error; goto error_return; } @@ -1081,14 +1080,17 @@ asmlinkage long sys_epoll_create2(int size, int flags) error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, size, fd)); + current, flags, fd)); return fd; } asmlinkage long sys_epoll_create(int size) { - return sys_epoll_create2(size, 0); + if (size < 0) + return -EINVAL; + + return sys_epoll_create1(0); } /* diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index b3daf503ab9..d7394673b77 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -334,7 +334,7 @@ #define __NR_timerfd_gettime 326 #define __NR_signalfd4 327 #define __NR_eventfd2 328 -#define __NR_epoll_create2 329 +#define __NR_epoll_create1 329 #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index c8cb88d70c6..3a341d79179 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -645,8 +645,8 @@ __SYSCALL(__NR_paccept, sys_paccept) __SYSCALL(__NR_signalfd4, sys_signalfd4) #define __NR_eventfd2 290 __SYSCALL(__NR_eventfd2, sys_eventfd2) -#define __NR_epoll_create2 291 -__SYSCALL(__NR_epoll_create2, sys_epoll_create2) +#define __NR_epoll_create1 291 +__SYSCALL(__NR_epoll_create1, sys_epoll_create1) #define __NR_dup3 292 __SYSCALL(__NR_dup3, sys_dup3) #define __NR_pipe2 293 diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 1cfaa40059c..f1e1d3c4712 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -18,7 +18,7 @@ #include #include -/* Flags for epoll_create2. */ +/* Flags for epoll_create1. */ #define EPOLL_CLOEXEC O_CLOEXEC /* Valid opcodes to issue to sys_epoll_ctl() */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 93a7e7f017a..06f2bf76c03 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -431,7 +431,7 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp); asmlinkage long sys_epoll_create(int size); -asmlinkage long sys_epoll_create2(int size, int flags); +asmlinkage long sys_epoll_create1(int flags); asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event); asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, -- cgit v1.2.3-70-g09d2 From 746f1e558bc52b9693c1a1ecdab60f8392e5ff18 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Wed, 23 Jul 2008 21:30:02 -0700 Subject: eCryptfs: Privileged kthread for lower file opens eCryptfs would really like to have read-write access to all files in the lower filesystem. Right now, the persistent lower file may be opened read-only if the attempt to open it read-write fails. One way to keep from having to do that is to have a privileged kthread that can open the lower persistent file on behalf of the user opening the eCryptfs file; this patch implements this functionality. This patch will properly allow a less-privileged user to open the eCryptfs file, followed by a more-privileged user opening the eCryptfs file, with the first user only being able to read and the second user being able to both read and write. eCryptfs currently does this wrong; it will wind up calling vfs_write() on a file that was opened read-only. 
This is fixed in this patch. Signed-off-by: Michael Halcrow Cc: Dave Kleikamp Cc: Serge Hallyn Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/Makefile | 2 +- fs/ecryptfs/ecryptfs_kernel.h | 19 ++++ fs/ecryptfs/file.c | 7 ++ fs/ecryptfs/kthread.c | 203 ++++++++++++++++++++++++++++++++++++++++++ fs/ecryptfs/main.c | 42 ++++----- 5 files changed, 251 insertions(+), 22 deletions(-) create mode 100644 fs/ecryptfs/kthread.c (limited to 'fs') diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 1e34a7fd488..b4755a85996 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c15c25745e0..b4a0cccfdd7 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -559,6 +559,20 @@ extern struct kmem_cache *ecryptfs_key_record_cache; extern struct kmem_cache *ecryptfs_key_sig_cache; extern struct kmem_cache *ecryptfs_global_auth_tok_cache; extern struct kmem_cache *ecryptfs_key_tfm_cache; +extern struct kmem_cache *ecryptfs_open_req_cache; + +struct ecryptfs_open_req { +#define ECRYPTFS_REQ_PROCESSED 0x00000001 +#define ECRYPTFS_REQ_DROPPED 0x00000002 +#define ECRYPTFS_REQ_ZOMBIE 0x00000004 + u32 flags; + struct file **lower_file; + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + wait_queue_head_t wait; + struct mutex mux; + struct list_head kthread_ctl_list; +}; int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, @@ -690,5 +704,10 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_init_kthread(void); +void ecryptfs_destroy_kthread(void); +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 24749bf0668..f0be2905152 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -192,6 +192,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) + && !(file->f_flags & O_RDONLY)) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out; + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c new file mode 100644 index 00000000000..c440c6b58b2 --- /dev/null +++ b/fs/ecryptfs/kthread.c @@ -0,0 +1,203 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. 
Halcrow + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include +#include +#include +#include +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_open_req_cache; + +static struct ecryptfs_kthread_ctl { +#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001 + u32 flags; + struct mutex mux; + struct list_head req_list; + wait_queue_head_t wait; +} ecryptfs_kthread_ctl; + +static struct task_struct *ecryptfs_kthread; + +/** + * ecryptfs_threadfn + * @ignored: ignored + * + * The eCryptfs kernel thread that has the responsibility of getting + * the lower persistent file with RW permissions. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_threadfn(void *ignored) +{ + set_freezable(); + while (1) { + struct ecryptfs_open_req *req; + + wait_event_freezable( + ecryptfs_kthread_ctl.wait, + (!list_empty(&ecryptfs_kthread_ctl.req_list) + || kthread_should_stop())); + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + mutex_unlock(&ecryptfs_kthread_ctl.mux); + goto out; + } + while (!list_empty(&ecryptfs_kthread_ctl.req_list)) { + req = list_first_entry(&ecryptfs_kthread_ctl.req_list, + struct ecryptfs_open_req, + kthread_ctl_list); + mutex_lock(&req->mux); + list_del(&req->kthread_ctl_list); + if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) { + dget(req->lower_dentry); + mntget(req->lower_mnt); + (*req->lower_file) = dentry_open( + req->lower_dentry, req->lower_mnt, + (O_RDWR | O_LARGEFILE)); + req->flags |= ECRYPTFS_REQ_PROCESSED; + } + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + } +out: + return 0; +} + +int ecryptfs_init_kthread(void) +{ + int rc = 0; + + mutex_init(&ecryptfs_kthread_ctl.mux); + init_waitqueue_head(&ecryptfs_kthread_ctl.wait); + INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list); + ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL, + "ecryptfs-kthread"); + if (IS_ERR(ecryptfs_kthread)) { + rc = PTR_ERR(ecryptfs_kthread); + printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]" + "\n", __func__, rc); + } + return rc; +} + +void ecryptfs_destroy_kthread(void) +{ + struct ecryptfs_open_req *req; + + mutex_lock(&ecryptfs_kthread_ctl.mux); + ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; + list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { + mutex_lock(&req->mux); + req->flags |= ECRYPTFS_REQ_ZOMBIE; + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + kthread_stop(ecryptfs_kthread); + wake_up(&ecryptfs_kthread_ctl.wait); +} + +/** + * ecryptfs_privileged_open + * @lower_file: Result of dentry_open by root on lower dentry + * @lower_dentry: Lower dentry for file to open + * @lower_mnt: Lower vfsmount for file to open + * + * This function gets a r/w file opened againt the lower dentry. 
+ * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt) +{ + struct ecryptfs_open_req *req; + int rc = 0; + + /* Corresponding dput() and mntput() are done when the + * persistent file is fput() when the eCryptfs inode is + * destroyed. */ + dget(lower_dentry); + mntget(lower_mnt); + (*lower_file) = dentry_open(lower_dentry, lower_mnt, + (O_RDWR | O_LARGEFILE)); + if (!IS_ERR(*lower_file)) + goto out; + req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + goto out; + } + mutex_init(&req->mux); + req->lower_file = lower_file; + req->lower_dentry = lower_dentry; + req->lower_mnt = lower_mnt; + init_waitqueue_head(&req->wait); + req->flags = 0; + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + rc = -EIO; + mutex_unlock(&ecryptfs_kthread_ctl.mux); + printk(KERN_ERR "%s: We are in the middle of shutting down; " + "aborting privileged request to open lower file\n", + __func__); + goto out_free; + } + list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); + mutex_unlock(&ecryptfs_kthread_ctl.mux); + wake_up(&ecryptfs_kthread_ctl.wait); + wait_event(req->wait, (req->flags != 0)); + mutex_lock(&req->mux); + BUG_ON(req->flags == 0); + if (req->flags & ECRYPTFS_REQ_DROPPED + || req->flags & ECRYPTFS_REQ_ZOMBIE) { + rc = -EIO; + printk(KERN_WARNING "%s: Privileged open request dropped\n", + __func__); + goto out_unlock; + } + if (IS_ERR(*req->lower_file)) { + rc = PTR_ERR(*req->lower_file); + dget(lower_dentry); + mntget(lower_mnt); + (*lower_file) = dentry_open(lower_dentry, lower_mnt, + (O_RDONLY | O_LARGEFILE)); + if (IS_ERR(*lower_file)) { + rc = PTR_ERR(*req->lower_file); + (*lower_file) = NULL; + printk(KERN_WARNING "%s: Error attempting privileged " + "open of lower file with either RW or RO " + "perms; rc = [%d]. Giving up.\n", + __func__, rc); + } + } +out_unlock: + mutex_unlock(&req->mux); +out_free: + kmem_cache_free(ecryptfs_open_req_cache, req); +out: + return rc; +} diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d603631601e..f36ab2feea2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -130,26 +130,12 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry); lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); - /* Corresponding dput() and mntput() are done when the - * persistent file is fput() when the eCryptfs inode - * is destroyed. 
*/ - dget(lower_dentry); - mntget(lower_mnt); - inode_info->lower_file = dentry_open(lower_dentry, - lower_mnt, - (O_RDWR | O_LARGEFILE)); - if (IS_ERR(inode_info->lower_file)) { - dget(lower_dentry); - mntget(lower_mnt); - inode_info->lower_file = dentry_open(lower_dentry, - lower_mnt, - (O_RDONLY - | O_LARGEFILE)); - } - if (IS_ERR(inode_info->lower_file)) { + rc = ecryptfs_privileged_open(&inode_info->lower_file, + lower_dentry, lower_mnt); + if (rc || IS_ERR(inode_info->lower_file)) { printk(KERN_ERR "Error opening lower persistent file " - "for lower_dentry [0x%p] and lower_mnt [0x%p]\n", - lower_dentry, lower_mnt); + "for lower_dentry [0x%p] and lower_mnt [0x%p]; " + "rc = [%d]\n", lower_dentry, lower_mnt, rc); rc = PTR_ERR(inode_info->lower_file); inode_info->lower_file = NULL; } @@ -679,6 +665,11 @@ static struct ecryptfs_cache_info { .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, + { + .cache = &ecryptfs_open_req_cache, + .name = "ecryptfs_open_req_cache", + .size = sizeof(struct ecryptfs_open_req), + }, }; static void ecryptfs_free_kmem_caches(void) @@ -795,11 +786,17 @@ static int __init ecryptfs_init(void) printk(KERN_ERR "sysfs registration failed\n"); goto out_unregister_filesystem; } + rc = ecryptfs_init_kthread(); + if (rc) { + printk(KERN_ERR "%s: kthread initialization failed; " + "rc = [%d]\n", __func__, rc); + goto out_do_sysfs_unregistration; + } rc = ecryptfs_init_messaging(ecryptfs_transport); if (rc) { - ecryptfs_printk(KERN_ERR, "Failure occured while attempting to " + printk(KERN_ERR "Failure occured while attempting to " "initialize the eCryptfs netlink socket\n"); - goto out_do_sysfs_unregistration; + goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); if (rc) { @@ -814,6 +811,8 @@ static int __init ecryptfs_init(void) goto out; out_release_messaging: ecryptfs_release_messaging(ecryptfs_transport); +out_destroy_kthread: + ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: do_sysfs_unregistration(); out_unregister_filesystem: @@ -833,6 +832,7 @@ static void __exit ecryptfs_exit(void) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); -- cgit v1.2.3-70-g09d2 From 6c4c17b073cd4a5a61bc04329561632870bb21fc Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 23 Jul 2008 21:30:04 -0700 Subject: ecryptfs: discard ecryptfsd registration messages in miscdev The userspace eCryptfs daemon sends HELO and QUIT messages to the kernel for per-user daemon (un)registration. These messages are required when netlink is used as the transport, but (un)registration is handled by opening and closing the device file when miscdev is the transport. These messages should be discarded in the miscdev transport so that a daemon isn't registered twice. 
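For readers following the miscdev path, a minimal userspace-style sketch of the control flow this change leaves behind is shown below; the enum and function names are stand-ins for illustration, not the kernel's symbols, mirroring only the ECRYPTFS_MSG_HELO / ECRYPTFS_MSG_QUIT cases that now fall through in ecryptfs_miscdev_write().

        #include <stdio.h>

        enum pkt_type { PKT_RESPONSE, PKT_HELO, PKT_QUIT };

        /* Registration is implied by opening the misc device and
         * unregistration by releasing it, so HELO/QUIT packets are
         * accepted but ignored rather than processed a second time. */
        static int handle_daemon_packet(enum pkt_type type)
        {
                switch (type) {
                case PKT_RESPONSE:
                        printf("process response payload\n");
                        break;
                case PKT_HELO:
                case PKT_QUIT:
                        break;          /* silently discard */
                default:
                        printf("dropping unrecognized packet\n");
                }
                return 0;
        }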
Signed-off-by: Tyler Hicks Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/miscdev.c | 59 --------------------------------------------------- 1 file changed, 59 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 09a4522f65e..b484792a099 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -357,46 +357,6 @@ out_unlock_daemon: return rc; } -/** - * ecryptfs_miscdev_helo - * @euid: effective user id of miscdevess sending helo packet - * @user_ns: The namespace in which @euid applies - * @pid: miscdevess id of miscdevess sending helo packet - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - int rc; - - rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns, - pid); - if (rc) - printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_miscdev_quit - * @euid: effective user id of miscdevess sending quit packet - * @user_ns: The namespace in which @euid applies - * @pid: miscdevess id of miscdevess sending quit packet - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - int rc; - - rc = ecryptfs_process_quit(euid, user_ns, pid); - if (rc) - printk(KERN_WARNING - "Error processing QUIT message; rc = [%d]\n", rc); - return rc; -} - /** * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon * @data: Bytes comprising struct ecryptfs_message @@ -512,26 +472,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, __func__, rc); break; case ECRYPTFS_MSG_HELO: - rc = ecryptfs_miscdev_helo(current->euid, - current->nsproxy->user_ns, - task_pid(current)); - if (rc) { - printk(KERN_ERR "%s: Error attempting to process " - "helo from pid [0x%p]; rc = [%d]\n", __func__, - task_pid(current), rc); - goto out_free; - } - break; case ECRYPTFS_MSG_QUIT: - rc = ecryptfs_miscdev_quit(current->euid, - current->nsproxy->user_ns, - task_pid(current)); - if (rc) { - printk(KERN_ERR "%s: Error attempting to process " - "quit from pid [0x%p]; rc = [%d]\n", __func__, - task_pid(current), rc); - goto out_free; - } break; default: ecryptfs_printk(KERN_WARNING, "Dropping miscdev " -- cgit v1.2.3-70-g09d2 From 982363c97f8cad7aea4c3d2cfebffc1cc2d2f166 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 23 Jul 2008 21:30:04 -0700 Subject: ecryptfs: propagate key errors up at mount time Mounting with invalid key signatures should probably fail, if they were specifically requested but not available. Also fix case checks in process_request_key_err() for the right sign of the errnos, as spotted by Jan Tluka. 
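Condensed, the corrected helper looks like the sketch below (simplified and buildable as plain Linux userspace C; the real function logs through ecryptfs_printk and handles further cases elided here). The point is that request_key()-style failures arrive as negative errno values, so the switch must match -ENOKEY and friends, and the caller should return the mapped code instead of overwriting it with -EINVAL.

        #include <errno.h>
        #include <stdio.h>

        static int process_request_key_err(long err_code)
        {
                switch (err_code) {
                case -ENOKEY:
                        fprintf(stderr, "No key\n");
                        return -ENOENT;
                case -EKEYEXPIRED:
                        fprintf(stderr, "Key expired\n");
                        return -ETIME;
                case -EKEYREVOKED:
                        fprintf(stderr, "Key revoked\n");
                        return -EINVAL;
                default:
                        /* remaining cases elided in this sketch */
                        return -EINVAL;
                }
        }

        /* Caller side: rc = process_request_key_err(err); propagate rc on
         * failure instead of forcing rc = -EINVAL, so the mount attempt
         * fails with the real reason. */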
Signed-off-by: Eric Sandeen Reviewed-by: Jan Tluka Acked-by: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/keystore.c | 9 ++++----- fs/ecryptfs/main.c | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e82b457180b..f5b76a331b9 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -44,15 +44,15 @@ static int process_request_key_err(long err_code) int rc = 0; switch (err_code) { - case ENOKEY: + case -ENOKEY: ecryptfs_printk(KERN_WARNING, "No key\n"); rc = -ENOENT; break; - case EKEYEXPIRED: + case -EKEYEXPIRED: ecryptfs_printk(KERN_WARNING, "Key expired\n"); rc = -ETIME; break; - case EKEYREVOKED: + case -EKEYREVOKED: ecryptfs_printk(KERN_WARNING, "Key revoked\n"); rc = -EINVAL; break; @@ -963,8 +963,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { printk(KERN_ERR "Could not find key with description: [%s]\n", sig); - process_request_key_err(PTR_ERR(*auth_tok_key)); - rc = -EINVAL; + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); goto out; } (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index f36ab2feea2..8876fe7c76e 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -248,10 +248,11 @@ static int ecryptfs_init_global_auth_toks( "session keyring for sig specified in mount " "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; - rc = 0; + goto out; } else global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; } +out: return rc; } @@ -416,7 +417,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); } - rc = 0; out: return rc; } -- cgit v1.2.3-70-g09d2 From 8f2368095e25018838e1bf145041f58270ccd32e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Jul 2008 21:30:05 -0700 Subject: ecryptfs: string copy cleanup Clean up overcomplicated string copy, which also gets rid of this bogus warning: fs/ecryptfs/main.c: In function 'ecryptfs_parse_options': include/asm/arch/string_32.h:75: warning: array subscript is above array bounds Signed-off-by: Miklos Szeredi Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/main.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 8876fe7c76e..10475d93ff5 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -301,7 +301,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) char *cipher_name_dst; char *cipher_name_src; char *cipher_key_bytes_src; - int cipher_name_len; if (!options) { rc = -EINVAL; @@ -382,17 +381,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) goto out; } if (!cipher_name_set) { - cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); - if (unlikely(cipher_name_len - >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) { - rc = -EINVAL; - BUG(); - goto out; - } - memcpy(mount_crypt_stat->global_default_cipher_name, - ECRYPTFS_DEFAULT_CIPHER, cipher_name_len); - mount_crypt_stat->global_default_cipher_name[cipher_name_len] - = '\0'; + int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); + + BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + + 
strcpy(mount_crypt_stat->global_default_cipher_name, + ECRYPTFS_DEFAULT_CIPHER); } if (!cipher_key_bytes_set) { mount_crypt_stat->global_default_cipher_key_size = 0; -- cgit v1.2.3-70-g09d2 From 29335c6a41568d4708d4ec3b9187f9b6d302e5ea Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 23 Jul 2008 21:30:06 -0700 Subject: ecryptfs: crypto.c use unaligned byteorder helpers Fixes the following sparse warnings: fs/ecryptfs/crypto.c:1036:8: warning: cast to restricted __be32 fs/ecryptfs/crypto.c:1038:8: warning: cast to restricted __be32 fs/ecryptfs/crypto.c:1077:10: warning: cast to restricted __be32 fs/ecryptfs/crypto.c:1103:6: warning: incorrect type in assignment (different base types) fs/ecryptfs/crypto.c:1105:6: warning: incorrect type in assignment (different base types) fs/ecryptfs/crypto.c:1124:8: warning: incorrect type in assignment (different base types) fs/ecryptfs/crypto.c:1241:21: warning: incorrect type in assignment (different base types) fs/ecryptfs/crypto.c:1244:30: warning: incorrect type in assignment (different base types) fs/ecryptfs/crypto.c:1414:23: warning: cast to restricted __be32 fs/ecryptfs/crypto.c:1417:32: warning: cast to restricted __be16 Signed-off-by: Harvey Harrison Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e2832bc7869..7b99917ffad 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" static int @@ -1032,10 +1033,8 @@ static int contains_ecryptfs_marker(char *data) { u32 m_1, m_2; - memcpy(&m_1, data, 4); - m_1 = be32_to_cpu(m_1); - memcpy(&m_2, (data + 4), 4); - m_2 = be32_to_cpu(m_2); + m_1 = get_unaligned_be32(data); + m_2 = get_unaligned_be32(data + 4); if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) return 1; ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " @@ -1073,8 +1072,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, int i; u32 flags; - memcpy(&flags, page_virt, 4); - flags = be32_to_cpu(flags); + flags = get_unaligned_be32(page_virt); for (i = 0; i < ((sizeof(ecryptfs_flag_map) / sizeof(struct ecryptfs_flag_map_elem))); i++) if (flags & ecryptfs_flag_map[i].file_flag) { @@ -1100,11 +1098,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written) get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); - m_1 = cpu_to_be32(m_1); - memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); - m_2 = cpu_to_be32(m_2); - memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2, - (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + put_unaligned_be32(m_1, page_virt); + page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2); + put_unaligned_be32(m_2, page_virt); (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; } @@ -1121,8 +1117,7 @@ write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, flags |= ecryptfs_flag_map[i].file_flag; /* Version is in top 8 bits of the 32-bit flag vector */ flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); - flags = cpu_to_be32(flags); - memcpy(page_virt, &flags, 4); + put_unaligned_be32(flags, page_virt); (*written) = 4; } @@ -1238,11 +1233,9 @@ ecryptfs_write_header_metadata(char *virt, num_header_extents_at_front = (u16)(crypt_stat->num_header_bytes_at_front / 
crypt_stat->extent_size); - header_extent_size = cpu_to_be32(header_extent_size); - memcpy(virt, &header_extent_size, 4); + put_unaligned_be32(header_extent_size, virt); virt += 4; - num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front); - memcpy(virt, &num_header_extents_at_front, 2); + put_unaligned_be16(num_header_extents_at_front, virt); (*written) = 6; } @@ -1410,15 +1403,13 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, u32 header_extent_size; u16 num_header_extents_at_front; - memcpy(&header_extent_size, virt, sizeof(u32)); - header_extent_size = be32_to_cpu(header_extent_size); - virt += sizeof(u32); - memcpy(&num_header_extents_at_front, virt, sizeof(u16)); - num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front); + header_extent_size = get_unaligned_be32(virt); + virt += sizeof(__be32); + num_header_extents_at_front = get_unaligned_be16(virt); crypt_stat->num_header_bytes_at_front = (((size_t)num_header_extents_at_front * (size_t)header_extent_size)); - (*bytes_read) = (sizeof(u32) + sizeof(u16)); + (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) && (crypt_stat->num_header_bytes_at_front < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { -- cgit v1.2.3-70-g09d2 From 0a688ad713949643e201431d3f4a4ceddfeb70ca Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 23 Jul 2008 21:30:07 -0700 Subject: ecryptfs: inode.c mmap.c use unaligned byteorder helpers Fixe sparse warnings: fs/ecryptfs/inode.c:368:15: warning: cast to restricted __be64 fs/ecryptfs/mmap.c:385:12: warning: incorrect type in assignment (different base types) fs/ecryptfs/mmap.c:385:12: expected unsigned long long [unsigned] [assigned] [usertype] file_size fs/ecryptfs/mmap.c:385:12: got restricted __be64 [usertype] fs/ecryptfs/mmap.c:428:12: warning: incorrect type in assignment (different base types) fs/ecryptfs/mmap.c:428:12: expected unsigned long long [unsigned] [assigned] [usertype] file_size fs/ecryptfs/mmap.c:428:12: got restricted __be64 [usertype] Signed-off-by: Harvey Harrison Cc: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/inode.c | 4 ++-- fs/ecryptfs/mmap.c | 11 +++-------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c92cc1c00aa..7315547193e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" static struct dentry *lock_parent(struct dentry *dentry) @@ -364,8 +365,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, else file_size = i_size_read(lower_dentry->d_inode); } else { - memcpy(&file_size, page_virt, sizeof(file_size)); - file_size = be64_to_cpu(file_size); + file_size = get_unaligned_be64(page_virt); } i_size_write(dentry->d_inode, (loff_t)file_size); kmem_cache_free(ecryptfs_header_cache_2, page_virt); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2b6fe1e6e8b..245c2dc02d5 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" /** @@ -372,7 +373,6 @@ out: */ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) { - u64 file_size; char *file_size_virt; int rc; @@ -381,9 +381,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - file_size = 
(u64)i_size_read(ecryptfs_inode); - file_size = cpu_to_be64(file_size); - memcpy(file_size_virt, &file_size, sizeof(u64)); + put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt); rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, sizeof(u64)); kfree(file_size_virt); @@ -403,7 +401,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) struct dentry *lower_dentry = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; struct inode *lower_inode = lower_dentry->d_inode; - u64 file_size; int rc; if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { @@ -424,9 +421,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) xattr_virt, PAGE_CACHE_SIZE); if (size < 0) size = 8; - file_size = (u64)i_size_read(ecryptfs_inode); - file_size = cpu_to_be64(file_size); - memcpy(xattr_virt, &file_size, sizeof(u64)); + put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); mutex_unlock(&lower_inode->i_mutex); -- cgit v1.2.3-70-g09d2 From 72b55fffd631a89e5be6fe1b4f2565bc4cd90deb Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Wed, 23 Jul 2008 21:30:07 -0700 Subject: eCryptfs: do not try to open device files on mknod When creating device nodes, eCryptfs needs to delay actually opening the lower persistent file until an application tries to open. Device handles may not be backed by anything when they first come into existence. [Valdis.Kletnieks@vt.edu: build fix] Signed-off-by: Michael Halcrow Cc: Signed-off-by: Linus Torvalds --- fs/ecryptfs/ecryptfs_kernel.h | 6 +++++- fs/ecryptfs/file.c | 14 ++++++++++++++ fs/ecryptfs/inode.c | 6 ++++-- fs/ecryptfs/main.c | 29 +++++++++++++++++++---------- 4 files changed, 42 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b4a0cccfdd7..b0727f91454 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -235,6 +235,7 @@ struct ecryptfs_crypt_stat { #define ECRYPTFS_METADATA_IN_XATTR 0x00000100 #define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 #define ECRYPTFS_KEY_SET 0x00000400 +#define ECRYPTFS_DELAY_PERSISTENT 0x00000800 u32 flags; unsigned int file_version; size_t iv_bytes; @@ -574,9 +575,11 @@ struct ecryptfs_open_req { struct list_head kthread_ctl_list; }; +#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001 +#define ECRYPTFS_INTERPOSE_FLAG_DELAY_PERSISTENT_FILE 0x00000002 int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, - int flag); + u32 flags); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, const char *name, int length, @@ -709,5 +712,6 @@ void ecryptfs_destroy_kthread(void); int ecryptfs_privileged_open(struct file **lower_file, struct dentry *lower_dentry, struct vfsmount *lower_mnt); +int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index f0be2905152..2c2d60df3f6 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -199,6 +199,20 @@ static int ecryptfs_open(struct inode *inode, struct file *file) "file must hence be opened RO\n", __func__); goto out; } + if (!ecryptfs_inode_to_private(inode)->lower_file) { + BUG_ON(!(crypt_stat->flags & ECRYPTFS_DELAY_PERSISTENT)); + mutex_lock(&crypt_stat->cs_mutex); + 
crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 7315547193e..26090878c93 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -308,7 +308,8 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, d_add(dentry, NULL); goto out; } - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, + ECRYPTFS_INTERPOSE_FLAG_D_ADD); if (rc) { ecryptfs_printk(KERN_ERR, "Error interposing\n"); goto out_dput; @@ -537,7 +538,8 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); if (rc || !lower_dentry->d_inode) goto out; - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, + ECRYPTFS_INTERPOSE_FLAG_DELAY_PERSISTENT_FILE); if (rc) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 10475d93ff5..ee4f84b2041 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -117,7 +117,7 @@ void __ecryptfs_printk(const char *fmt, ...) * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) +int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) { struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); @@ -149,14 +149,14 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) * @lower_dentry: Existing dentry in the lower filesystem * @dentry: ecryptfs' dentry * @sb: ecryptfs's super_block - * @flag: If set to true, then d_add is called, else d_instantiate is called + * @flags: flags to govern behavior of interpose procedure * * Interposes upper and lower dentries. 
* * Returns zero on success; non-zero otherwise */ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, - struct super_block *sb, int flag) + struct super_block *sb, u32 flags) { struct inode *lower_inode; struct inode *inode; @@ -193,7 +193,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - if (flag) + if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); else d_instantiate(dentry, inode); @@ -201,12 +201,21 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); - rc = ecryptfs_init_persistent_file(dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize the " - "persistent file for the dentry with name [%s]; " - "rc = [%d]\n", __func__, dentry->d_name.name, rc); - goto out; + if (!(flags & ECRYPTFS_INTERPOSE_FLAG_DELAY_PERSISTENT_FILE)) { + rc = ecryptfs_init_persistent_file(dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + dentry->d_name.name, rc); + goto out; + } + } else { + struct ecryptfs_inode_info *inode_info = + ecryptfs_inode_to_private(dentry->d_inode); + + inode_info->lower_file = NULL; + inode_info->crypt_stat.flags |= ECRYPTFS_DELAY_PERSISTENT; } out: return rc; -- cgit v1.2.3-70-g09d2 From 391b52f98cf2e9bff227dad8bf9ea206fec43fa4 Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Wed, 23 Jul 2008 21:30:08 -0700 Subject: eCryptfs: Make all persistent file opens delayed There is no good reason to immediately open the lower file, and that can cause problems with files that the user does not intend to immediately open, such as device nodes. This patch removes the persistent file open from the interpose step and pushes that to the locations where eCryptfs really does need the lower persistent file, such as just before reading or writing the metadata stored in the lower file header. Two functions are jumping to out_dput when they should just be jumping to out on error paths. This patch also fixes these. 
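The resulting pattern at each call site is plain lazy initialization: open the lower persistent file only at the first point that actually needs it. A rough illustration follows, using stdio as a stand-in for the lower struct file; the structure and helper names are invented for the sketch, not the eCryptfs API.

        #include <stdio.h>

        struct inode_priv {
                FILE *lower_file;       /* stands in for the persistent lower file */
                const char *lower_path;
        };

        static int init_persistent_file(struct inode_priv *priv)
        {
                priv->lower_file = fopen(priv->lower_path, "r+");
                return priv->lower_file ? 0 : -1;
        }

        /* open(), metadata read and metadata write each guard with a NULL
         * check instead of assuming interpose already opened the file. */
        static int ensure_lower_file(struct inode_priv *priv)
        {
                if (!priv->lower_file)
                        return init_persistent_file(priv);
                return 0;
        }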
Signed-off-by: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/ecryptfs_kernel.h | 2 -- fs/ecryptfs/file.c | 4 ---- fs/ecryptfs/inode.c | 27 +++++++++++++++++++++++---- fs/ecryptfs/main.c | 16 ---------------- 4 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b0727f91454..b73fb752c5f 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -235,7 +235,6 @@ struct ecryptfs_crypt_stat { #define ECRYPTFS_METADATA_IN_XATTR 0x00000100 #define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 #define ECRYPTFS_KEY_SET 0x00000400 -#define ECRYPTFS_DELAY_PERSISTENT 0x00000800 u32 flags; unsigned int file_version; size_t iv_bytes; @@ -576,7 +575,6 @@ struct ecryptfs_open_req { }; #define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001 -#define ECRYPTFS_INTERPOSE_FLAG_DELAY_PERSISTENT_FILE 0x00000002 int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, u32 flags); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2c2d60df3f6..9244d653743 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -200,10 +200,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) goto out; } if (!ecryptfs_inode_to_private(inode)->lower_file) { - BUG_ON(!(crypt_stat->flags & ECRYPTFS_DELAY_PERSISTENT)); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 26090878c93..d755455e3bf 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -189,6 +189,16 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } + if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + } rc = ecryptfs_write_metadata(ecryptfs_dentry); if (rc) { printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); @@ -312,7 +322,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, ECRYPTFS_INTERPOSE_FLAG_D_ADD); if (rc) { ecryptfs_printk(KERN_ERR, "Error interposing\n"); - goto out_dput; + goto out; } if (S_ISDIR(lower_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); @@ -338,11 +348,21 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Cannot ecryptfs_kmalloc a page\n"); - goto out_dput; + goto out; } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) ecryptfs_set_default_sizes(crypt_stat); + if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + dentry->d_name.name, rc); + goto out; + } + } rc = ecryptfs_read_and_validate_header_region(page_virt, dentry->d_inode); if (rc) { @@ -538,8 +558,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int 
mode, dev_t dev) rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); if (rc || !lower_dentry->d_inode) goto out; - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, - ECRYPTFS_INTERPOSE_FLAG_DELAY_PERSISTENT_FILE); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); if (rc) goto out; fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index ee4f84b2041..6f403cfba14 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -201,22 +201,6 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); - if (!(flags & ECRYPTFS_INTERPOSE_FLAG_DELAY_PERSISTENT_FILE)) { - rc = ecryptfs_init_persistent_file(dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - dentry->d_name.name, rc); - goto out; - } - } else { - struct ecryptfs_inode_info *inode_info = - ecryptfs_inode_to_private(dentry->d_inode); - - inode_info->lower_file = NULL; - inode_info->crypt_stat.flags |= ECRYPTFS_DELAY_PERSISTENT; - } out: return rc; } -- cgit v1.2.3-70-g09d2 From 5f6f4f28b6ba543beef8bad91aa6f69c7ffeee51 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:09 -0700 Subject: autofs4: don't make expiring dentry negative Correct the error of making a positive dentry negative after it has been instantiated. The code that makes this error attempts to re-use the dentry from a concurrent expire and mount to resolve a race and the dentry used for the lookup must be negative for mounts to trigger in the required cases. The fact is that the dentry doesn't need to be re-used because all that is needed is to preserve the flag that indicates an expire is still incomplete at the time of the mount request. This change uses the the dentry to check the flag and wait for the expire to complete then discards it instead of attempting to re-use it. 
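The new lookup-side flow reduces to: wait for the racing expire to complete, then drop the temporary reference, leaving the found dentry untouched. Sketched below with made-up types; the real code blocks in autofs4_wait() under the dcache and lookup locks, which are omitted here for brevity.

        #include <stdbool.h>

        struct expiring_entry {
                bool expiring;          /* stands in for AUTOFS_INF_EXPIRING */
                int refcount;
        };

        /* Stub so the sketch is self-contained; the kernel blocks in
         * autofs4_wait() until the daemon finishes the expire. */
        static void wait_for_expire_completion(struct expiring_entry *e)
        {
                e->expiring = false;
        }

        static void put_entry(struct expiring_entry *e)
        {
                e->refcount--;
        }

        static void handle_racing_expire(struct expiring_entry *e)
        {
                while (e && e->expiring)
                        wait_for_expire_completion(e);
                if (e)
                        put_entry(e);   /* discard; never made negative or reused */
        }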
Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 6 +-- fs/autofs4/inode.c | 6 +-- fs/autofs4/root.c | 118 ++++++++++++++++++++------------------------------ 3 files changed, 52 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c3d352d7fa9..69b1497b002 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -52,7 +52,7 @@ struct autofs_info { int flags; - struct list_head rehash; + struct list_head expiring; struct autofs_sb_info *sbi; unsigned long last_used; @@ -112,8 +112,8 @@ struct autofs_sb_info { struct mutex wq_mutex; spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ - spinlock_t rehash_lock; - struct list_head rehash_list; + spinlock_t lookup_lock; + struct list_head expiring_list; }; static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 2fdcf5e1d23..94bfc154d7a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -47,7 +47,7 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, ino->dentry = NULL; ino->size = 0; - INIT_LIST_HEAD(&ino->rehash); + INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; atomic_set(&ino->count, 0); @@ -338,8 +338,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) mutex_init(&sbi->wq_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; - spin_lock_init(&sbi->rehash_lock); - INIT_LIST_HEAD(&sbi->rehash_list); + spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->expiring_list); s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index edf5b6bddb5..9ead2279df4 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -493,10 +493,10 @@ void autofs4_dentry_release(struct dentry *de) struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); if (sbi) { - spin_lock(&sbi->rehash_lock); - if (!list_empty(&inf->rehash)) - list_del(&inf->rehash); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&inf->expiring)) + list_del(&inf->expiring); + spin_unlock(&sbi->lookup_lock); } inf->dentry = NULL; @@ -518,7 +518,7 @@ static struct dentry_operations autofs4_dentry_operations = { .d_release = autofs4_dentry_release, }; -static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -526,14 +526,14 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct struct list_head *p, *head; spin_lock(&dcache_lock); - spin_lock(&sbi->rehash_lock); - head = &sbi->rehash_list; + spin_lock(&sbi->lookup_lock); + head = &sbi->expiring_list; list_for_each(p, head) { struct autofs_info *ino; struct dentry *dentry; struct qstr *qstr; - ino = list_entry(p, struct autofs_info, rehash); + ino = list_entry(p, struct autofs_info, expiring); dentry = ino->dentry; spin_lock(&dentry->d_lock); @@ -555,33 +555,16 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct goto next; if (d_unhashed(dentry)) { - struct inode *inode = dentry->d_inode; - - ino = autofs4_dentry_ino(dentry); - list_del_init(&ino->rehash); dget(dentry); - /* - * Make the rehashed dentry negative so the VFS - * behaves as it should. 
- */ - if (inode) { - dentry->d_inode = NULL; - list_del_init(&dentry->d_alias); - spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->rehash_lock); - spin_unlock(&dcache_lock); - iput(inode); - return dentry; - } spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->rehash_lock); + spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); return dentry; } next: spin_unlock(&dentry->d_lock); } - spin_unlock(&sbi->rehash_lock); + spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); return NULL; @@ -591,7 +574,7 @@ next: static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; - struct dentry *unhashed; + struct dentry *expiring; int oz_mode; DPRINTK("name = %.*s", @@ -607,44 +590,44 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); - unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name); - if (!unhashed) { - /* - * Mark the dentry incomplete but don't hash it. We do this - * to serialize our inode creation operations (symlink and - * mkdir) which prevents deadlock during the callback to - * the daemon. Subsequent user space lookups for the same - * dentry are placed on the wait queue while the daemon - * itself is allowed passage unresticted so the create - * operation itself can then hash the dentry. Finally, - * we check for the hashed dentry and return the newly - * hashed dentry. - */ - dentry->d_op = &autofs4_root_dentry_operations; - - dentry->d_fsdata = NULL; - d_instantiate(dentry, NULL); - } else { - struct autofs_info *ino = autofs4_dentry_ino(unhashed); - DPRINTK("rehash %p with %p", dentry, unhashed); + expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name); + if (expiring) { + struct autofs_info *ino = autofs4_dentry_ino(expiring); /* * If we are racing with expire the request might not * be quite complete but the directory has been removed * so it must have been successful, so just wait for it. - * We need to ensure the AUTOFS_INF_EXPIRING flag is clear - * before continuing as revalidate may fail when calling - * try_to_fill_dentry (returning EAGAIN) if we don't. */ while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { DPRINTK("wait for incomplete expire %p name=%.*s", - unhashed, unhashed->d_name.len, - unhashed->d_name.name); - autofs4_wait(sbi, unhashed, NFY_NONE); + expiring, expiring->d_name.len, + expiring->d_name.name); + autofs4_wait(sbi, expiring, NFY_NONE); DPRINTK("request completed"); } - dentry = unhashed; + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + dput(expiring); } + /* + * Mark the dentry incomplete but don't hash it. We do this + * to serialize our inode creation operations (symlink and + * mkdir) which prevents deadlock during the callback to + * the daemon. Subsequent user space lookups for the same + * dentry are placed on the wait queue while the daemon + * itself is allowed passage unresticted so the create + * operation itself can then hash the dentry. Finally, + * we check for the hashed dentry and return the newly + * hashed dentry. 
+ */ + dentry->d_op = &autofs4_root_dentry_operations; + + dentry->d_fsdata = NULL; + d_instantiate(dentry, NULL); + if (!oz_mode) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_AUTOFS_PENDING; @@ -668,8 +651,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s if (sigismember (sigset, SIGKILL) || sigismember (sigset, SIGQUIT) || sigismember (sigset, SIGINT)) { - if (unhashed) - dput(unhashed); return ERR_PTR(-ERESTARTNOINTR); } } @@ -699,15 +680,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s else dentry = ERR_PTR(-ENOENT); - if (unhashed) - dput(unhashed); - return dentry; } - if (unhashed) - return dentry; - return NULL; } @@ -769,9 +744,8 @@ static int autofs4_dir_symlink(struct inode *dir, * that the file no longer exists. However, doing that means that the * VFS layer can turn the dentry into a negative dentry. We don't want * this, because the unlink is probably the result of an expire. - * We simply d_drop it and add it to a rehash candidates list in the - * super block, which allows the dentry lookup to reuse it retaining - * the flags, such as expire in progress, in case we're racing with expire. + * We simply d_drop it and add it to a expiring list in the super block, + * which allows the dentry lookup to check for an incomplete expire. * * If a process is blocked on the dentry waiting for the expire to finish, * it will invalidate the dentry and try to mount with a new one. @@ -801,9 +775,9 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; spin_lock(&dcache_lock); - spin_lock(&sbi->rehash_lock); - list_add(&ino->rehash, &sbi->rehash_list); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -829,9 +803,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); return -ENOTEMPTY; } - spin_lock(&sbi->rehash_lock); - list_add(&ino->rehash, &sbi->rehash_list); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); -- cgit v1.2.3-70-g09d2 From caf7da3d5d4d9dd873eb52d025d8cc63b89f1fdb Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:11 -0700 Subject: autofs4: revert - redo lookup in ttfd This patch series enables the use of a single dentry for lookups prior to the dentry being hashed and so we no longer need to redo the lookup. This patch reverts the patch of commit 033790449ba9c4dcf8478a87693d33df625c23b5. 
Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 9ead2279df4..53dabe8d5b8 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -242,7 +242,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); - struct dentry *new; int status; /* Block on any pending expiry here; invalidate the dentry @@ -320,26 +319,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - /* - * The dentry that is passed in from lookup may not be the one - * we end up using, as mkdir can create a new one. If this - * happens, and another process tries the lookup at the same time, - * it will set the PENDING flag on this new dentry, but add itself - * to our waitq. Then, if after the lookup succeeds, the first - * process that requested the mount performs another lookup of the - * same directory, it will show up as still pending! So, we need - * to redo the lookup here and clear pending on that dentry. - */ - if (d_unhashed(dentry)) { - new = d_lookup(dentry->d_parent, &dentry->d_name); - if (new) { - spin_lock(&new->d_lock); - new->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&new->d_lock); - dput(new); - } - } - return 0; } -- cgit v1.2.3-70-g09d2 From 2576737873dc1d9ea461a5955a5f6779b569a350 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:12 -0700 Subject: autofs4: use look aside list for lookups A while ago a patch to resolve a deadlock during directory creation was merged. This delayed the hashing of lookup dentrys until the ->mkdir() (or ->symlink()) operation completed to ensure we always went through ->lookup() instead of also having processes go through ->revalidate() so our VFS locking remained consistent. Now we are seeing a couple of side affects of that change in situations with heavy mount activity. Two cases have been identified: 1) When a mount request is triggered, due to the delayed hashing, the directory created by user space for the mount point doesn't have the DCACHE_AUTOFS_PENDING flag set. In the case of an autofs multi-mount where a tree of mount point directories are created this can lead to the path walk continuing rather than the dentry being sent to the wait queue to wait for request completion. This is because, if the pending flag isn't set, the criteria for deciding this is a mount in progress fails to hold, namely that the dentry is not a mount point and has no subdirectories. 2) A mount request dentry is initially created negative and unhashed. It remains this way until the ->mkdir() callback completes. Since it is unhashed a fresh dentry is used when the user space mount request creates the mount point directory. This leaves the original dentry negative and unhashed. But revalidate has no way to tell the VFS that the dentry has changed, other than to force another ->lookup() by returning false, which is at best wastefull and at worst not possible. This results in an -ENOENT return from the original path walk when in fact the mount succeeded. To resolve this we need to ensure that the same dentry is used in all calls to ->lookup() during the course of a mount request. 
This patch achieves that by adding the initial dentry to a look aside list and removes it at ->mkdir() or ->symlink() completion (or when the dentry is released), since these are the only create operations autofs4 supports. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 2 + fs/autofs4/inode.c | 25 +++++--- fs/autofs4/root.c | 169 +++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 156 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 69b1497b002..2dce2334737 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -52,6 +52,7 @@ struct autofs_info { int flags; + struct list_head active; struct list_head expiring; struct autofs_sb_info *sbi; @@ -113,6 +114,7 @@ struct autofs_sb_info { spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ spinlock_t lookup_lock; + struct list_head active_list; struct list_head expiring_list; }; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 94bfc154d7a..e3e70994ab4 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -24,8 +24,10 @@ static void ino_lnkfree(struct autofs_info *ino) { - kfree(ino->u.symlink); - ino->u.symlink = NULL; + if (ino->u.symlink) { + kfree(ino->u.symlink); + ino->u.symlink = NULL; + } } struct autofs_info *autofs4_init_ino(struct autofs_info *ino, @@ -41,16 +43,18 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, if (ino == NULL) return NULL; - ino->flags = 0; - ino->mode = mode; - ino->inode = NULL; - ino->dentry = NULL; - ino->size = 0; - - INIT_LIST_HEAD(&ino->expiring); + if (!reinit) { + ino->flags = 0; + ino->inode = NULL; + ino->dentry = NULL; + ino->size = 0; + INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->expiring); + atomic_set(&ino->count, 0); + } + ino->mode = mode; ino->last_used = jiffies; - atomic_set(&ino->count, 0); ino->sbi = sbi; @@ -339,6 +343,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->active_list); INIT_LIST_HEAD(&sbi->expiring_list); s->s_blocksize = 1024; s->s_blocksize_bits = 10; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 53dabe8d5b8..dbb70d5a488 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -473,6 +473,8 @@ void autofs4_dentry_release(struct dentry *de) if (sbi) { spin_lock(&sbi->lookup_lock); + if (!list_empty(&inf->active)) + list_del(&inf->active); if (!list_empty(&inf->expiring)) list_del(&inf->expiring); spin_unlock(&sbi->lookup_lock); @@ -497,6 +499,58 @@ static struct dentry_operations autofs4_dentry_operations = { .d_release = autofs4_dentry_release, }; +static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + spin_lock(&dcache_lock); + spin_lock(&sbi->lookup_lock); + head = &sbi->active_list; + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *dentry; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, active); + dentry = ino->dentry; + + spin_lock(&dentry->d_lock); + + /* Already gone? 
*/ + if (atomic_read(&dentry->d_count) == 0) + goto next; + + qstr = &dentry->d_name; + + if (dentry->d_name.hash != hash) + goto next; + if (dentry->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(dentry)) { + dget(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + return dentry; + } +next: + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + + return NULL; +} + static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) { unsigned int len = name->len; @@ -553,7 +607,8 @@ next: static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; - struct dentry *expiring; + struct autofs_info *ino; + struct dentry *expiring, *unhashed; int oz_mode; DPRINTK("name = %.*s", @@ -571,12 +626,12 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name); if (expiring) { - struct autofs_info *ino = autofs4_dentry_ino(expiring); /* * If we are racing with expire the request might not * be quite complete but the directory has been removed * so it must have been successful, so just wait for it. */ + ino = autofs4_dentry_ino(expiring); while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { DPRINTK("wait for incomplete expire %p name=%.*s", expiring, expiring->d_name.len, @@ -591,21 +646,41 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s dput(expiring); } - /* - * Mark the dentry incomplete but don't hash it. We do this - * to serialize our inode creation operations (symlink and - * mkdir) which prevents deadlock during the callback to - * the daemon. Subsequent user space lookups for the same - * dentry are placed on the wait queue while the daemon - * itself is allowed passage unresticted so the create - * operation itself can then hash the dentry. Finally, - * we check for the hashed dentry and return the newly - * hashed dentry. - */ - dentry->d_op = &autofs4_root_dentry_operations; + unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name); + if (unhashed) + dentry = unhashed; + else { + /* + * Mark the dentry incomplete but don't hash it. We do this + * to serialize our inode creation operations (symlink and + * mkdir) which prevents deadlock during the callback to + * the daemon. Subsequent user space lookups for the same + * dentry are placed on the wait queue while the daemon + * itself is allowed passage unresticted so the create + * operation itself can then hash the dentry. Finally, + * we check for the hashed dentry and return the newly + * hashed dentry. + */ + dentry->d_op = &autofs4_root_dentry_operations; + + /* + * And we need to ensure that the same dentry is used for + * all following lookup calls until it is hashed so that + * the dentry flags are persistent throughout the request. 
+ */ + ino = autofs4_init_ino(NULL, sbi, 0555); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + spin_lock(&sbi->lookup_lock); + list_add(&ino->active, &sbi->active_list); + spin_unlock(&sbi->lookup_lock); - dentry->d_fsdata = NULL; - d_instantiate(dentry, NULL); + d_instantiate(dentry, NULL); + } if (!oz_mode) { spin_lock(&dentry->d_lock); @@ -630,12 +705,16 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s if (sigismember (sigset, SIGKILL) || sigismember (sigset, SIGQUIT) || sigismember (sigset, SIGINT)) { + if (unhashed) + dput(unhashed); return ERR_PTR(-ERESTARTNOINTR); } } - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); + if (!oz_mode) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; + spin_unlock(&dentry->d_lock); + } } /* @@ -659,9 +738,15 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s else dentry = ERR_PTR(-ENOENT); + if (unhashed) + dput(unhashed); + return dentry; } + if (unhashed) + return unhashed; + return NULL; } @@ -682,20 +767,30 @@ static int autofs4_dir_symlink(struct inode *dir, return -EACCES; ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); - if (ino == NULL) - return -ENOSPC; + if (!ino) + return -ENOMEM; - ino->size = strlen(symname); - ino->u.symlink = cp = kmalloc(ino->size + 1, GFP_KERNEL); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + spin_unlock(&sbi->lookup_lock); - if (cp == NULL) { - kfree(ino); - return -ENOSPC; + cp = kmalloc(ino->size + 1, GFP_KERNEL); + if (!cp) { + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; } strcpy(cp, symname); inode = autofs4_get_inode(dir->i_sb, ino); + if (!inode) { + kfree(cp); + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } d_add(dentry, inode); if (dir == dir->i_sb->s_root->d_inode) @@ -711,6 +806,8 @@ static int autofs4_dir_symlink(struct inode *dir, atomic_inc(&p_ino->count); ino->inode = inode; + ino->size = strlen(symname); + ino->u.symlink = cp; dir->i_mtime = CURRENT_TIME; return 0; @@ -755,7 +852,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) spin_lock(&dcache_lock); spin_lock(&sbi->lookup_lock); - list_add(&ino->expiring, &sbi->expiring_list); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); @@ -783,7 +881,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } spin_lock(&sbi->lookup_lock); - list_add(&ino->expiring, &sbi->expiring_list); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); @@ -819,10 +918,20 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry, dentry->d_name.len, dentry->d_name.name); ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); - if (ino == NULL) - return -ENOSPC; + if (!ino) + return -ENOMEM; + + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + spin_unlock(&sbi->lookup_lock); inode = autofs4_get_inode(dir->i_sb, ino); + if (!inode) { + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } d_add(dentry, inode); if (dir == dir->i_sb->s_root->d_inode) -- cgit v1.2.3-70-g09d2 From ef581a742874ebc4c28d24b374c78b762144ebdc Mon Sep 17 00:00:00 2001 
From: Ian Kent Date: Wed, 23 Jul 2008 21:30:13 -0700 Subject: autofs4: fix symlink name allocation The length of the symlink name has been moved but it needs to be set before allocating space for it in the dentry info struct. This corrects a mistake in a recent patch. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index dbb70d5a488..324290c6827 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -775,6 +775,7 @@ static int autofs4_dir_symlink(struct inode *dir, list_del_init(&ino->active); spin_unlock(&sbi->lookup_lock); + ino->size = strlen(symname); cp = kmalloc(ino->size + 1, GFP_KERNEL); if (!cp) { if (!dentry->d_fsdata) @@ -806,7 +807,6 @@ static int autofs4_dir_symlink(struct inode *dir, atomic_inc(&p_ino->count); ino->inode = inode; - ino->size = strlen(symname); ino->u.symlink = cp; dir->i_mtime = CURRENT_TIME; -- cgit v1.2.3-70-g09d2 From c432c2586a0811c7d0030d78f0993568bc889a6f Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:14 -0700 Subject: autofs4: don't release directory mutex if called in oz_mode Since we now delay hashing of dentrys until the ->mkdir() call, droping and re-taking the directory mutex within the ->lookup() function when we are being called by user space is not needed. This can lead to a race when other processes are attempting to access the same directory during mount point directory creation. In this case we need to hang onto the mutex to ensure we don't get user processes trying to create a mount request for a newly created dentry after the mount point entry has already been created. This ensures that when we need to check a dentry passed to autofs4_wait(), if it is hashed, it is always the mount point dentry and not a new dentry created by another lookup during directory creation. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 324290c6827..1e901e5ea01 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -686,12 +686,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - } - - if (dentry->d_op && dentry->d_op->d_revalidate) { - mutex_unlock(&dir->i_mutex); - (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); + if (dentry->d_op && dentry->d_op->d_revalidate) { + mutex_unlock(&dir->i_mutex); + (dentry->d_op->d_revalidate)(dentry, nd); + mutex_lock(&dir->i_mutex); + } } /* -- cgit v1.2.3-70-g09d2 From 6d5cb926fa0162b1e62f37c117cc7ce763cfcbb9 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:15 -0700 Subject: autofs4: use lookup intent flags to trigger mounts When an open(2) call is made on an autofs mount point directory that already exists and the O_DIRECTORY flag is not used the needed mount callback to the daemon is not done. This leads to the path walk continuing resulting in a callback to the daemon with an incorrect key. open(2) is called without O_DIRECTORY by the "find" utility but this should be handled properly anyway. This happens because autofs needs to use the lookup flags to decide when to callback to the daemon to perform a mount to prevent mount storms. 
For example, an autofs indirect mount map that has the "browse" option will have the mount point directories are pre-created and the stat(2) call made by a color ls against each directory will cause all these directories to be mounted. It is unfortunate we need to resort to this but mount maps can be quite large. Additionally, if a user manually umounts an autofs indirect mount the directory isn't removed which also leads to this situation. To resolve this autofs needs to use the lookup intent flags to enable it to make this decision. This patch adds this check and triggers a call back if any of the lookup intent flags are set as all these calls warrant a mount attempt be requested. I know that external VFS code which uses the lookup flags is something that the VFS would like to eliminate but I have no choice as I can't see any other way to do this. A VFS dentry or inode operation callback which returns the lookup "type" (requires a definition) would be sufficient. But this change is needed now and I'm not aware of the form that coming VFS changes will take so I'm not willing to propose anything along these lines. If anyone can provide an alternate method I would be happy to use it. [akpm@linux-foundation.org: fix build for concurrent VFS changes] Signed-off-by: Ian Kent Cc: Al Viro Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 1e901e5ea01..87352654ff4 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -31,6 +31,9 @@ static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t fil static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); +#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) +#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE) + const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, @@ -291,7 +294,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) return status; } /* Trigger mount for path component or follow link */ - } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) || + } else if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) || current->link_count) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); @@ -336,7 +339,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) nd->flags); /* If it's our master or we shouldn't trigger a mount we're done */ - lookup_type = nd->flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY); + lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); if (oz_mode || !lookup_type) goto done; -- cgit v1.2.3-70-g09d2 From 70b52a0a5005ce6a0ceec56e97222437a0ba7506 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 23 Jul 2008 21:30:16 -0700 Subject: autofs4: use struct qstr in waitq.c The autofs_wait_queue already contains all of the fields of the struct qstr, so change it into a qstr. This patch, from Jeff Moyer, has been modified a liitle by myself. 
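As a rough illustration of the data-structure change (a minimal userspace sketch with hypothetical names, not the kernel code), carrying the hash, length and name in one qstr-like struct lets the wait-queue lookup compare a single key instead of three separate fields:

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for the kernel's struct qstr. */
struct qstr_like {
	unsigned int hash;
	unsigned int len;
	const char *name;
};

/* Wait entry keyed by one qstr instead of separate hash/len/name fields. */
struct wait_entry {
	struct wait_entry *next;
	struct qstr_like name;
};

/* Toy hash, only to keep the example self-contained. */
static unsigned int toy_hash(const char *s, unsigned int len)
{
	unsigned int h = 0;
	while (len--)
		h = h * 31 + (unsigned char)*s++;
	return h;
}

static struct wait_entry *find_wait(struct wait_entry *head,
				    const struct qstr_like *q)
{
	struct wait_entry *wq;

	for (wq = head; wq; wq = wq->next) {
		if (wq->name.hash == q->hash &&
		    wq->name.len == q->len &&
		    wq->name.name &&
		    !memcmp(wq->name.name, q->name, q->len))
			return wq;
	}
	return NULL;
}

int main(void)
{
	struct wait_entry a = { .next = NULL, .name = { 0, 3, "foo" } };
	struct qstr_like key = { 0, 3, "foo" };

	a.name.hash = toy_hash(a.name.name, a.name.len);
	key.hash = toy_hash(key.name, key.len);
	printf("found: %s\n", find_wait(&a, &key) ? "yes" : "no");
	return 0;
}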
Signed-off-by: Jeff Moyer Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 4 +-- fs/autofs4/waitq.c | 86 +++++++++++++++++++++++++++------------------------ 2 files changed, 46 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 2dce2334737..da8882ff31e 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -75,9 +75,7 @@ struct autofs_wait_queue { struct autofs_wait_queue *next; autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ - unsigned int hash; - unsigned int len; - char *name; + struct qstr name; u32 dev; u64 ino; uid_t uid; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 75e5955c3f6..5208cfb1df4 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -36,8 +36,10 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name); - wq->name = NULL; + if (wq->name.name) { + kfree(wq->name.name); + wq->name.name = NULL; + } wake_up_interruptible(&wq->queue); wq = nwq; } @@ -92,7 +94,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", - wq->wait_queue_token, wq->len, wq->name, type); + wq->wait_queue_token, wq->name.len, wq->name.name, type); memset(&pkt,0,sizeof pkt); /* For security reasons */ @@ -107,9 +109,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*mp); mp->wait_queue_token = wq->wait_queue_token; - mp->len = wq->len; - memcpy(mp->name, wq->name, wq->len); - mp->name[wq->len] = '\0'; + mp->len = wq->name.len; + memcpy(mp->name, wq->name.name, wq->name.len); + mp->name[wq->name.len] = '\0'; break; } case autofs_ptype_expire_multi: @@ -119,9 +121,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*ep); ep->wait_queue_token = wq->wait_queue_token; - ep->len = wq->len; - memcpy(ep->name, wq->name, wq->len); - ep->name[wq->len] = '\0'; + ep->len = wq->name.len; + memcpy(ep->name, wq->name.name, wq->name.len); + ep->name[wq->name.len] = '\0'; break; } /* @@ -138,9 +140,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*packet); packet->wait_queue_token = wq->wait_queue_token; - packet->len = wq->len; - memcpy(packet->name, wq->name, wq->len); - packet->name[wq->len] = '\0'; + packet->len = wq->name.len; + memcpy(packet->name, wq->name.name, wq->name.len); + packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; packet->uid = wq->uid; @@ -191,15 +193,15 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, - char *name, unsigned int hash, unsigned int len) +autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) { struct autofs_wait_queue *wq; for (wq = sbi->queues; wq; wq = wq->next) { - if (wq->hash == hash && - wq->len == len && - wq->name && !memcmp(wq->name, name, len)) + if (wq->name.hash == qstr->hash && + wq->name.len == qstr->len && + wq->name.name && + !memcmp(wq->name.name, qstr->name, qstr->len)) break; } return wq; @@ -210,9 +212,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, { struct autofs_info *ino; struct autofs_wait_queue *wq; + struct qstr qstr; char *name; - unsigned int len = 0; - unsigned int hash = 0; int status, type; /* In catatonic mode, we don't wait for 
nobody */ @@ -225,22 +226,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, /* If this is a direct mount request create a dummy name */ if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) - len = sprintf(name, "%p", dentry); + qstr.len = sprintf(name, "%p", dentry); else { - len = autofs4_getpath(sbi, dentry, &name); - if (!len) { + qstr.len = autofs4_getpath(sbi, dentry, &name); + if (!qstr.len) { kfree(name); return -ENOENT; } } - hash = full_name_hash(name, len); + qstr.name = name; + qstr.hash = full_name_hash(name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(name); + kfree(qstr.name); return -EINTR; } - wq = autofs4_find_wait(sbi, name, hash, len); + wq = autofs4_find_wait(sbi, &qstr); ino = autofs4_dentry_ino(dentry); if (!wq && ino && notify == NFY_NONE) { /* @@ -254,10 +256,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, mutex_unlock(&sbi->wq_mutex); schedule_timeout_interruptible(HZ/10); if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(name); + kfree(qstr.name); return -EINTR; } - wq = autofs4_find_wait(sbi, name, hash, len); + wq = autofs4_find_wait(sbi, &qstr); if (wq) break; } @@ -268,7 +270,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * return status of the wait. */ if (!wq) { - kfree(name); + kfree(qstr.name); mutex_unlock(&sbi->wq_mutex); return 0; } @@ -278,7 +280,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); if (!wq) { - kfree(name); + kfree(qstr.name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } @@ -289,9 +291,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); - wq->hash = hash; - wq->name = name; - wq->len = len; + memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); wq->uid = current->uid; @@ -319,16 +319,18 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, } DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); /* autofs4_notify_daemon() may block */ autofs4_notify_daemon(sbi, wq, type); } else { atomic_inc(&wq->wait_ctr); mutex_unlock(&sbi->wq_mutex); - kfree(name); + kfree(qstr.name); DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); } /* wq->name is NULL if and only if the lock is already released */ @@ -336,11 +338,13 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (sbi->catatonic) { /* We might have slept, so check again for catatonic mode */ wq->status = -ENOENT; - kfree(wq->name); - wq->name = NULL; + if (wq->name.name) { + kfree(wq->name.name); + wq->name.name = NULL; + } } - if (wq->name) { + if (wq->name.name) { /* Block all but "shutdown" signals while waiting */ sigset_t oldset; unsigned long irqflags; @@ -351,7 +355,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - wait_event_interruptible(wq->queue, wq->name == NULL); + wait_event_interruptible(wq->queue, wq->name.name == NULL); spin_lock_irqsave(¤t->sighand->siglock, 
irqflags); current->blocked = oldset; @@ -388,8 +392,8 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok *wql = wq->next; /* Unlink from chain */ mutex_unlock(&sbi->wq_mutex); - kfree(wq->name); - wq->name = NULL; /* Do not wait on this queue */ + kfree(wq->name.name); + wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; -- cgit v1.2.3-70-g09d2 From 5a11d4d0ee1ff284271f7265929d07ea4a1168a6 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:17 -0700 Subject: autofs4: fix waitq locking The autofs4_catatonic_mode() function accesses the wait queue without any locking but can be called at any time. This could lead to a possible double free of the name field of the wait and a double fput of the daemon communication pipe or an fput of a NULL file pointer. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/inode.c | 4 ++-- fs/autofs4/waitq.c | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index e3e70994ab4..7bb3e5ba053 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -163,8 +163,8 @@ void autofs4_kill_sb(struct super_block *sb) if (!sbi) goto out_kill_sb; - if (!sbi->catatonic) - autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */ + /* Free wait queues, close pipe */ + autofs4_catatonic_mode(sbi); /* Clean up and release dangling references */ autofs4_force_release(sbi); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 5208cfb1df4..55aac10cf32 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -28,6 +28,12 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; + mutex_lock(&sbi->wq_mutex); + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } + DPRINTK("entering catatonic mode"); sbi->catatonic = 1; @@ -45,6 +51,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) } fput(sbi->pipe); /* Close the pipe */ sbi->pipe = NULL; + sbi->pipefd = -1; + mutex_unlock(&sbi->wq_mutex); } static int autofs4_write(struct file *file, const void *addr, int bytes) @@ -333,17 +341,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->name.name, notify); } - /* wq->name is NULL if and only if the lock is already released */ - - if (sbi->catatonic) { - /* We might have slept, so check again for catatonic mode */ - wq->status = -ENOENT; - if (wq->name.name) { - kfree(wq->name.name); - wq->name.name = NULL; - } - } - + /* + * wq->name.name is NULL iff the lock is already released + * or the mount has been made catatonic. + */ if (wq->name.name) { /* Block all but "shutdown" signals while waiting */ sigset_t oldset; -- cgit v1.2.3-70-g09d2 From a1362fe92f1bde687b3a9e93d6b8d105d0a84f74 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:19 -0700 Subject: autofs4: fix pending mount race Close a race between a pending mount that is about to finish and a new lookup for the same directory. Process P1 triggers a mount of directory foo. It sets DCACHE_AUTOFS_PENDING in the ->lookup routine, creates a waitq entry for 'foo', and calls out to the daemon to perform the mount. The autofs daemon will then create the directory 'foo', using a new dentry that will be hashed in the dcache. Before the mount completes, another process, P2, tries to walk into the 'foo' directory. The vfs path walking code finds an entry for 'foo' and calls the revalidate method. 
Revalidate finds that the entry is not PENDING (because PENDING was never set on the dentry created by the mkdir), but it does find the directory is empty. Revalidate calls try_to_fill_dentry, which sets the PENDING flag and then calls into the autofs4 wait code to trigger or wait for a mount of 'foo'. The wait code finds the entry for 'foo' and goes to sleep waiting for the completion of the mount. Yet another process, P3, tries to walk into the 'foo' directory. This process again finds a dentry in the dcache for 'foo', and calls into the autofs revalidate code. The revalidate code finds that the PENDING flag is set, and so calls try_to_fill_dentry. a) try_to_fill_dentry sets the PENDING flag redundantly for this dentry, then calls into the autofs4 wait code. b) the autofs4 wait code takes the waitq mutex and searches for an entry for 'foo' Between a and b, P1 is woken up because the mount completed. P1 takes the wait queue mutex, clears the PENDING flag from the dentry, and removes the waitqueue entry for 'foo' from the list. When it releases the waitq mutex, P3 (eventually) acquires it. At this time, it looks for an existing waitq for 'foo', finds none, and so creates a new one and calls out to the daemon to mount the 'foo' directory. Now, the reason that three processes are required to trigger this race is that, because the PENDING flag is not set on the dentry created by mkdir, the window for the race would be way to slim for it to ever occur. Basically, between the testing of d_mountpoint(dentry) and the taking of the waitq mutex, the mount would have to complete and the daemon would have to be woken up, and that in turn would have to wake up P1. This is simply impossible. Add the third process, though, and it becomes slightly more likely. Signed-off-by: Jeff Moyer Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 135 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 55aac10cf32..cd3b2a67169 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -215,19 +215,106 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) return wq; } +/* + * Check if we have a valid request. + * Returns + * 1 if the request should continue. + * In this case we can return an autofs_wait_queue entry if one is + * found or NULL to idicate a new wait needs to be created. + * 0 or a negative errno if the request shouldn't continue. + */ +static int validate_request(struct autofs_wait_queue **wait, + struct autofs_sb_info *sbi, + struct qstr *qstr, + struct dentry*dentry, enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct autofs_info *ino; + + /* Wait in progress, continue; */ + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + + *wait = NULL; + + /* If we don't yet have any info this is a new request */ + ino = autofs4_dentry_ino(dentry); + if (!ino) + return 1; + + /* + * If we've been asked to wait on an existing expire (NFY_NONE) + * but there is no wait in the queue ... + */ + if (notify == NFY_NONE) { + /* + * Either we've betean the pending expire to post it's + * wait or it finished while we waited on the mutex. + * So we need to wait till either, the wait appears + * or the expire finishes. 
+ */ + + while (ino->flags & AUTOFS_INF_EXPIRING) { + mutex_unlock(&sbi->wq_mutex); + schedule_timeout_interruptible(HZ/10); + if (mutex_lock_interruptible(&sbi->wq_mutex)) + return -EINTR; + + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + } + + /* + * Not ideal but the status has already gone. Of the two + * cases where we wait on NFY_NONE neither depend on the + * return status of the wait. + */ + return 0; + } + + /* + * If we've been asked to trigger a mount and the request + * completed while we waited on the mutex ... + */ + if (notify == NFY_MOUNT) { + /* + * If the dentry isn't hashed just go ahead and try the + * mount again with a new wait (not much else we can do). + */ + if (!d_unhashed(dentry)) { + /* + * But if the dentry is hashed, that means that we + * got here through the revalidate path. Thus, we + * need to check if the dentry has been mounted + * while we waited on the wq_mutex. If it has, + * simply return success. + */ + if (d_mountpoint(dentry)) + return 0; + } + } + + return 1; +} + int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, enum autofs_notify notify) { - struct autofs_info *ino; struct autofs_wait_queue *wq; struct qstr qstr; char *name; - int status, type; + int status, ret, type; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) return -ENOENT; - + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return -ENOMEM; @@ -245,43 +332,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, qstr.name = name; qstr.hash = full_name_hash(name, qstr.len); - if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(qstr.name); + if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; - } - - wq = autofs4_find_wait(sbi, &qstr); - ino = autofs4_dentry_ino(dentry); - if (!wq && ino && notify == NFY_NONE) { - /* - * Either we've betean the pending expire to post it's - * wait or it finished while we waited on the mutex. - * So we need to wait till either, the wait appears - * or the expire finishes. - */ - while (ino->flags & AUTOFS_INF_EXPIRING) { - mutex_unlock(&sbi->wq_mutex); - schedule_timeout_interruptible(HZ/10); - if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(qstr.name); - return -EINTR; - } - wq = autofs4_find_wait(sbi, &qstr); - if (wq) - break; - } - - /* - * Not ideal but the status has already gone. Of the two - * cases where we wait on NFY_NONE neither depend on the - * return status of the wait. - */ - if (!wq) { - kfree(qstr.name); + ret = validate_request(&wq, sbi, &qstr, dentry, notify); + if (ret <= 0) { + if (ret == 0) mutex_unlock(&sbi->wq_mutex); - return 0; - } + kfree(qstr.name); + return ret; } if (!wq) { @@ -392,9 +451,9 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok } *wql = wq->next; /* Unlink from chain */ - mutex_unlock(&sbi->wq_mutex); kfree(wq->name.name); wq->name.name = NULL; /* Do not wait on this queue */ + mutex_unlock(&sbi->wq_mutex); wq->status = status; -- cgit v1.2.3-70-g09d2 From f4c7da02615bebcaf89f15a8d055922f515160b8 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:19 -0700 Subject: autofs4: add missing kfree It see that the patch tittled "autofs4 - fix pending mount race" is missing a change that I had recently made. It's missing a kfree for the case mutex_lock_interruptible() fails to aquire the wait queue mutex. 
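The pattern being fixed, shown as a minimal userspace sketch (hypothetical names; a pthread mutex stands in for wq_mutex and malloc/free for kmalloc/kfree): memory allocated before taking the lock must be freed on every early return, including the return taken when acquiring the lock itself fails.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t wq_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Model of the wait path: allocate the name first, then take the lock. */
static int do_wait(const char *path)
{
	char *name;

	name = malloc(strlen(path) + 1);	/* kmalloc() in the kernel */
	if (!name)
		return -ENOMEM;
	strcpy(name, path);

	/* models mutex_lock_interruptible() failing */
	if (pthread_mutex_lock(&wq_mutex) != 0) {
		free(name);			/* the kfree() the patch adds */
		return -EINTR;
	}

	/* ... queue the wait under the lock ... */
	printf("queued wait for %s\n", name);

	pthread_mutex_unlock(&wq_mutex);
	free(name);
	return 0;
}

int main(void)
{
	return do_wait("/mnt/auto/foo") ? 1 : 0;
}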
Signed-off-by: Ian Kent Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index cd3b2a67169..1132cc2a031 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -332,8 +332,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, qstr.name = name; qstr.hash = full_name_hash(name, qstr.len); - if (mutex_lock_interruptible(&sbi->wq_mutex)) + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(qstr.name); return -EINTR; + } ret = validate_request(&wq, sbi, &qstr, dentry, notify); if (ret <= 0) { -- cgit v1.2.3-70-g09d2 From e64be33ccaceaca67c84237dff8805b861398eab Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:20 -0700 Subject: autofs4: check kernel communication pipe is valid for write It is possible for an autofs mount to become catatonic (and for the daemon communication pipe to become NULL) after a wait has been initiallized but before the request has been sent to the daemon. We need to check for this before sending the request packet. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 1132cc2a031..dd2914d7ad7 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -99,6 +99,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, union autofs_packet_union v4_pkt; union autofs_v5_packet_union v5_pkt; } pkt; + struct file *pipe = NULL; size_t pktsz; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", @@ -164,8 +165,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, return; } - if (autofs4_write(sbi->pipe, &pkt, pktsz)) - autofs4_catatonic_mode(sbi); + /* Check if we have become catatonic */ + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + pipe = sbi->pipe; + get_file(pipe); + } + mutex_unlock(&sbi->wq_mutex); + + if (pipe) { + if (autofs4_write(pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); + } } static int autofs4_getpath(struct autofs_sb_info *sbi, -- cgit v1.2.3-70-g09d2 From 296f7bf78bc5c7a4d772aea580ce800d14040d1a Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:21 -0700 Subject: autofs4: fix waitq memory leak If an autofs mount becomes catatonic before autofs4_wait_release() is called the wait queue counter will not be decremented down to zero and the entry will never be freed. There are also races decrementing the wait counter in the wait release function. To deal with this the counter needs to be updated while holding the wait queue mutex and waiters need to be woken up unconditionally when the wait is removed from the queue to ensure we eventually free the wait. 
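A minimal userspace model of that scheme (hypothetical names; a pthread mutex and condition variable stand in for wq_mutex and the wait queue): the counter is only touched while holding the mutex, waiters are woken unconditionally, and whoever drops the last reference frees the entry.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical wait entry with a plain counter protected by wq_mutex. */
struct wait_entry {
	unsigned int wait_ctr;
	int status;
	pthread_cond_t done;
};

static pthread_mutex_t wq_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Drop one reference under the mutex; the last one out frees the entry. */
static void put_wait(struct wait_entry *wq)
{
	pthread_mutex_lock(&wq_mutex);
	if (--wq->wait_ctr == 0) {
		pthread_mutex_unlock(&wq_mutex);
		pthread_cond_destroy(&wq->done);
		free(wq);
		return;
	}
	pthread_mutex_unlock(&wq_mutex);
}

/* Release path: always wake waiters, then drop our own reference. */
static void release_wait(struct wait_entry *wq, int status)
{
	pthread_mutex_lock(&wq_mutex);
	wq->status = status;
	pthread_cond_broadcast(&wq->done);	/* wake unconditionally */
	pthread_mutex_unlock(&wq_mutex);
	put_wait(wq);
}

int main(void)
{
	struct wait_entry *wq = calloc(1, sizeof(*wq));

	if (!wq)
		return 1;
	pthread_cond_init(&wq->done, NULL);
	wq->wait_ctr = 2;	/* one ref for the waiter, one for the releaser */

	release_wait(wq, 0);	/* releaser side */
	put_wait(wq);		/* waiter side, after being woken */
	printf("entry freed exactly once\n");
	return 0;
}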
Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/waitq.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index da8882ff31e..058e1800cae 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -84,7 +84,7 @@ struct autofs_wait_queue { pid_t tgid; /* This is for status reporting upon return */ int status; - atomic_t wait_ctr; + unsigned int wait_ctr; }; #define AUTOFS_SBI_MAGIC 0x6d4a556d diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index dd2914d7ad7..3458dbc8fff 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -46,6 +46,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) kfree(wq->name.name); wq->name.name = NULL; } + wq->wait_ctr--; wake_up_interruptible(&wq->queue); wq = nwq; } @@ -380,7 +381,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->pid = current->pid; wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ - atomic_set(&wq->wait_ctr, 2); + wq->wait_ctr = 2; mutex_unlock(&sbi->wq_mutex); if (sbi->version < 5) { @@ -406,7 +407,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, /* autofs4_notify_daemon() may block */ autofs4_notify_daemon(sbi, wq, type); } else { - atomic_inc(&wq->wait_ctr); + wq->wait_ctr++; mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", @@ -442,8 +443,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, status = wq->status; /* Are we the last process to need status? */ - if (atomic_dec_and_test(&wq->wait_ctr)) + mutex_lock(&sbi->wq_mutex); + if (!--wq->wait_ctr) kfree(wq); + mutex_unlock(&sbi->wq_mutex); return status; } @@ -467,14 +470,11 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok *wql = wq->next; /* Unlink from chain */ kfree(wq->name.name); wq->name.name = NULL; /* Do not wait on this queue */ - mutex_unlock(&sbi->wq_mutex); - wq->status = status; - - if (atomic_dec_and_test(&wq->wait_ctr)) /* Is anyone still waiting for this guy? */ + wake_up_interruptible(&wq->queue); + if (!--wq->wait_ctr) kfree(wq); - else - wake_up_interruptible(&wq->queue); + mutex_unlock(&sbi->wq_mutex); return 0; } -- cgit v1.2.3-70-g09d2 From eb3b176796b0e53fd26fce86847231542eb0d198 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:22 -0700 Subject: autofs4: detect invalid direct mount requests autofs v5 direct and offset mounts within an autofs filesystem are triggered by existing autofs triger mounts so the mount point dentry must be positive. If the mount point dentry is negative then the trigger doesn't exist so we can return fail immediately. 
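The check amounts to an early validity test; sketched below as a small userspace model (the types and flag names are hypothetical stand-ins, not the kernel definitions): a wait request for a direct or offset mount against a dentry with no inode can fail straight away.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical mount-type flags and a toy dentry. */
#define TYPE_INDIRECT	0x01
#define TYPE_DIRECT	0x02
#define TYPE_OFFSET	0x04

struct toy_dentry {
	bool has_inode;		/* a "positive" dentry */
};

/* Direct and offset mount points are always pre-created as triggers,
 * so a request against a negative dentry cannot be a valid key. */
static int validate_wait(const struct toy_dentry *d, unsigned int type)
{
	if (!d->has_inode && (type & (TYPE_DIRECT | TYPE_OFFSET)))
		return -ENOENT;
	return 0;
}

int main(void)
{
	struct toy_dentry negative = { .has_inode = false };
	struct toy_dentry positive = { .has_inode = true };

	printf("negative/direct -> %d\n", validate_wait(&negative, TYPE_DIRECT));
	printf("positive/direct -> %d\n", validate_wait(&positive, TYPE_DIRECT));
	printf("negative/indirect -> %d\n", validate_wait(&negative, TYPE_INDIRECT));
	return 0;
}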
Signed-off-by: Ian Kent Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 3458dbc8fff..bcb6c526546 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -328,6 +328,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (sbi->catatonic) return -ENOENT; + if (!dentry->d_inode && + (sbi->type & (AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET))) + return -ENOENT; + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From c72305b5472522299bb6f45b736080128eb1c822 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:23 -0700 Subject: autofs4: indirect dentry must almost always be positive We have been seeing mount requests comming to the automount daemon for keys of the form "/" which are lookups for invalid map keys. But we can check for this in the kernel module and return a fail immediately, without having to send a request to the daemon. It is possible to recognise these requests are invalid based on whether the request dentry is negative and its relation to the autofs file system root. For example, given the indirect multi-mount map entry: idm1 \ /mm1 :/ /mm2 :/ For a request to mount idm1, IS_ROOT((idm1)->d_parent) will be always be true and the dentry may be negative. But directories idm1/mm1 and idm1/mm2 will always be created as part of the mount request for idm1. So any mount request within idm1 itself must have a positive dentry otherwise the map key is invalid. In version 4 these multi-mount entries are all mounted and umounted as a single request and in version 5 the directories idm1/mm1 and idm1/mm2 are created and an autofs fs mounted on them to act as a mount trigger so the above is also true. This also holds true for the autofs version 4 pseudo direct mount feature. When this feature is used without the "--ghost" option automount(8) will create internal submounts as we go down the map key paths which are essentially normal indirect mounts for which the above holds. If the "--ghost" option is given the directories for map keys are created at daemon startup so valid map entries correspond to postive dentries in the autofs fs. autofs version 5 direct mount maps are similar except that the IS_ROOT check is not needed. This has been addressed in a previous patch tittled "autofs4 - detect invalid direct mount requests". For example, given the direct multi-mount map entry: /test/dm1 \ /mm1 :/ /mm2 :/ An autofs fs is mounted on /test/dm1 as a trigger mount and when a mount is triggered for /test/dm1, the multi-mount offset directories /test/dm1/mm1 and /test/dm1/mm2 are created and an autofs fs is mounted on them to act as mount triggers. So valid direct mount requests must always have a positive dentry if they correspond to a valid map entry. 
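The rule for indirect mounts can be modelled the same way; in this minimal userspace sketch (hypothetical types and names, not the kernel code) only a dentry sitting directly under the autofs root may be negative, anything deeper must already be positive or the map key is invalid.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model: for an indirect autofs mount, offsets such as idm1/mm1 are
 * created by the daemon, so only top-level keys may still be negative. */
struct toy_dentry {
	bool has_inode;			/* positive dentry */
	bool parent_is_fs_root;		/* IS_ROOT(dentry->d_parent) */
};

static int validate_indirect_wait(const struct toy_dentry *d)
{
	if (!d->has_inode && !d->parent_is_fs_root)
		return -ENOENT;	/* invalid key, e.g. "idm1/does-not-exist" */
	return 0;		/* pass the request on to the daemon */
}

int main(void)
{
	struct toy_dentry top_level  = { false, true };	 /* "idm1" itself */
	struct toy_dentry bad_offset = { false, false }; /* "idm1/xyz" */
	struct toy_dentry offset     = { true,  false }; /* "idm1/mm1" */

	printf("top level key -> %d\n", validate_indirect_wait(&top_level));
	printf("bogus offset  -> %d\n", validate_indirect_wait(&bad_offset));
	printf("real offset   -> %d\n", validate_indirect_wait(&offset));
	return 0;
}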
Signed-off-by: Ian Kent Acked-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index bcb6c526546..35216d18d8b 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -328,9 +328,20 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (sbi->catatonic) return -ENOENT; - if (!dentry->d_inode && - (sbi->type & (AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET))) - return -ENOENT; + if (!dentry->d_inode) { + /* + * A wait for a negative dentry is invalid for certain + * cases. A direct or offset mount "always" has its mount + * point directory created and so the request dentry must + * be positive or the map key doesn't exist. The situation + * is very similar for indirect mounts except only dentrys + * in the root of the autofs file system may be negative. + */ + if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)) + return -ENOENT; + else if (!IS_ROOT(dentry->d_parent)) + return -ENOENT; + } name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) -- cgit v1.2.3-70-g09d2 From ff9cd499d6258952385cb2f12e9a3c0908fd5786 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:24 -0700 Subject: autofs4: cleanup redundant readir code The mount triggering functionality of readdir and related functions is no longer used (and is quite broken as well). The unused portions have been removed. Signed-off-by: Ian Kent Reviewed-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 149 ++++++------------------------------------------------ 1 file changed, 16 insertions(+), 133 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 87352654ff4..51c873ca8e8 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -25,8 +25,6 @@ static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); -static int autofs4_dir_close(struct inode *inode, struct file *file); -static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir); static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); @@ -44,9 +42,9 @@ const struct file_operations autofs4_root_operations = { const struct file_operations autofs4_dir_operations = { .open = autofs4_dir_open, - .release = autofs4_dir_close, + .release = dcache_dir_close, .read = generic_read_dir, - .readdir = autofs4_dir_readdir, + .readdir = dcache_readdir, }; const struct inode_operations autofs4_indirect_root_inode_operations = { @@ -98,17 +96,7 @@ static int autofs4_root_readdir(struct file *file, void *dirent, static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct vfsmount *mnt = file->f_path.mnt; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor; - int status; - - status = dcache_dir_open(inode, file); - if (status) - goto out; - - cursor = file->private_data; - cursor->d_fsdata = NULL; DPRINTK("file=%p dentry=%p %.*s", file, dentry, dentry->d_name.len, 
dentry->d_name.name); @@ -116,129 +104,24 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) if (autofs4_oz_mode(sbi)) goto out; - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - dcache_dir_close(inode, file); - status = -EBUSY; - goto out; - } - - status = -ENOENT; - if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) { - struct nameidata nd; - int empty, ret; - - /* In case there are stale directory dentrys from a failed mount */ - spin_lock(&dcache_lock); - empty = list_empty(&dentry->d_subdirs); + /* + * An empty directory in an autofs file system is always a + * mount point. The daemon must have failed to mount this + * during lookup so it doesn't exist. This can happen, for + * example, if user space returns an incorrect status for a + * mount request. Otherwise we're doing a readdir on the + * autofs file system so just let the libfs routines handle + * it. + */ + spin_lock(&dcache_lock); + if (!d_mountpoint(dentry) && __simple_empty(dentry)) { spin_unlock(&dcache_lock); - - if (!empty) - d_invalidate(dentry); - - nd.flags = LOOKUP_DIRECTORY; - ret = (dentry->d_op->d_revalidate)(dentry, &nd); - - if (ret <= 0) { - if (ret < 0) - status = ret; - dcache_dir_close(inode, file); - goto out; - } + return -ENOENT; } + spin_unlock(&dcache_lock); - if (d_mountpoint(dentry)) { - struct file *fp = NULL; - struct path fp_path = { .dentry = dentry, .mnt = mnt }; - - path_get(&fp_path); - - if (!autofs4_follow_mount(&fp_path.mnt, &fp_path.dentry)) { - path_put(&fp_path); - dcache_dir_close(inode, file); - goto out; - } - - fp = dentry_open(fp_path.dentry, fp_path.mnt, file->f_flags); - status = PTR_ERR(fp); - if (IS_ERR(fp)) { - dcache_dir_close(inode, file); - goto out; - } - cursor->d_fsdata = fp; - } - return 0; -out: - return status; -} - -static int autofs4_dir_close(struct inode *inode, struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor = file->private_data; - int status = 0; - - DPRINTK("file=%p dentry=%p %.*s", - file, dentry, dentry->d_name.len, dentry->d_name.name); - - if (autofs4_oz_mode(sbi)) - goto out; - - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - status = -EBUSY; - goto out; - } - - if (d_mountpoint(dentry)) { - struct file *fp = cursor->d_fsdata; - if (!fp) { - status = -ENOENT; - goto out; - } - filp_close(fp, current->files); - } -out: - dcache_dir_close(inode, file); - return status; -} - -static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir) -{ - struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor = file->private_data; - int status; - - DPRINTK("file=%p dentry=%p %.*s", - file, dentry, dentry->d_name.len, dentry->d_name.name); - - if (autofs4_oz_mode(sbi)) - goto out; - - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - return -EBUSY; - } - - if (d_mountpoint(dentry)) { - struct file *fp = cursor->d_fsdata; - - if (!fp) - return -ENOENT; - - if (!fp->f_op || !fp->f_op->readdir) - goto out; - - status = vfs_readdir(fp, filldir, dirent); - file->f_pos = fp->f_pos; - if (status) - autofs4_copy_atime(file, fp); - return status; - } out: - return dcache_readdir(file, dirent, filldir); + return dcache_dir_open(inode, file); } static int try_to_fill_dentry(struct dentry *dentry, int flags) -- cgit v1.2.3-70-g09d2 From 26e81b3142f1ba497d4cd0365c13661684b784ce Mon Sep 17 00:00:00 2001 From: Ian Kent 
Date: Wed, 23 Jul 2008 21:30:25 -0700 Subject: autofs4: fix pending checks There are two cases for which a dentry that has a pending mount request does not wait for completion. One is via autofs4_revalidate() and the other via autofs4_follow_link(). In revalidate, after the mount point directory is created, but before the mount is done, the check in try_to_fill_dentry() can can fail to send the dentry to the wait queue since the dentry is positive and the lookup flags may contain only LOOKUP_FOLLOW. Although we don't trigger a mount for the LOOKUP_FOLLOW flag, if ther's one pending we might as well wait and use the mounted dentry for the lookup. In autofs4_follow_link() the dentry is not checked to see if it is pending so it may fail to call try_to_fill_dentry() and not wait for mount completion. A dentry that is pending must always be sent to the wait queue. Signed-off-by: Ian Kent Reviewed-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 51c873ca8e8..61d1dca1688 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -177,7 +177,8 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) return status; } /* Trigger mount for path component or follow link */ - } else if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) || + } else if (dentry->d_flags & DCACHE_AUTOFS_PENDING || + flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) || current->link_count) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); @@ -223,7 +224,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) /* If it's our master or we shouldn't trigger a mount we're done */ lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); - if (oz_mode || !lookup_type) + if (oz_mode || + !(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) goto done; /* If an expire request is pending wait for it. */ @@ -242,7 +244,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) * don't try to mount it again. */ spin_lock(&dcache_lock); - if (!d_mountpoint(dentry) && __simple_empty(dentry)) { + if (dentry->d_flags & DCACHE_AUTOFS_PENDING || + (!d_mountpoint(dentry) && __simple_empty(dentry))) { spin_unlock(&dcache_lock); status = try_to_fill_dentry(dentry, 0); -- cgit v1.2.3-70-g09d2 From 97e7449a7ad883bf9f516fc970778d75999c7843 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:26 -0700 Subject: autofs4: fix indirect mount pending expire race The selection of a dentry for expiration and the setting of the AUTOFS_INF_EXPIRING flag isn't done atomically which can lead to lookups walking into an expiring mount. What happens is that an expire is initiated by the daemon and a dentry is selected for expire but, since there is no lock held between the selection and setting of the expiring flag, a process may find the flag clear and continue walking into the mount tree at the same time the daemon attempts the expire it. 
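The shape of the fix is to make the busy check and the flag update a single critical section; a minimal userspace sketch of that idea (hypothetical names; a pthread mutex stands in for fs_lock) is below, so a lookup can never observe "not busy and not expiring" in between the two steps.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define INF_EXPIRING 0x01

/* Hypothetical per-mount-point state, protected by fs_lock. */
struct toy_ino {
	unsigned int flags;
	bool busy;		/* stand-in for the mount-busy check */
};

static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;

/* Expire side: selection and flag set happen under one lock. */
static bool select_for_expire(struct toy_ino *ino)
{
	bool selected = false;

	pthread_mutex_lock(&fs_lock);
	if (!ino->busy && !(ino->flags & INF_EXPIRING)) {
		ino->flags |= INF_EXPIRING;
		selected = true;
	}
	pthread_mutex_unlock(&fs_lock);
	return selected;
}

/* Lookup side: checks the flag under the same lock before walking in. */
static bool may_walk_in(struct toy_ino *ino)
{
	bool ok;

	pthread_mutex_lock(&fs_lock);
	ok = !(ino->flags & INF_EXPIRING);
	pthread_mutex_unlock(&fs_lock);
	return ok;
}

int main(void)
{
	struct toy_ino ino = { 0, false };

	printf("selected for expire: %d\n", select_for_expire(&ino));
	printf("lookup may proceed:  %d\n", may_walk_in(&ino));
	return 0;
}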
Signed-off-by: Ian Kent Reviewed-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 10 +++------- fs/autofs4/expire.c | 46 +++++++++++++++++++++++++++++++++++----------- fs/autofs4/root.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 65 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 058e1800cae..5d90ed3b4b4 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -138,18 +138,14 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { static inline int autofs4_ispending(struct dentry *dentry) { struct autofs_info *inf = autofs4_dentry_ino(dentry); - int pending = 0; if (dentry->d_flags & DCACHE_AUTOFS_PENDING) return 1; - if (inf) { - spin_lock(&inf->sbi->fs_lock); - pending = inf->flags & AUTOFS_INF_EXPIRING; - spin_unlock(&inf->sbi->fs_lock); - } + if (inf->flags & AUTOFS_INF_EXPIRING) + return 1; - return pending; + return 0; } static inline void autofs4_copy_atime(struct file *src, struct file *dst) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 894fee54d4d..19f5bea2704 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -292,6 +292,8 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, struct list_head *next; int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino; + unsigned int ino_count; if (!root) return NULL; @@ -316,6 +318,9 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = dget(dentry); spin_unlock(&dcache_lock); + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + /* * Case 1: (i) indirect mount or top level pseudo direct mount * (autofs-4.1). @@ -326,6 +331,11 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, DPRINTK("checking mountpoint %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 2; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; + /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) goto next; @@ -343,23 +353,25 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, /* Case 2: tree mount, expire iff entire tree is not busy */ if (!exp_leaves) { - /* Lock the tree as we must expire as a whole */ - spin_lock(&sbi->fs_lock); - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { - struct autofs_info *inf = autofs4_dentry_ino(dentry); + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; - /* Set this flag early to catch sys_chdir and the like */ - inf->flags |= AUTOFS_INF_EXPIRING; - spin_unlock(&sbi->fs_lock); + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { expired = dentry; goto found; } - spin_unlock(&sbi->fs_lock); /* * Case 3: pseudo direct mount, expire individual leaves * (autofs-4.1). */ } else { + /* Path walk currently on this dentry? 
*/ + ino_count = atomic_read(&ino->count) + 1; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; + expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); if (expired) { dput(dentry); @@ -367,6 +379,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: + spin_unlock(&sbi->fs_lock); dput(dentry); spin_lock(&dcache_lock); next = next->next; @@ -377,6 +390,9 @@ next: found: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); + ino = autofs4_dentry_ino(expired); + ino->flags |= AUTOFS_INF_EXPIRING; + spin_unlock(&sbi->fs_lock); spin_lock(&dcache_lock); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); @@ -390,7 +406,9 @@ int autofs4_expire_run(struct super_block *sb, struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; + struct autofs_info *ino; struct dentry *dentry; + int ret = 0; memset(&pkt,0,sizeof pkt); @@ -406,9 +424,14 @@ int autofs4_expire_run(struct super_block *sb, dput(dentry); if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) - return -EFAULT; + ret = -EFAULT; - return 0; + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + ino->flags &= ~AUTOFS_INF_EXPIRING; + spin_unlock(&sbi->fs_lock); + + return ret; } /* Call repeatedly until it returns -EAGAIN, meaning there's nothing @@ -433,9 +456,10 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, /* This is synchronous because it makes the daemon a little easier */ - ino->flags |= AUTOFS_INF_EXPIRING; ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_EXPIRING; + spin_unlock(&sbi->fs_lock); dput(dentry); } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 61d1dca1688..1c2579de1f2 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -133,7 +133,10 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) /* Block on any pending expiry here; invalidate the dentry when expiration is done to trigger mount request with a new dentry */ - if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + DPRINTK("waiting for expire %p name=%.*s", dentry, dentry->d_name.len, dentry->d_name.name); @@ -149,8 +152,11 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) status = d_invalidate(dentry); if (status != -EBUSY) return -EAGAIN; - } + goto cont; + } + spin_unlock(&sbi->fs_lock); +cont: DPRINTK("dentry=%p %.*s ino=%p", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -229,15 +235,21 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) goto done; /* If an expire request is pending wait for it. */ - if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + DPRINTK("waiting for active request %p name=%.*s", dentry, dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_NONE); DPRINTK("request done status=%d", status); - } + goto cont; + } + spin_unlock(&sbi->fs_lock); +cont: /* * If the dentry contains directories then it is an * autofs multi-mount with no root mount offset. 
So @@ -292,8 +304,11 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) int status = 1; /* Pending dentry */ + spin_lock(&sbi->fs_lock); if (autofs4_ispending(dentry)) { /* The daemon never causes a mount to trigger */ + spin_unlock(&sbi->fs_lock); + if (oz_mode) return 1; @@ -316,6 +331,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) return status; } + spin_unlock(&sbi->fs_lock); /* Negative dentry.. invalidate if "old" */ if (dentry->d_inode == NULL) @@ -329,6 +345,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); + /* The daemon never causes a mount to trigger */ if (oz_mode) return 1; @@ -521,13 +538,18 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s * so it must have been successful, so just wait for it. */ ino = autofs4_dentry_ino(expiring); - while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); DPRINTK("wait for incomplete expire %p name=%.*s", expiring, expiring->d_name.len, expiring->d_name.name); autofs4_wait(sbi, expiring, NFY_NONE); DPRINTK("request completed"); + goto cont; } + spin_unlock(&sbi->fs_lock); +cont: spin_lock(&sbi->lookup_lock); if (!list_empty(&ino->expiring)) list_del_init(&ino->expiring); -- cgit v1.2.3-70-g09d2 From 6e60a9ab5f5d314735467752f623072f5b75157a Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:27 -0700 Subject: autofs4: fix direct mount pending expire race For direct and offset type mounts that are covered by another mount we cannot check the AUTOFS_INF_EXPIRING flag during a path walk which leads to lookups walking into an expiring mount while it is being expired. For example, for the direct multi-mount map entry with a couple of offsets: /race/mm1 / :/ /om1 :/ /om2 :/ an autofs trigger mount is mounted on /race/mm1 and when accessed it is over mounted and trigger mounts made for /race/mm1/om1 and /race/mm1/om2. So it isn't possible for path walks to see the expiring flag at all and they happily walk into the file system while it is expiring. When expiring these mounts follow_down() must stop at the autofs mount and all processes must block in the ->follow_link() method (except the daemon) until the expire is complete. This is done by decrementing the d_mounted field of the autofs trigger mount root dentry until the expire is completed. In ->follow_link() all processes wait on the expire and the mount following is completed for the daemon until the expire is complete. 
Signed-off-by: Ian Kent Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 3 +++ fs/autofs4/expire.c | 16 +++++++++--- fs/autofs4/root.c | 72 +++++++++++++++++++++++++++++++++++---------------- 3 files changed, 65 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 5d90ed3b4b4..4b40cbc71e9 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -52,6 +52,8 @@ struct autofs_info { int flags; + struct completion expire_complete; + struct list_head active; struct list_head expiring; @@ -69,6 +71,7 @@ struct autofs_info { }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ struct autofs_wait_queue { wait_queue_head_t queue; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 19f5bea2704..705b9f057fb 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -259,13 +259,15 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - /* Lock the tree as we must expire as a whole */ spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); - - /* Set this flag early to catch sys_chdir and the like */ + if (d_mountpoint(root)) { + ino->flags |= AUTOFS_INF_MOUNTPOINT; + root->d_mounted--; + } ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } @@ -392,6 +394,7 @@ found: expired, (int)expired->d_name.len, expired->d_name.name); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); spin_lock(&dcache_lock); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); @@ -429,6 +432,7 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return ret; @@ -457,8 +461,14 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, /* This is synchronous because it makes the daemon a little easier */ ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_MOUNTPOINT) { + sb->s_root->d_mounted++; + ino->flags &= ~AUTOFS_INF_MOUNTPOINT; + } ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 1c2579de1f2..adbd8559e87 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -141,6 +141,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) dentry, dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); DPRINTK("expire done status=%d", status); @@ -227,14 +228,32 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, nd->flags); - - /* If it's our master or we shouldn't trigger a mount we're done */ - lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); - if (oz_mode || - !(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) + /* + * For an expire of a covered direct or offset mount we need + * to beeak out of 
follow_down() at the autofs mount trigger + * (d_mounted--), so we can see the expiring flag, and manage + * the blocking and following here until the expire is completed. + */ + if (oz_mode) { + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + /* Follow down to our covering mount. */ + if (!follow_down(&nd->path.mnt, &nd->path.dentry)) + goto done; + /* + * We shouldn't need to do this but we have no way + * of knowing what may have been done so try a follow + * just in case. + */ + autofs4_follow_mount(&nd->path.mnt, &nd->path.dentry); + goto done; + } + spin_unlock(&sbi->fs_lock); goto done; + } - /* If an expire request is pending wait for it. */ + /* If an expire request is pending everyone must wait. */ spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); @@ -243,6 +262,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) dentry, dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); DPRINTK("request done status=%d", status); @@ -250,10 +270,15 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) } spin_unlock(&sbi->fs_lock); cont: + /* We trigger a mount for almost all flags */ + lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); + if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) + goto done; + /* - * If the dentry contains directories then it is an - * autofs multi-mount with no root mount offset. So - * don't try to mount it again. + * If the dentry contains directories then it is an autofs + * multi-mount with no root mount offset. So don't try to + * mount it again. */ spin_lock(&dcache_lock); if (dentry->d_flags & DCACHE_AUTOFS_PENDING || @@ -264,22 +289,22 @@ cont: if (status) goto out_error; - /* - * The mount succeeded but if there is no root mount - * it must be an autofs multi-mount with no root offset - * so we don't need to follow the mount. - */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path.mnt, - &nd->path.dentry)) { - status = -ENOENT; - goto out_error; - } - } - - goto done; + goto follow; } spin_unlock(&dcache_lock); +follow: + /* + * If there is no root mount it must be an autofs + * multi-mount with no root offset so we don't need + * to follow it. + */ + if (d_mountpoint(dentry)) { + if (!autofs4_follow_mount(&nd->path.mnt, + &nd->path.dentry)) { + status = -ENOENT; + goto out_error; + } + } done: return NULL; @@ -545,6 +570,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s expiring, expiring->d_name.len, expiring->d_name.name); autofs4_wait(sbi, expiring, NFY_NONE); + wait_for_completion(&ino->expire_complete); DPRINTK("request completed"); goto cont; } -- cgit v1.2.3-70-g09d2 From ec6e8c7d3f9073336ec7b2eed3fcda6f922087c3 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:28 -0700 Subject: autofs4: fix direct mount pending expire race - correction Appologies, somehow I seem to have sent an out dated version of this patch. Here is an additional patch that brings the patch up to date. 
Signed-off-by: Ian Kent Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index adbd8559e87..e062ee5a3ed 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -241,13 +241,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) /* Follow down to our covering mount. */ if (!follow_down(&nd->path.mnt, &nd->path.dentry)) goto done; - /* - * We shouldn't need to do this but we have no way - * of knowing what may have been done so try a follow - * just in case. - */ - autofs4_follow_mount(&nd->path.mnt, &nd->path.dentry); - goto done; + goto follow; } spin_unlock(&sbi->fs_lock); goto done; @@ -273,7 +267,7 @@ cont: /* We trigger a mount for almost all flags */ lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) - goto done; + goto follow; /* * If the dentry contains directories then it is an autofs -- cgit v1.2.3-70-g09d2 From 06a3598552dc3b2b30eb18bd53bbac2a901489d7 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:28 -0700 Subject: autofs4: reorganize expire pending wait function calls This patch re-orgnirzes the checking for and waiting on active expires and elininates redundant checks. Signed-off-by: Ian Kent Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 1 + fs/autofs4/expire.c | 29 ++++++++++++++++++++ fs/autofs4/root.c | 75 +++++++-------------------------------------------- 3 files changed, 40 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 4b40cbc71e9..69a2f5c9231 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -163,6 +163,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); +int autofs4_expire_wait(struct dentry *dentry); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 705b9f057fb..cdabb796ff0 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -402,6 +402,35 @@ found: return expired; } +int autofs4_expire_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + /* Block on any pending expire */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + + DPRINTK("waiting for expire %p name=%.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); + + DPRINTK("expire done status=%d", status); + + if (d_unhashed(dentry)) + return -EAGAIN; + + return status; + } + spin_unlock(&sbi->fs_lock); + + return 0; +} + /* Perform an expiry operation */ int autofs4_expire_run(struct super_block *sb, struct vfsmount *mnt, diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e062ee5a3ed..ae22bde0bbd 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -130,34 +130,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - /* Block on any pending expiry here; invalidate the dentry - when expiration is done to trigger mount request with a 
new - dentry */ - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { - spin_unlock(&sbi->fs_lock); - - DPRINTK("waiting for expire %p name=%.*s", - dentry, dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_NONE); - wait_for_completion(&ino->expire_complete); - - DPRINTK("expire done status=%d", status); - - /* - * If the directory still exists the mount request must - * continue otherwise it can't be followed at the right - * time during the walk. - */ - status = d_invalidate(dentry); - if (status != -EBUSY) - return -EAGAIN; - - goto cont; - } - spin_unlock(&sbi->fs_lock); -cont: DPRINTK("dentry=%p %.*s ino=%p", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -248,22 +220,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) } /* If an expire request is pending everyone must wait. */ - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { - spin_unlock(&sbi->fs_lock); - - DPRINTK("waiting for active request %p name=%.*s", - dentry, dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_NONE); - wait_for_completion(&ino->expire_complete); + autofs4_expire_wait(dentry); - DPRINTK("request done status=%d", status); - - goto cont; - } - spin_unlock(&sbi->fs_lock); -cont: /* We trigger a mount for almost all flags */ lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) @@ -331,6 +289,14 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) if (oz_mode) return 1; + /* + * If the directory has gone away due to an expire + * we have been called as ->d_revalidate() and so + * we need to return false and proceed to ->lookup(). + */ + if (autofs4_expire_wait(dentry) == -EAGAIN) + return 0; + /* * A zero status is success otherwise we have a * negative error code. @@ -339,15 +305,6 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) if (status == 0) return 1; - /* - * A status of EAGAIN here means that the dentry has gone - * away while waiting for an expire to complete. If we are - * racing with expire lookup will wait for it so this must - * be a revalidate and we need to send it to lookup. - */ - if (status == -EAGAIN) - return 0; - return status; } spin_unlock(&sbi->fs_lock); @@ -557,19 +514,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s * so it must have been successful, so just wait for it. */ ino = autofs4_dentry_ino(expiring); - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { - spin_unlock(&sbi->fs_lock); - DPRINTK("wait for incomplete expire %p name=%.*s", - expiring, expiring->d_name.len, - expiring->d_name.name); - autofs4_wait(sbi, expiring, NFY_NONE); - wait_for_completion(&ino->expire_complete); - DPRINTK("request completed"); - goto cont; - } - spin_unlock(&sbi->fs_lock); -cont: + autofs4_expire_wait(expiring); spin_lock(&sbi->lookup_lock); if (!list_empty(&ino->expiring)) list_del_init(&ino->expiring); -- cgit v1.2.3-70-g09d2 From aa55ddf340c9fa3f303ee16bbf35887e42c50304 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 23 Jul 2008 21:30:29 -0700 Subject: autofs4: remove unused ioctls The ioctls AUTOFS_IOC_TOGGLEREGHOST and AUTOFS_IOC_ASKREGHOST were added several years ago but what they were intended for has never been implemented (as far as I'm aware noone uses them) so remove them. 
Signed-off-by: Ian Kent Reviewed-by: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 68 +----------------------------------------------- fs/compat_ioctl.c | 2 -- include/linux/auto_fs4.h | 2 -- 3 files changed, 1 insertion(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index ae22bde0bbd..bcfb2dc0a61 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -25,7 +25,6 @@ static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); -static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); @@ -36,7 +35,7 @@ const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .readdir = autofs4_root_readdir, + .readdir = dcache_readdir, .ioctl = autofs4_root_ioctl, }; @@ -71,28 +70,6 @@ const struct inode_operations autofs4_dir_inode_operations = { .rmdir = autofs4_dir_rmdir, }; -static int autofs4_root_readdir(struct file *file, void *dirent, - filldir_t filldir) -{ - struct autofs_sb_info *sbi = autofs4_sbi(file->f_path.dentry->d_sb); - int oz_mode = autofs4_oz_mode(sbi); - - DPRINTK("called, filp->f_pos = %lld", file->f_pos); - - /* - * Don't set reghost flag if: - * 1) f_pos is larger than zero -- we've already been here. - * 2) we haven't even enabled reghosting in the 1st place. - * 3) this is the daemon doing a readdir - */ - if (oz_mode && file->f_pos == 0 && sbi->reghost_enabled) - sbi->needs_reghost = 1; - - DPRINTK("needs_reghost = %d", sbi->needs_reghost); - - return dcache_readdir(file, dirent, filldir); -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; @@ -858,44 +835,6 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user return put_user(sbi->sub_version, p); } -/* - * Tells the daemon whether we need to reghost or not. Also, clears - * the reghost_needed flag. - */ -static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p) -{ - int status; - - DPRINTK("returning %d", sbi->needs_reghost); - - status = put_user(sbi->needs_reghost, p); - if (status) - return status; - - sbi->needs_reghost = 0; - return 0; -} - -/* - * Enable / Disable reghosting ioctl() operation - */ -static inline int autofs4_toggle_reghost(struct autofs_sb_info *sbi, int __user *p) -{ - int status; - int val; - - status = get_user(val, p); - - DPRINTK("reghost = %d", val); - - if (status) - return status; - - /* turn on/off reghosting, with the val */ - sbi->reghost_enabled = val; - return 0; -} - /* * Tells the daemon whether it can umount the autofs mount. 
*/ @@ -960,11 +899,6 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, case AUTOFS_IOC_SETTIMEOUT: return autofs4_get_set_timeout(sbi, p); - case AUTOFS_IOC_TOGGLEREGHOST: - return autofs4_toggle_reghost(sbi, p); - case AUTOFS_IOC_ASKREGHOST: - return autofs4_ask_reghost(sbi, p); - case AUTOFS_IOC_ASKUMOUNT: return autofs4_ask_umount(filp->f_path.mnt, p); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 7b3a03c7c6a..18e2c548161 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -2297,8 +2297,6 @@ COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI) COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER) -COMPATIBLE_IOCTL(AUTOFS_IOC_ASKREGHOST) -COMPATIBLE_IOCTL(AUTOFS_IOC_TOGGLEREGHOST) COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT) /* Raw devices */ COMPATIBLE_IOCTL(RAW_SETBIND) diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index 31a29541b50..b785c6f8644 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -98,8 +98,6 @@ union autofs_v5_packet_union { #define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI #define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI #define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int) -#define AUTOFS_IOC_ASKREGHOST _IOR(0x93,0x68,int) -#define AUTOFS_IOC_TOGGLEREGHOST _IOR(0x93,0x69,int) #define AUTOFS_IOC_ASKUMOUNT _IOR(0x93,0x70,int) -- cgit v1.2.3-70-g09d2 From f9247273cb69ba101877e946d2d83044409cc8c5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 24 Jul 2008 17:22:13 +0100 Subject: UFS: add const to parser token table This patch adds a "const" to the parser token table. I've done an allmodconfig build to see if this produces any warnings/failures and the patch includes a fix for the only warning that was produced. Signed-off-by: Steven Whitehouse Acked-by: Alexander Viro Acked-by: Evgeniy Dushistov Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 2 +- include/linux/parser.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 85b22b5977f..506f724055c 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1232,7 +1232,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; - struct match_token *tp = tokens; + const struct match_token *tp = tokens; while (tp->token != Opt_onerror_panic && tp->token != mval) ++tp; diff --git a/include/linux/parser.h b/include/linux/parser.h index 7dcd0507575..cc554ca8bc7 100644 --- a/include/linux/parser.h +++ b/include/linux/parser.h @@ -14,7 +14,7 @@ struct match_token { const char *pattern; }; -typedef struct match_token match_table_t[]; +typedef const struct match_token match_table_t[]; /* Maximum number of arguments that match_token will find in a pattern */ enum {MAX_OPT_ARGS = 3}; -- cgit v1.2.3-70-g09d2 From e2d2867ff8700d7431c68c089ff5f5ed7f2d5b40 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 24 Jul 2008 20:43:34 +0000 Subject: When verifying the decoded header before decoding the object identifier (expecting a SPNEGO pseudo-mechanism oid), the test to verify it is a primitive encoding is compared against the asn1 class. Primitive is not a class. This brings check in line with similar check for krb/ntlmssp oid. 
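The fix is easier to see with the BER identifier octet laid out: bits 7-6 carry the class, bit 5 says whether the encoding is primitive or constructed, and bits 4-0 carry the tag. asn1_header_decode() hands those back as the separate cls, con and tag values, so an "is this primitive?" test must look at con; the old test only appeared to work because ASN1_PRI and ASN1_UNI are both 0. A small standalone sketch of the decomposition -- the constants mirror the ones used by the cifs decoder, but the single-octet helper is a deliberate simplification (the real decoder also handles long-form tags and lengths):

  #include <stdio.h>

  /* Same values the cifs ASN.1 code uses. */
  #define ASN1_UNI 0   /* universal class       */
  #define ASN1_PRI 0   /* primitive encoding    */
  #define ASN1_CON 1   /* constructed encoding  */
  #define ASN1_OJI 6   /* OBJECT IDENTIFIER tag */

  /* Simplified split of one identifier octet into (class, constructed,
   * tag); illustrative only, not asn1_header_decode(). */
  static void split_id_octet(unsigned char id, unsigned int *cls,
                             unsigned int *con, unsigned int *tag)
  {
      *cls = (id >> 6) & 0x3;
      *con = (id >> 5) & 0x1;
      *tag = id & 0x1f;
  }

  int main(void)
  {
      unsigned int cls, con, tag;

      /* 0x06: universal, primitive, OBJECT IDENTIFIER -- the header in
       * front of the SPNEGO pseudo-mechanism OID. */
      split_id_octet(0x06, &cls, &con, &tag);

      printf("cls=%u con=%u tag=%u\n", cls, con, tag);
      printf("primitive OID? %s\n",
             (tag == ASN1_OJI && con == ASN1_PRI) ? "yes" : "no");
      /* Comparing cls against ASN1_PRI, as before the fix, really
       * tested for the universal class, since both constants are 0. */
      return 0;
  }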
Signed-off-by: Chris Wright Cc: Steven French Signed-off-by: Andrew Morton Signed-off-by: Steve French --- fs/cifs/asn1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 42765465701..6bb440b257b 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -494,7 +494,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, /* remember to free obj->oid */ rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); if (rc) { - if ((tag == ASN1_OJI) && (cls == ASN1_PRI)) { + if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); if (rc) { rc = compare_oid(oid, oidlen, -- cgit v1.2.3-70-g09d2 From fb2e405fc1fc8b20d9c78eaa1c7fd5a297efde43 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 02:55:49 +0300 Subject: fix fs/nfs/nfsroot.c compilation This fixes the following compile error caused by commit f9247273cb69ba101877e946d2d83044409cc8c5 ("UFS: add const to parser token table"): CC fs/nfs/nfsroot.o /home/bunk/linux/kernel-2.6/git/linux-2.6/fs/nfs/nfsroot.c:130: error: tokens causes a section type conflict make[3]: *** [fs/nfs/nfsroot.o] Error 1 Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- fs/nfs/nfsroot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 46763d1cd39..8478fc25dae 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -127,7 +127,7 @@ enum { Opt_err }; -static match_table_t __initdata tokens = { +static match_table_t __initconst tokens = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, -- cgit v1.2.3-70-g09d2 From 483fad1c3fa1060d7e6710e84a065ad514571739 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 22 Jul 2008 04:48:46 +1000 Subject: ELF loader support for auxvec base platform string Some IBM POWER-based platforms have the ability to run in a mode which mostly appears to the OS as a different processor from the actual hardware. For example, a Power6 system may appear to be a Power5+, which makes the AT_PLATFORM value "power5+". This means that programs are restricted to the ISA supported by Power5+; Power6-specific instructions are treated as illegal. However, some applications (virtual machines, optimized libraries) can benefit from knowledge of the underlying CPU model. A new aux vector entry, AT_BASE_PLATFORM, will denote the actual hardware. For example, on a Power6 system in Power5+ compatibility mode, AT_PLATFORM will be "power5+" and AT_BASE_PLATFORM will be "power6". The idea is that AT_PLATFORM indicates the instruction set supported, while AT_BASE_PLATFORM indicates the underlying microarchitecture. If the architecture has defined ELF_BASE_PLATFORM, copy that value to the user stack in the same manner as ELF_PLATFORM. Signed-off-by: Nathan Lynch Acked-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- fs/binfmt_elf.c | 28 ++++++++++++++++++++++++++++ include/linux/auxvec.h | 6 +++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 639d2d8b571..742c8f53048 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss) #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) #endif +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. 
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t __user *envp; elf_addr_t __user *sp; elf_addr_t __user *u_platform; + elf_addr_t __user *u_base_platform; const char *k_platform = ELF_PLATFORM; + const char *k_base_platform = ELF_BASE_PLATFORM; int items; elf_addr_t *elf_info; int ei_index = 0; @@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; } + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + u_base_platform = NULL; + if (k_base_platform) { + size_t len = strlen(k_base_platform) + 1; + + u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_base_platform, k_base_platform, len)) + return -EFAULT; + } + /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)current->mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ @@ -209,6 +233,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform); } + if (k_base_platform) { + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + } if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); } diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index 0da17d14fd1..d7afa9dd663 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -26,9 +26,13 @@ #define AT_SECURE 23 /* secure mode boolean */ +#define AT_BASE_PLATFORM 24 /* string identifying real platform, may + * differ from AT_PLATFORM. */ + #define AT_EXECFN 31 /* filename of program */ + #ifdef __KERNEL__ -#define AT_VECTOR_SIZE_BASE 17 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 18 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif -- cgit v1.2.3-70-g09d2 From b7bbf8fa6ba329b3552b75a0716f5fbc6f839499 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 25 Jul 2008 01:45:25 -0700 Subject: fs: ldm.[ch] use get_unaligned_* helpers Replace the private BE16/BE32/BE64 macros with direct calls to get_unaligned_be16/32/64. Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/ldm.c | 70 ++++++++++++++++++++++++++--------------------------- fs/partitions/ldm.h | 5 ---- 2 files changed, 35 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 0fdda2e8a4c..8652fb99e96 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -133,17 +133,17 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) bool is_vista = false; BUG_ON(!data || !ph); - if (MAGIC_PRIVHEAD != BE64(data)) { + if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { ldm_error("Cannot find PRIVHEAD structure. LDM database is" " corrupt. 
Aborting."); return false; } - ph->ver_major = BE16(data + 0x000C); - ph->ver_minor = BE16(data + 0x000E); - ph->logical_disk_start = BE64(data + 0x011B); - ph->logical_disk_size = BE64(data + 0x0123); - ph->config_start = BE64(data + 0x012B); - ph->config_size = BE64(data + 0x0133); + ph->ver_major = get_unaligned_be16(data + 0x000C); + ph->ver_minor = get_unaligned_be16(data + 0x000E); + ph->logical_disk_start = get_unaligned_be64(data + 0x011B); + ph->logical_disk_size = get_unaligned_be64(data + 0x0123); + ph->config_start = get_unaligned_be64(data + 0x012B); + ph->config_size = get_unaligned_be64(data + 0x0133); /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ if (ph->ver_major == 2 && ph->ver_minor == 12) is_vista = true; @@ -191,14 +191,14 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) { BUG_ON (!data || !toc); - if (MAGIC_TOCBLOCK != BE64 (data)) { + if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); return false; } strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; - toc->bitmap1_start = BE64 (data + 0x2E); - toc->bitmap1_size = BE64 (data + 0x36); + toc->bitmap1_start = get_unaligned_be64(data + 0x2E); + toc->bitmap1_size = get_unaligned_be64(data + 0x36); if (strncmp (toc->bitmap1_name, TOC_BITMAP1, sizeof (toc->bitmap1_name)) != 0) { @@ -208,8 +208,8 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) } strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; - toc->bitmap2_start = BE64 (data + 0x50); - toc->bitmap2_size = BE64 (data + 0x58); + toc->bitmap2_start = get_unaligned_be64(data + 0x50); + toc->bitmap2_size = get_unaligned_be64(data + 0x58); if (strncmp (toc->bitmap2_name, TOC_BITMAP2, sizeof (toc->bitmap2_name)) != 0) { ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", @@ -237,22 +237,22 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) { BUG_ON (!data || !vm); - if (MAGIC_VMDB != BE32 (data)) { + if (MAGIC_VMDB != get_unaligned_be32(data)) { ldm_crit ("Cannot find the VMDB, database may be corrupt."); return false; } - vm->ver_major = BE16 (data + 0x12); - vm->ver_minor = BE16 (data + 0x14); + vm->ver_major = get_unaligned_be16(data + 0x12); + vm->ver_minor = get_unaligned_be16(data + 0x14); if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { ldm_error ("Expected VMDB version %d.%d, got %d.%d. " "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); return false; } - vm->vblk_size = BE32 (data + 0x08); - vm->vblk_offset = BE32 (data + 0x0C); - vm->last_vblk_seq = BE32 (data + 0x04); + vm->vblk_size = get_unaligned_be32(data + 0x08); + vm->vblk_offset = get_unaligned_be32(data + 0x0C); + vm->last_vblk_seq = get_unaligned_be32(data + 0x04); ldm_debug ("Parsed VMDB successfully."); return true; @@ -507,7 +507,7 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, goto out; /* Already logged */ /* Are there uncommitted transactions? */ - if (BE16(data + 0x10) != 0x01) { + if (get_unaligned_be16(data + 0x10) != 0x01) { ldm_crit ("Database is not in a consistent state. 
Aborting."); goto out; } @@ -802,7 +802,7 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_CMP3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; comp = &vb->vblk.comp; @@ -851,7 +851,7 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DGR3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; @@ -895,7 +895,7 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DGR4; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; @@ -931,7 +931,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DSK3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; @@ -968,7 +968,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DSK4; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; @@ -1034,14 +1034,14 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) return false; } len += VBLK_SIZE_PRT3; - if (len > BE32(buffer + 0x14)) { + if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, - BE32(buffer + 0x14)); + get_unaligned_be32(buffer + 0x14)); return false; } part = &vb->vblk.part; - part->start = BE64(buffer + 0x24 + r_name); - part->volume_offset = BE64(buffer + 0x2C + r_name); + part->start = get_unaligned_be64(buffer + 0x24 + r_name); + part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); part->size = ldm_get_vnum(buffer + 0x34 + r_name); part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); @@ -1139,9 +1139,9 @@ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) return false; } len += VBLK_SIZE_VOL5; - if (len > BE32(buffer + 0x14)) { + if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, - BE32(buffer + 0x14)); + get_unaligned_be32(buffer + 0x14)); return false; } volu = &vb->vblk.volu; @@ -1294,9 +1294,9 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) BUG_ON (!data || !frags); - group = BE32 (data + 0x08); - rec = BE16 (data + 0x0C); - num = BE16 (data + 0x0E); + group = get_unaligned_be32(data + 0x08); + rec = get_unaligned_be16(data + 0x0C); + num = get_unaligned_be16(data + 0x0E); if ((num < 1) || (num > 4)) { ldm_error ("A VBLK claims to have %d parts.", num); return false; @@ -1425,12 +1425,12 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, } for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ - if (MAGIC_VBLK != BE32 (data)) { + if (MAGIC_VBLK != get_unaligned_be32(data)) { ldm_error ("Expected to find a VBLK."); goto out; } - recs = BE16 (data + 0x0E); /* Number of records */ + recs = get_unaligned_be16(data + 0x0E); /* Number of records */ if (recs == 1) { if (!ldm_ldmdb_add (data, size, ldb)) goto out; /* Already logged */ diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 80f63b5fdd9..30e08e809c1 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -98,11 +98,6 @@ struct parsed_partitions; #define 
TOC_BITMAP1 "config" /* Names of the two defined */ #define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ -/* Most numbers we deal with are big-endian and won't be aligned. */ -#define BE16(x) ((u16)be16_to_cpu(get_unaligned((__be16*)(x)))) -#define BE32(x) ((u32)be32_to_cpu(get_unaligned((__be32*)(x)))) -#define BE64(x) ((u64)be64_to_cpu(get_unaligned((__be64*)(x)))) - /* Borrowed from msdos.c */ #define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) -- cgit v1.2.3-70-g09d2 From ba92a43dbaee339cf5915ef766d3d3ffbaaf103c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 25 Jul 2008 01:45:43 -0700 Subject: exec: remove some includes fs/exec.c used to need mman.h pagemap.h swap.h and rmap.h when it did mm-ish stuff in install_arg_page(); but no need for them after 2.6.22. [akpm@linux-foundation.org: unbreak arm] Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 190ed1f9277..e41aef0fb35 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -25,19 +25,18 @@ #include #include #include -#include +#include #include #include #include +#include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -47,7 +46,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From fb523f32275344282f20ef3352cbf03e599241e6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:46:14 -0700 Subject: minix: remove !NO_TRUNCATE code This patch removes the !NO_TRUNCATE code that anyway required a manual editing of the code for being used. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 3 --- fs/minix/minix.h | 6 ------ fs/minix/namei.c | 24 ------------------------ 3 files changed, 33 deletions(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 84f6242ba6f..523d7371341 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -256,9 +256,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) if (!s->s_root) goto out_iput; - if (!NO_TRUNCATE) - s->s_root->d_op = &minix_dentry_operations; - if (!(s->s_flags & MS_RDONLY)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ ms->s_state &= ~MINIX_VALID_FS; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 326edfe9610..e6a0b193bea 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -2,11 +2,6 @@ #include #include -/* - * change the define below to 0 if you want names > info->s_namelen chars to be - * truncated. Else they will be disallowed (ENAMETOOLONG). 
- */ -#define NO_TRUNCATE 1 #define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version #define MINIX_V1 0x0001 /* original minix fs */ #define MINIX_V2 0x0002 /* minix V2 fs */ @@ -83,7 +78,6 @@ extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; extern const struct file_operations minix_file_operations; extern const struct file_operations minix_dir_operations; -extern struct dentry_operations minix_dentry_operations; static inline struct minix_sb_info *minix_sb(struct super_block *sb) { diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 102241bc9c7..32b131cd612 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -18,30 +18,6 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int minix_hash(struct dentry *dentry, struct qstr *qstr) -{ - unsigned long hash; - int i; - const unsigned char *name; - - i = minix_sb(dentry->d_inode->i_sb)->s_namelen; - if (i >= qstr->len) - return 0; - /* Truncate the name in place, avoids having to define a compare - function. */ - qstr->len = i; - name = qstr->name; - hash = init_name_hash(); - while (i--) - hash = partial_name_hash(*name++, hash); - qstr->hash = end_name_hash(hash); - return 0; -} - -struct dentry_operations minix_dentry_operations = { - .d_hash = minix_hash, -}; - static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode * inode = NULL; -- cgit v1.2.3-70-g09d2 From f905f06fca5d3949eca12f5a43e251a404b3470a Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 25 Jul 2008 01:46:15 -0700 Subject: ext2: remove double definitions of xattr macros remove the definitions of macros: XATTR_TRUSTED_PREFIX XATTR_USER_PREFIX since they are defined in linux/xattr.h Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/xattr_security.c | 2 +- fs/ext2/xattr_trusted.c | 4 +--- fs/ext2/xattr_user.c | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index eaa23d2d521..70c0dbdcdcb 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -14,7 +14,7 @@ static size_t ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (list && total_len <= list_size) { diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 83ee149f353..e8219f8eae9 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -12,13 +12,11 @@ #include #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index f383e7c3a7b..92495d28c62 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -11,13 +11,11 @@ #include "ext2.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." 
- static size_t ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) -- cgit v1.2.3-70-g09d2 From 9cfe7b9010aa66da5f3b2bc33d9e30a4d53bd274 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:16 -0700 Subject: ext3: fix synchronization of quota files in journal=data mode In journal=data mode, it is not enough to do write_inode_now as done in vfs_quota_on() to write all data to their final location (which is needed for quota_read to work correctly). Calling journal_flush() does its job. Reported-by: Nick Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2845425077e..50796e90d07 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2759,23 +2759,42 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota or remount? */ - if ((!EXT3_SB(sb)->s_qf_names[USRQUOTA] && - !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) || remount) + /* When remounting, no checks are needed and in fact, path is NULL */ + if (remount) return vfs_quota_on(sb, type, format_id, path, remount); + err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; + /* Quotafile not on the same filesystem? */ if (nd.path.mnt->mnt_sb != sb) { path_put(&nd.path); return -EXDEV; } - /* Quotafile not in fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - printk(KERN_WARNING - "EXT3-fs: Quota file not on filesystem root. " - "Journalled quota will not work.\n"); + /* Journaling quota? */ + if (EXT3_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT3-fs: Quota file not on filesystem root. " + "Journaled quota will not work.\n"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext3_should_journal_data(nd.path.dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... 
+ */ + journal_lock_updates(EXT3_SB(sb)->s_journal); + journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + } + path_put(&nd.path); return vfs_quota_on(sb, type, format_id, path, remount); } -- cgit v1.2.3-70-g09d2 From 99aeaf639f61ab6be1967e5f92e2e28dafad8383 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:17 -0700 Subject: ext3: fix typos in messages and comments (journalled -> journaled) Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 50796e90d07..0a1bf82845c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1020,7 +1020,7 @@ static int parse_options (char *options, struct super_block *sb, set_qf_name: if (sb_any_quota_enabled(sb)) { printk(KERN_ERR - "EXT3-fs: Cannot change journalled " + "EXT3-fs: Cannot change journaled " "quota options when quota turned on.\n"); return 0; } @@ -1058,7 +1058,7 @@ set_qf_name: clear_qf_name: if (sb_any_quota_enabled(sb)) { printk(KERN_ERR "EXT3-fs: Cannot change " - "journalled quota options when " + "journaled quota options when " "quota turned on.\n"); return 0; } @@ -1169,14 +1169,14 @@ clear_qf_name: } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " + printk(KERN_ERR "EXT3-fs: journaled quota format " "not specified.\n"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " - "specified with no journalling " + printk(KERN_ERR "EXT3-fs: journaled quota format " + "specified with no journaling " "enabled.\n"); return 0; } @@ -1370,7 +1370,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, int ret = ext3_quota_on_mount(sb, i); if (ret < 0) printk(KERN_ERR - "EXT3-fs: Cannot turn on journalled " + "EXT3-fs: Cannot turn on journaled " "quota: error %d\n", ret); } } @@ -2712,7 +2712,7 @@ static int ext3_release_dquot(struct dquot *dquot) static int ext3_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); -- cgit v1.2.3-70-g09d2 From d06bf1d252fe16f5f0d13e04da7a9913420aa1cf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:18 -0700 Subject: ext3: correct mount option parsing to detect when quota options can be changed We should not allow user to change quota mount options when quota is just suspended. I would make mount options and internal quota state inconsistent. Also we should not allow user to change quota format when quota is turned on. On the other hand we can just silently ignore when some option is set to the value it already has (mount does this on remount). 
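The intent boils down to: while quota is enabled or merely suspended, a journaled-quota option may only be "set" to the value it already has; a genuine change is rejected, a no-op is silently accepted, and everything is fair game once quota is completely off. A condensed, self-contained illustration of that intent (plain C; the helper name and argument list are invented for the example, this is not the ext3 parser itself):

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  /* Illustration only -- not parse_options(); it just captures the
   * "reject real changes, tolerate no-ops" rule described above. */
  static bool quota_option_change_allowed(bool quota_enabled,
                                          bool quota_suspended,
                                          const char *current_val,
                                          const char *new_val)
  {
      bool same = (!current_val && !new_val) ||
                  (current_val && new_val &&
                   strcmp(current_val, new_val) == 0);

      if ((quota_enabled || quota_suspended) && !same)
          return false;  /* would change quota setup under active quota */
      return true;       /* no-op, or quota fully off */
  }

  int main(void)
  {
      /* Remount with the same usrjquota= value while quota is on: fine. */
      printf("%d\n", quota_option_change_allowed(true, false,
                                                 "aquota.user", "aquota.user"));
      /* Pointing at a different quota file while quota is suspended: no. */
      printf("%d\n", quota_option_change_allowed(false, true,
                                                 "aquota.user", "new.user"));
      return 0;
  }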
Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 0a1bf82845c..615788c6843 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -842,7 +842,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype; + int qtype, qfmt; char *qname; #endif @@ -1018,7 +1018,9 @@ static int parse_options (char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + !sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change journaled " "quota options when quota turned on.\n"); @@ -1056,7 +1058,9 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change " "journaled quota options when " "quota turned on.\n"); @@ -1069,10 +1073,20 @@ clear_qf_name: sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; - break; + qfmt = QFMT_VFS_OLD; + goto set_qf_format; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + qfmt = QFMT_VFS_V0; +set_qf_format: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_jquota_fmt != qfmt) { + printk(KERN_ERR "EXT3-fs: Cannot change " + "journaled quota options when " + "quota turned on.\n"); + return 0; + } + sbi->s_jquota_fmt = qfmt; break; case Opt_quota: case Opt_usrquota: @@ -1084,7 +1098,8 @@ clear_qf_name: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb)) { + if (sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) { printk(KERN_ERR "EXT3-fs: Cannot change quota " "options when quota turned on.\n"); return 0; -- cgit v1.2.3-70-g09d2 From 3850f7a521dc17659ef6758a219f083418788490 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 25 Jul 2008 01:46:19 -0700 Subject: jbd: replace potentially false assertion with if block If an error occurs during jbd cache initialisation it is possible for the journal_head_cache to be NULL when journal_destroy_journal_head_cache is called. Replace the J_ASSERT with an if block to handle the situation correctly. Note that even with this fix things will break badly if jbd is statically compiled in and cache initialisation fails. 
Signed-off-by: Duane Griffin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/journal.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b99c3b3654c..15ea16ad866 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1636,9 +1636,10 @@ static int journal_init_journal_head_cache(void) static void journal_destroy_journal_head_cache(void) { - J_ASSERT(journal_head_cache != NULL); - kmem_cache_destroy(journal_head_cache); - journal_head_cache = NULL; + if (journal_head_cache) { + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; + } } /* -- cgit v1.2.3-70-g09d2 From f4d79ca2fa211cffc07306eeed7013448e77d7ec Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 25 Jul 2008 01:46:20 -0700 Subject: jbd: eliminate duplicated code in revocation table init/destroy functions The revocation table initialisation/destruction code is repeated for each of the two revocation tables stored in the journal. Refactoring the duplicated code into functions is tidier, simplifies the logic in initialisation in particular, and slightly reduces the code size. There should not be any functional change. Signed-off-by: Duane Griffin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/revoke.c | 127 +++++++++++++++++++++++--------------------------------- 1 file changed, 51 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 1bb43e987f4..8ff5a7b89b9 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -195,109 +195,84 @@ void journal_destroy_revoke_caches(void) revoke_table_cache = NULL; } -/* Initialise the revoke table for a given journal to a given size. */ - -int journal_init_revoke(journal_t *journal, int hash_size) +static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) { - int shift, tmp; + int shift = 0; + int tmp = hash_size; + struct jbd_revoke_table_s *table; - J_ASSERT (journal->j_revoke_table[0] == NULL); + table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; - shift = 0; - tmp = hash_size; while((tmp >>= 1UL) != 0UL) shift++; - journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[0]) - return -ENOMEM; - journal->j_revoke = journal->j_revoke_table[0]; - - /* Check that the hash_size is a power of two */ - J_ASSERT(is_power_of_2(hash_size)); - - journal->j_revoke->hash_size = hash_size; - - journal->j_revoke->hash_shift = shift; - - journal->j_revoke->hash_table = + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - journal->j_revoke = NULL; - return -ENOMEM; + if (!table->hash_table) { + kmem_cache_free(revoke_table_cache, table); + table = NULL; + goto out; } for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + INIT_LIST_HEAD(&table->hash_table[tmp]); - journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[1]) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - return -ENOMEM; +out: + return table; +} + +static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < 
table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); } - journal->j_revoke = journal->j_revoke_table[1]; + kfree(table->hash_table); + kmem_cache_free(revoke_table_cache, table); +} - /* Check that the hash_size is a power of two */ +/* Initialise the revoke table for a given journal to a given size. */ +int journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); - journal->j_revoke->hash_size = hash_size; + journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; - journal->j_revoke->hash_shift = shift; + journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; - journal->j_revoke->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); - journal->j_revoke = NULL; - return -ENOMEM; - } - - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; -} -/* Destoy a journal's revoke table. The table must already be empty! */ +fail1: + journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} +/* Destroy a journal's revoke table. The table must already be empty! */ void journal_destroy_revoke(journal_t *journal) { - struct jbd_revoke_table_s *table; - struct list_head *hash_list; - int i; - - table = journal->j_revoke_table[0]; - if (!table) - return; - - for (i=0; ihash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); - journal->j_revoke = NULL; - - table = journal->j_revoke_table[1]; - if (!table) - return; - - for (i=0; ihash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + journal_destroy_revoke_table(journal->j_revoke_table[1]); } -- cgit v1.2.3-70-g09d2 From 1984bb763c2e50d0ebfb0cf56d1b319bd7afe63a Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 25 Jul 2008 01:46:21 -0700 Subject: jbd: tidy up revoke cache initialisation and destruction Make revocation cache destruction safe to call if initialisation fails partially or entirely. This allows it to be used to cleanup in the case of initialisation failure, simplifying that code slightly. 
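The shape being applied here -- make the destroy path tolerate a partially initialised state, then let the init path's failure labels simply call it -- is a common kernel idiom, and is worth seeing in isolation. A minimal standalone sketch of that shape, with plain calloc/free standing in for the kmem_cache allocations (names and structure are illustrative, not the jbd code):

  #include <stdio.h>
  #include <stdlib.h>

  /* Two-slot state, loosely modelled on the pair of revoke tables. */
  struct two_tables {
      int *table[2];
  };

  /* Safe to call however far init got: each slot is checked and reset,
   * so the same function serves error unwinding and normal teardown. */
  static void tables_destroy(struct two_tables *s)
  {
      int i;

      for (i = 0; i < 2; i++) {
          free(s->table[i]);     /* free(NULL) is a no-op */
          s->table[i] = NULL;
      }
  }

  static int tables_init(struct two_tables *s, size_t entries)
  {
      s->table[0] = calloc(entries, sizeof(int));
      if (!s->table[0])
          goto fail;

      s->table[1] = calloc(entries, sizeof(int));
      if (!s->table[1])
          goto fail;

      return 0;

  fail:
      tables_destroy(s);         /* handles the partially built state */
      return -1;
  }

  int main(void)
  {
      struct two_tables s = { { NULL, NULL } };

      if (tables_init(&s, 1024) == 0)
          printf("initialised\n");
      tables_destroy(&s);
      return 0;
  }

This is exactly the structure of journal_init_revoke_caches() in the patch below: its table_cache_failure label just calls journal_destroy_revoke_caches(), which is why that function had to become safe on a partial state first.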
Signed-off-by: Duane Griffin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/revoke.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 8ff5a7b89b9..c7bd649bbbd 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -166,33 +166,43 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } +void journal_destroy_revoke_caches(void) +{ + if (revoke_record_cache) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + } + if (revoke_table_cache) { + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; + } +} + int __init journal_init_revoke_caches(void) { + J_ASSERT(!revoke_record_cache); + J_ASSERT(!revoke_table_cache); + revoke_record_cache = kmem_cache_create("revoke_record", sizeof(struct jbd_revoke_record_s), 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); if (!revoke_record_cache) - return -ENOMEM; + goto record_cache_failure; revoke_table_cache = kmem_cache_create("revoke_table", sizeof(struct jbd_revoke_table_s), 0, SLAB_TEMPORARY, NULL); - if (!revoke_table_cache) { - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - return -ENOMEM; - } + if (!revoke_table_cache) + goto table_cache_failure; + return 0; -} -void journal_destroy_revoke_caches(void) -{ - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - kmem_cache_destroy(revoke_table_cache); - revoke_table_cache = NULL; +table_cache_failure: + journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; } static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) -- cgit v1.2.3-70-g09d2 From 9ebfbe9f926553eabc21b4400918d1216b27ed0c Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 25 Jul 2008 01:46:21 -0700 Subject: ext3: improve some code in rb tree part of dir.c - remove unnecessary code in free_rb_tree_fname - rename free_rb_tree_fname to ext3_htree_create_dir_info since it and ext3_htree_free_dir_info are a pair - replace kmalloc with kzalloc in ext3_htree_free_dir_info Signed-off-by: Shen Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/dir.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 8ca3bfd7242..2eea96ec78e 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -272,7 +272,7 @@ static void free_rb_tree_fname(struct rb_root *root) while (n) { /* Do the node's children first */ - if ((n)->rb_left) { + if (n->rb_left) { n = n->rb_left; continue; } @@ -301,24 +301,18 @@ static void free_rb_tree_fname(struct rb_root *root) parent->rb_right = NULL; n = parent; } - root->rb_node = NULL; } -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; p->curr_hash = pos2maj_hash(pos); p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; return p; } @@ -433,7 +427,7 @@ static int ext3_dx_readdir(struct file * filp, int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; -- cgit v1.2.3-70-g09d2 From 
3f31fddfa26b7594b44ff2b34f9a04ba409e0f91 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 25 Jul 2008 01:46:22 -0700 Subject: jbd: fix race between free buffer and commit transaction journal_try_to_free_buffers() could race with jbd commit transaction when the later is holding the buffer reference while waiting for the data buffer to flush to disk. If the caller of journal_try_to_free_buffers() request tries hard to release the buffers, it will treat the failure as error and return back to the caller. We have seen the directo IO failed due to this race. Some of the caller of releasepage() also expecting the buffer to be dropped when passed with GFP_KERNEL mask to the releasepage()->journal_try_to_free_buffers(). With this patch, if the caller is passing the __GFP_WAIT and __GFP_FS to indicating this call could wait, in case of try_to_free_buffers() failed, let's waiting for journal_commit_transaction() to finish commit the current committing transaction, then try to free those buffers again. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mingming Cao Reviewed-by: Badari Pulavarty Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/transaction.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++-- mm/filemap.c | 3 +-- 2 files changed, 56 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 67ff2024c23..8dee3200750 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1648,12 +1648,42 @@ out: return; } +/* + * journal_try_to_free_buffers() could race with journal_commit_transaction() + * The latter might still hold the a count on buffers when inspecting + * them on t_syncdata_list or t_locked_list. + * + * journal_try_to_free_buffers() will call this function to + * wait for the current transaction to finish syncing data buffers, before + * tryinf to free that buffer. + * + * Called with journal->j_state_lock held. + */ +static void journal_wait_for_transaction_sync_data(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + + spin_lock(&journal->j_state_lock); + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return; + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); +} /** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. * * * For all the buffers on this page, @@ -1682,9 +1712,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? 
+ * + * Return 0 on failure, 1 on success */ int journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + + /* + * There are a number of places where journal_try_to_free_buffers() + * could race with journal_commit_transaction(), the later still + * holds the reference to the buffers to free while processing them. + * try_to_free_buffers() failed to free those buffers. Some of the + * caller of releasepage() request page buffers to be dropped, otherwise + * treat the fail-to-free as errors (such as generic_file_direct_IO()) + * + * So, if the caller of try_to_release_page() wants the synchronous + * behaviour(i.e make sure buffers are dropped upon return), + * let's wait for the current transaction to finish flush of + * dirty data buffers, then try to free those buffers again, + * with the journal locked. + */ + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { + journal_wait_for_transaction_sync_data(journal); + ret = try_to_free_buffers(page); + } + busy: return ret; } diff --git a/mm/filemap.c b/mm/filemap.c index 7675b91f4f6..5d4c880d7cd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2563,9 +2563,8 @@ EXPORT_SYMBOL(generic_file_aio_write); * Otherwise return zero. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * - * NOTE: @gfp_mask may go away, and this function may become non-blocking. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { -- cgit v1.2.3-70-g09d2 From ef1afd39519b74fbe1f63c9ab5a14490effec0e3 Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 25 Jul 2008 01:46:23 -0700 Subject: ext3: remove double definitions of xattr macros remove the definitions of macros: XATTR_TRUSTED_PREFIX XATTR_USER_PREFIX since they are defined in linux/xattr.h Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/xattr_security.c | 2 +- fs/ext3/xattr_trusted.c | 4 +--- fs/ext3/xattr_user.c | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 821efaf2b94..37b81097bdf 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -15,7 +15,7 @@ static size_t ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index 0327497a55c..c7c41a410c4 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -13,13 +13,11 @@ #include #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." 
- static size_t ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 1abd8f92c44..430fe63b31b 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -12,13 +12,11 @@ #include #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) -- cgit v1.2.3-70-g09d2 From ae76dd9a6b5bbe5315fb7028e03f68f75b8538f3 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 25 Jul 2008 01:46:23 -0700 Subject: ext3: handle corrupted orphan list at mount If the orphan node list includes valid, untruncatable nodes with nlink > 0 the ext3_orphan_cleanup loop which attempts to delete them will not do so, causing it to loop forever. Fix by checking for such nodes in the ext3_orphan_get function. This patch fixes the second case (image hdb.20000009.softlockup.gz) reported in http://bugzilla.kernel.org/show_bug.cgi?id=10882. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: printk warning fix] Signed-off-by: Duane Griffin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/ialloc.c | 9 +++++++++ fs/ext3/inode.c | 20 ++++++++++++++------ include/linux/ext3_fs.h | 1 + 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 77126821b2e..47b678d73e7 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -669,6 +669,14 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. 
+ */ + if (inode->i_nlink && !ext3_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -690,6 +698,7 @@ bad_orphan: printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 6ae4ecf3ce4..74b432fa166 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2253,6 +2253,19 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, } } +int ext3_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + /* * ext3_truncate() * @@ -2297,12 +2310,7 @@ void ext3_truncate(struct inode *inode) unsigned blocksize = inode->i_sb->s_blocksize; struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext3_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!ext3_can_truncate(inode)) return; /* diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 36c54039637..80171ee89a2 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -832,6 +832,7 @@ extern void ext3_discard_reservation (struct inode *); extern void ext3_dirty_inode(struct inode *); extern int ext3_change_inode_journal_flag(struct inode *, int); extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); +extern int ext3_can_truncate(struct inode *inode); extern void ext3_truncate (struct inode *); extern void ext3_set_inode_flags(struct inode *); extern void ext3_get_inode_flags(struct ext3_inode_info *); -- cgit v1.2.3-70-g09d2 From 95450f5a7e53d5752ce1a0d0b8282e10fe745ae0 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Fri, 25 Jul 2008 01:46:24 -0700 Subject: ext3: don't read inode block if the buffer has a write error A transient I/O error can corrupt inode data. Here is the scenario: (1) update inode_A at the block_B (2) pdflush writes out new inode_A to the filesystem, but it results in write I/O error, at this point, BH_Uptodate flag of the buffer for block_B is cleared and BH_Write_EIO is set (3) create new inode_C which located at block_B, and __ext3_get_inode_loc() tries to read on-disk block_B because the buffer is not uptodate (4) if it can read on-disk block_B successfully, inode_A is overwritten by old data This patch makes __ext3_get_inode_loc() not read the inode block if the buffer has BH_Write_EIO flag. In this case, the buffer should have the latest information, so setting the uptodate flag to the buffer (this avoids WARN_ON_ONCE() in mark_buffer_dirty().) According to this change, we would need to test BH_Write_EIO flag for the error checking. Currently nobody checks write I/O errors on metadata buffers, but it will be done in other patches I'm working on. 
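The four-step scenario is easier to follow as an explicit state model: after the failed write in step (2) the buffer is !uptodate but its in-memory contents are newer than what is on disk, so re-reading in step (3) is what resurrects the stale inode_A in step (4). A toy model of the decision the patch adds (plain C with two booleans standing in for BH_Uptodate and BH_Write_EIO; this is not the buffer_head API):

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy stand-in for the two buffer_head bits the fix cares about. */
  struct toy_buffer {
      bool uptodate;         /* BH_Uptodate  */
      bool write_io_error;   /* BH_Write_EIO */
  };

  /*
   * The decision the patch adds: if a previous write of this block
   * failed, the in-memory copy is newer than the on-disk one, so trust
   * it (mark it uptodate) instead of re-reading and overwriting the
   * newer inode data with stale disk contents.
   */
  static bool must_read_from_disk(struct toy_buffer *bh)
  {
      if (bh->write_io_error && !bh->uptodate) {
          bh->uptodate = true;   /* keep the newer in-memory copy */
          return false;
      }
      return !bh->uptodate;
  }

  int main(void)
  {
      /* State after step (2): the write failed, the flags flipped. */
      struct toy_buffer bh = { .uptodate = false, .write_io_error = true };

      printf("re-read block from disk? %s\n",
             must_read_from_disk(&bh) ? "yes" : "no, keep in-memory data");
      return 0;
  }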
Signed-off-by: Hidehiro Kawai Cc: sugita Cc: Satoshi OSHIMA Cc: Nick Piggin Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 74b432fa166..36f74f17a11 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2521,6 +2521,16 @@ static int __ext3_get_inode_loc(struct inode *inode, } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); -- cgit v1.2.3-70-g09d2 From 3ccc3167b0e5d46ab3bf03e22fbdb7616ce038cd Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 25 Jul 2008 01:46:26 -0700 Subject: ext3: handle deleting corrupted indirect blocks While freeing indirect blocks we attach a journal head to the parent buffer head, free the blocks, then journal the parent. If the indirect block list is corrupted and points to the parent the journal head will be detached when the block is cleared, causing an OOPS. Check for that explicitly and handle it gracefully. This patch fixes the third case (image hdb.20000057.nullderef.gz) reported in http://bugzilla.kernel.org/show_bug.cgi?id=10882. Immediately above the change, in the ext3_free_data function, we call ext3_clear_blocks to clear the indirect blocks in this parent block. If one of those blocks happens to actually be the parent block it will clear b_private / BH_JBD. I did the check at the end rather than earlier as it seemed more elegant. I don't think there should be much practical difference, although it is possible the FS may not be quite so badly corrupted if we did it the other way (and didn't clear the block at all). To be honest, I'm not convinced there aren't other similar failure modes lurking in this code, although I couldn't find any with a quick review. [akpm@linux-foundation.org: fix printk warning] Signed-off-by: Duane Griffin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 36f74f17a11..3bf07d70b91 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2127,7 +2127,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. 
+ */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); } } -- cgit v1.2.3-70-g09d2 From a10320e8f7c4dcfa050aac566092f29b40458d5a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:46:26 -0700 Subject: jbd: unexport journal_update_superblock Remove the unused EXPORT_SYMBOL(journal_update_superblock). Signed-off-by: Adrian Bunk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/journal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 15ea16ad866..aa7143a8349 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features); EXPORT_SYMBOL(journal_create); EXPORT_SYMBOL(journal_load); EXPORT_SYMBOL(journal_destroy); -EXPORT_SYMBOL(journal_update_superblock); EXPORT_SYMBOL(journal_abort); EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); -- cgit v1.2.3-70-g09d2 From fc80c44277b3c92d808b73e9d40e120229aa4b6a Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Fri, 25 Jul 2008 01:46:29 -0700 Subject: jbd: positively dispose the unmapped data buffers in journal_commit_transaction() After ext3-ordered files are truncated, there is a possibility that the pages which cannot be estimated still remain. Remaining pages can be released when the system has really few memory. So, it is not memory leakage. But the resource management software etc. may not work correctly. It is possible that journal_unmap_buffer() cannot release the buffers, and the pages to which they belong because they are attached to a commiting transaction and journal_unmap_buffer() cannot release them. To release such the buffers and the pages later, journal_unmap_buffer() leaves it to journal_commit_transaction(). (journal_unmap_buffer() puts the mark 'BH_Freed' to the buffers so that journal_commit_transaction() can identify whether they can be released or not.) In the journalled mode and the writeback mode, jbd does with only metadata buffers. But in the ordered mode, jbd does with metadata buffers and also data buffers. Actually, journal_commit_transaction() releases only the metadata buffers of which release is demanded by journal_unmap_buffer(), and also releases the pages to which they belong if possible. As a result, the data buffers of which release is demanded by journal_unmap_buffer() remain after a transaction commits. And also the pages to which they belong remain. Such the remained pages don't have mapping any longer. Due to this fact, there is a possibility that the pages which cannot be estimated remain. The metadata buffers marked 'BH_Freed' and the pages to which they belong can be released at 'JBD: commit phase 7'. Therefore, by applying the same code into 'JBD: commit phase 2' (where the data buffers are done with), journal_commit_transaction() can also release the data buffers marked 'BH_Freed' and the pages to which they belong. As a result, all the buffers marked 'BH_Freed' can be released, and also all the pages to which these buffers belong can be released at journal_commit_transaction(). So, the page which cannot be estimated is lost. <> > spin_lock(&journal->j_list_lock); > while (commit_transaction->t_forget) { > transaction_t *cp_transaction; > struct buffer_head *bh; > > jh = commit_transaction->t_forget; >... 
> if (buffer_freed(bh)) { > ^^^^^^^^^^^^^^^^^^^^^^^^ > clear_buffer_freed(bh); > ^^^^^^^^^^^^^^^^^^^^^^^^ > clear_buffer_jbddirty(bh); > } > > if (buffer_jbddirty(bh)) { > JBUFFER_TRACE(jh, "add to new checkpointing trans"); > __journal_insert_checkpoint(jh, commit_transaction); > JBUFFER_TRACE(jh, "refile for checkpoint writeback"); > __journal_refile_buffer(jh); > jbd_unlock_bh_state(bh); > } else { > J_ASSERT_BH(bh, !buffer_dirty(bh)); > ... > JBUFFER_TRACE(jh, "refile or unfile freed buffer"); > __journal_refile_buffer(jh); > if (!jh->b_transaction) { > jbd_unlock_bh_state(bh); > /* needs a brelse */ > journal_remove_journal_head(bh); > release_buffer_page(bh); > ^^^^^^^^^^^^^^^^^^^^^^^^ > } else > } **************************************************************** * Apply the code of "^^^^^^" lines into 'JBD: commit phase 2' * **************************************************************** At journal_commit_transaction() code, there is one extra message in the series of jbd debug messages. ("JBD: commit phase 2") This patch fixes it, too. Signed-off-by: Toshiyuki Okajima Acked-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 5a8ca61498c..f943b9b3f20 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) /* * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * not successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -77,6 +77,19 @@ nope: __brelse(bh); } +/* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. + */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + release_buffer_page(bh); + } else + put_bh(bh); +} + /* * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is * held. For ranking reasons we must trylock. 
If we lose, schedule away and @@ -231,7 +244,7 @@ write_out_data: if (locked) unlock_buffer(bh); BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); + release_data_buffer(bh); continue; } if (locked && test_clear_buffer_dirty(bh)) { @@ -258,10 +271,10 @@ write_out_data: if (locked) unlock_buffer(bh); journal_remove_journal_head(bh); - /* Once for our safety reference, once for + /* One for our safety reference, other for * journal_remove_journal_head() */ put_bh(bh); - put_bh(bh); + release_data_buffer(bh); } if (need_resched() || spin_needbreak(&journal->j_list_lock)) { @@ -443,7 +456,7 @@ void journal_commit_transaction(journal_t *journal) } else { jbd_unlock_bh_state(bh); } - put_bh(bh); + release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); @@ -453,8 +466,6 @@ void journal_commit_transaction(journal_t *journal) journal_write_revoke_records(journal, commit_transaction); - jbd_debug(3, "JBD: commit phase 2\n"); - /* * If we found any dirty or locked buffers, then we should have * looped back up to the write_out_data label. If there weren't -- cgit v1.2.3-70-g09d2 From 8ef2720397bb813d4985405a5ae7b8ad6474188b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:46:29 -0700 Subject: ext3: kill 2 useless magic numbers dx_root_limit() will never return 20, and I can't figure out what 20 stands for. This function has never changed since htree directory indexing was merged. Similar for dx_node_limit() and the magic 22. Signed-off-by: Li Zefan Acked-by: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0b8cf80154f..d282ea87008 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -240,13 +240,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - EXT3_DIR_REC_LEN(2) - infosize; - return 0? 20: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit (struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); - return 0? 22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* -- cgit v1.2.3-70-g09d2 From cbe5f466f6995e10a10c7ae66d6dc8608f08a6b8 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Fri, 25 Jul 2008 01:46:30 -0700 Subject: jbd: don't abort if flushing file data failed In ordered mode, the current jbd aborts the journal if a file data buffer has an error. But this behavior is unintended, and we found that it has been adopted accidentally. This patch undoes it and just calls printk() instead of aborting the journal. Additionally, set AS_EIO into the address_space object of the failed buffer which is submitted by journal_do_submit_data() so that fsync() can get -EIO. Missing error checkings are also added to inform errors on file data buffers to the user. The following buffers are targeted. 
(a) the buffer which has already been written out by pdflush (b) the buffer which has been unlocked before scanned in the t_locked_list loop [akpm@linux-foundation.org: improve grammar in a printk] Signed-off-by: Hidehiro Kawai Acked-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f943b9b3f20..2eccbfaa1d4 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -185,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) /* * Submit all the data buffers to disk */ -static void journal_submit_data_buffers(journal_t *journal, +static int journal_submit_data_buffers(journal_t *journal, transaction_t *commit_transaction) { struct journal_head *jh; @@ -193,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal, int locked; int bufs = 0; struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; /* * Whenever we unlock the journal and sleep, things can get added @@ -266,6 +267,8 @@ write_out_data: put_bh(bh); } else { BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); if (locked) @@ -284,6 +287,8 @@ write_out_data: } spin_unlock(&journal->j_list_lock); journal_do_submit_data(wbuf, bufs); + + return err; } /* @@ -423,8 +428,7 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction); /* * Wait for all previously submitted IO to complete. @@ -439,10 +443,21 @@ void journal_commit_transaction(journal_t *journal) if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; spin_lock(&journal->j_list_lock); } + if (unlikely(!buffer_uptodate(bh))) { + if (TestSetPageLocked(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); @@ -461,8 +476,14 @@ void journal_commit_transaction(journal_t *journal) } spin_unlock(&journal->j_list_lock); - if (err) - journal_abort(journal, err); + if (err) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + err = 0; + } journal_write_revoke_records(journal, commit_transaction); -- cgit v1.2.3-70-g09d2 From 275c0a8f1253a7542ad9726956c918d8a1f694c4 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 25 Jul 2008 01:46:31 -0700 Subject: ext3: validate directory entry data before use ext3_dx_find_entry uses ext3_next_entry without verifying that the entry is valid. If its rec_len == 0 this causes an infinite loop. Refactor the loop to check the validity of entries before checking whether they match and moving onto the next one. There are other uses of ext3_next_entry in this file which also look problematic. They should be reviewed and fixed if/when we have a test-case that triggers them. 
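For context, advancing through a directory block relies entirely on each entry's self-described rec_len; the helper behaves roughly like this (a simplified sketch of fs/ext3/namei.c, not code added by this patch):

static inline struct ext3_dir_entry_2 *next_entry(struct ext3_dir_entry_2 *p)
{
	/* step over the current entry using its on-disk record length */
	return (struct ext3_dir_entry_2 *)((char *)p + le16_to_cpu(p->rec_len));
}

With a corrupted rec_len of 0 the cursor never advances, so the "de < top" scan spins forever; validating each entry with ext3_check_dir_entry() before trying to match it turns that into an ERR_BAD_DX_DIR error instead.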
This patch fixes the first case (image hdb.25.softlockup.gz) reported in http://bugzilla.kernel.org/show_bug.cgi?id=10882. Signed-off-by: Duane Griffin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index d282ea87008..de13e919cd8 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -991,19 +991,21 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, de = (struct ext3_dir_entry_2 *) bh->b_data; top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) - if (ext3_match (namelen, name, de)) { - if (!ext3_check_dir_entry("ext3_find_entry", - dir, de, bh, - (block<b_data))) { - brelse (bh); + for (; de < top; de = ext3_next_entry(de)) { + int off = (block << EXT3_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); + + if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) { + brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; } - *res_dir = de; - dx_release (frames); - return bh; + + if (ext3_match(namelen, name, de)) { + *res_dir = de; + dx_release(frames); + return bh; + } } brelse (bh); /* Check to see if we should continue to search */ -- cgit v1.2.3-70-g09d2 From c0a1633b6201ef79e31b7da464d44fdf5953054d Mon Sep 17 00:00:00 2001 From: Adam Greenblatt Date: Fri, 25 Jul 2008 01:46:32 -0700 Subject: isofs: fix minor filesystem corruption Some iso9660 images contain files with rockridge data that is either incorrect or incompletely parsed. Prior to commit f2966632a134e865db3c819346a1dc7d96e05309 ("[PATCH] rock: handle directory overflows") (included with kernel 2.6.13) the kernel ignored the rockridge data for these files, while still allowing the files to be accessed under their non-rockridge names. That commit inadvertently changed things so that files with invalid rockridge data could not be accessed at all. (I ran across the problem when comparing some old CDs with hard disk copies I had made long ago under kernel 2.4: a few of the files on the hard disk copies were no longer visible on the CDs.) This change reverts to the pre-2.6.13 behavior. Signed-off-by: Adam Greenblatt Reviewed-by: Pekka Enberg Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/rock.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 6bd48f0a704..c2fb2dd0131 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -209,6 +209,11 @@ repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); @@ -216,8 +221,12 @@ repeat: goto eio; rs.chr += rr->len; rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. 
+ */ if (rs.len < 0) - goto eio; /* corrupted isofs */ + goto out; /* Something got screwed up here */ switch (sig) { case SIG('R', 'R'): @@ -307,6 +316,11 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de, repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); @@ -314,8 +328,12 @@ repeat: goto eio; rs.chr += rr->len; rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ if (rs.len < 0) - goto eio; /* corrupted isofs */ + goto out; /* Something got screwed up here */ switch (sig) { #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ -- cgit v1.2.3-70-g09d2 From de0ca06a99c33df8333955642843331ab6b6e7ff Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:46:34 -0700 Subject: coda: remove CODA_FS_OLD_API While fixing CONFIG_ leakages to the userspace kernel headers I ran into CODA_FS_OLD_API. After five years, are there still people using the old API left? Especially considering that you have to choose at compile time which API to support in the kernel (and distributions tend to offer the new API for some time). Jan: "The old API can definitely go. Around the time the new interface went in there were some non-Coda userspace file system implementations that took a while longer to convert to the new API, but by now they all switched to the new interface or in some cases to a FUSE-based solution." Signed-off-by: Adrian Bunk Acked-by: Jan Harkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 14 -------------- fs/coda/coda_linux.c | 6 ++---- fs/coda/psdev.c | 4 ---- fs/coda/upcall.c | 15 +-------------- include/linux/coda.h | 43 ------------------------------------------- 5 files changed, 3 insertions(+), 79 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 37db79a2ff9..ed563b9e352 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -2093,20 +2093,6 @@ config CODA_FS To compile the coda client support as a module, choose M here: the module will be called coda. -config CODA_FS_OLD_API - bool "Use 96-bit Coda file identifiers" - depends on CODA_FS - help - A new kernel-userspace API had to be introduced for Coda v6.0 - to support larger 128-bit file identifiers as needed by the - new realms implementation. - - However this new API is not backward compatible with older - clients. If you really need to run the old Coda userspace - cache manager then say Y. - - For most cases you probably want to say N. - config AFS_FS tristate "Andrew File System support (AFS) (EXPERIMENTAL)" depends on INET && EXPERIMENTAL diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index e1c854890f9..bf4a3fd3c8e 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -28,11 +28,9 @@ int coda_fake_statfs; char * coda_f2s(struct CodaFid *f) { static char s[60]; -#ifdef CONFIG_CODA_FS_OLD_API - sprintf(s, "(%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2]); -#else + sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); -#endif + return s; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 40c36f7352a..0d9b80ec689 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -378,11 +378,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. 
Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); -#ifdef CONFIG_CODA_FS_OLD_API -MODULE_VERSION("5.3.21"); -#else MODULE_VERSION("6.6"); -#endif static int __init init_coda(void) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 359e531094d..ce432bca95d 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -52,12 +52,8 @@ static void *alloc_upcall(int opcode, int size) inp->ih.opcode = opcode; inp->ih.pid = current->pid; inp->ih.pgid = task_pgrp_nr(current); -#ifdef CONFIG_CODA_FS_OLD_API - memset(&inp->ih.cred, 0, sizeof(struct coda_cred)); - inp->ih.cred.cr_fsuid = current->fsuid; -#else inp->ih.uid = current->fsuid; -#endif + return (void*)inp; } @@ -166,20 +162,11 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, union inputArgs *inp; union outputArgs *outp; int insize, outsize, error; -#ifdef CONFIG_CODA_FS_OLD_API - struct coda_cred cred = { 0, }; - cred.cr_fsuid = uid; -#endif insize = SIZE(release); UPARG(CODA_CLOSE); -#ifdef CONFIG_CODA_FS_OLD_API - memcpy(&(inp->ih.cred), &cred, sizeof(cred)); -#else inp->ih.uid = uid; -#endif - inp->coda_close.VFid = *fid; inp->coda_close.flags = flags; diff --git a/include/linux/coda.h b/include/linux/coda.h index b5cf0780c51..96c87693800 100644 --- a/include/linux/coda.h +++ b/include/linux/coda.h @@ -199,28 +199,6 @@ typedef u_int32_t vuid_t; typedef u_int32_t vgid_t; #endif /*_VUID_T_ */ -#ifdef CONFIG_CODA_FS_OLD_API -struct CodaFid { - u_int32_t opaque[3]; -}; - -static __inline__ ino_t coda_f2i(struct CodaFid *fid) -{ - if ( ! fid ) - return 0; - if (fid->opaque[1] == 0xfffffffe || fid->opaque[1] == 0xffffffff) - return ((fid->opaque[0] << 20) | (fid->opaque[2] & 0xfffff)); - else - return (fid->opaque[2] + (fid->opaque[1]<<10) + (fid->opaque[0]<<20)); -} - -struct coda_cred { - vuid_t cr_uid, cr_euid, cr_suid, cr_fsuid; /* Real, efftve, set, fs uid*/ - vgid_t cr_groupid, cr_egid, cr_sgid, cr_fsgid; /* same for groups */ -}; - -#else /* not defined(CONFIG_CODA_FS_OLD_API) */ - struct CodaFid { u_int32_t opaque[4]; }; @@ -228,8 +206,6 @@ struct CodaFid { #define coda_f2i(fid)\ (fid ? (fid->opaque[3] ^ (fid->opaque[2]<<10) ^ (fid->opaque[1]<<20) ^ fid->opaque[0]) : 0) -#endif - #ifndef _VENUS_VATTR_T_ #define _VENUS_VATTR_T_ /* @@ -313,15 +289,7 @@ struct coda_statfs { #define CIOC_KERNEL_VERSION _IOWR('c', 10, size_t) -#if 0 -#define CODA_KERNEL_VERSION 0 /* don't care about kernel version number */ -#define CODA_KERNEL_VERSION 1 /* The old venus 4.6 compatible interface */ -#endif -#ifdef CONFIG_CODA_FS_OLD_API -#define CODA_KERNEL_VERSION 2 /* venus_lookup got an extra parameter */ -#else #define CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */ -#endif /* * Venus <-> Coda RPC arguments @@ -329,16 +297,9 @@ struct coda_statfs { struct coda_in_hdr { u_int32_t opcode; u_int32_t unique; /* Keep multiple outstanding msgs distinct */ -#ifdef CONFIG_CODA_FS_OLD_API - u_int16_t pid; /* Common to all */ - u_int16_t pgid; /* Common to all */ - u_int16_t sid; /* Common to all */ - struct coda_cred cred; /* Common to all */ -#else pid_t pid; pid_t pgid; vuid_t uid; -#endif }; /* Really important that opcode and unique are 1st two fields! 
*/ @@ -613,11 +574,7 @@ struct coda_vget_out { /* CODA_PURGEUSER is a venus->kernel call */ struct coda_purgeuser_out { struct coda_out_hdr oh; -#ifdef CONFIG_CODA_FS_OLD_API - struct coda_cred cred; -#else vuid_t uid; -#endif }; /* coda_zapfile: */ -- cgit v1.2.3-70-g09d2 From 3084b72de73a6f8af0f16989ffb348068bd066d4 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 25 Jul 2008 01:46:34 -0700 Subject: hfs: convert bitmap_lock in a mutex Apple Macintosh file system: The semaphore bitmap_lock is used as a mutex. Convert it to the mutex API Signed-off-by: Matthias Kaehlcke Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/bitmap.c | 8 ++++---- fs/hfs/hfs_fs.h | 3 ++- fs/hfs/super.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 24e75798ddf..c6e97366e8a 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -145,7 +145,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) if (!*num_bits) return 0; - down(&HFS_SB(sb)->bitmap_lock); + mutex_lock(&HFS_SB(sb)->bitmap_lock); bitmap = HFS_SB(sb)->bitmap; pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); @@ -162,7 +162,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: - up(&HFS_SB(sb)->bitmap_lock); + mutex_unlock(&HFS_SB(sb)->bitmap_lock); return pos; } @@ -205,7 +205,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; - down(&HFS_SB(sb)->bitmap_lock); + mutex_lock(&HFS_SB(sb)->bitmap_lock); /* bitmap is always on a 32-bit boundary */ curr = HFS_SB(sb)->bitmap + (start / 32); len = count; @@ -236,7 +236,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) } out: HFS_SB(sb)->free_ablocks += len; - up(&HFS_SB(sb)->bitmap_lock); + mutex_unlock(&HFS_SB(sb)->bitmap_lock); hfs_bitmap_dirty(sb); return 0; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 147374b6f67..ad652881911 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -139,7 +140,7 @@ struct hfs_sb_info { struct nls_table *nls_io, *nls_disk; - struct semaphore bitmap_lock; + struct mutex bitmap_lock; unsigned long flags; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 8cf67974adf..ac2ec5ef66e 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -372,7 +372,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &hfs_super_operations; sb->s_flags |= MS_NODIRATIME; - init_MUTEX(&sbi->bitmap_lock); + mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); if (res) { -- cgit v1.2.3-70-g09d2 From 39f8d472f280dee6503a364d1d911b9e20ce3ec9 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 25 Jul 2008 01:46:35 -0700 Subject: hfs: convert extents_lock in a mutex Apple Macintosh file system: The semaphore extens_lock is used as a mutex. 
Convert it to the mutex API Signed-off-by: Matthias Kaehlcke Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 2 +- fs/hfs/extent.c | 14 +++++++------- fs/hfs/hfs_fs.h | 2 +- fs/hfs/inode.c | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index f6621a78520..9b9d6395bad 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -40,7 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; - init_MUTEX(&HFS_I(tree->inode)->extents_lock); + mutex_init(&HFS_I(tree->inode)->extents_lock); switch (id) { case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index c176f67ba0a..2c16316d291 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -343,16 +343,16 @@ int hfs_get_block(struct inode *inode, sector_t block, goto done; } - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_ext_read_extent(inode, ablock); if (!res) dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, ablock - HFS_I(inode)->cached_start); else { - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); return -EIO; } - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); done: map_bh(bh_result, sb, HFS_SB(sb)->fs_start + @@ -375,7 +375,7 @@ int hfs_extend_file(struct inode *inode) u32 start, len, goal; int res; - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); else { @@ -425,7 +425,7 @@ int hfs_extend_file(struct inode *inode) goto insert_extent; } out: - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); if (!res) { HFS_I(inode)->alloc_blocks += len; mark_inode_dirty(inode); @@ -487,7 +487,7 @@ void hfs_file_truncate(struct inode *inode) if (blk_cnt == alloc_cnt) goto out; - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); hfs_find_init(HFS_SB(sb)->ext_tree, &fd); while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { @@ -514,7 +514,7 @@ void hfs_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); HFS_I(inode)->alloc_blocks = blk_cnt; out: diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index ad652881911..9955232fdf8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -54,7 +54,7 @@ struct hfs_inode_info { struct list_head open_dir_list; struct inode *rsrc_inode; - struct semaphore extents_lock; + struct mutex extents_lock; u16 alloc_blocks, clump_blocks; sector_t fs_blocks; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 97f8446c4ff..dc4ec640e87 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -150,7 +150,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) if (!inode) return NULL; - init_MUTEX(&HFS_I(inode)->extents_lock); + mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); inode->i_ino = HFS_SB(sb)->next_id++; @@ -281,7 +281,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; - init_MUTEX(&HFS_I(inode)->extents_lock); 
+ mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); /* Initialize the inode */ -- cgit v1.2.3-70-g09d2 From 895c23f8c39c0c8d7b536bb2566d4aa968d78be2 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 25 Jul 2008 01:46:36 -0700 Subject: hfsplus: convert the extents_lock in a mutex Apple Extended HFS file system: The semaphore extents lock is used as a mutex. Convert it to the mutex API. Signed-off-by: Matthias Kaehlcke Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/extents.c | 14 +++++++------- fs/hfsplus/hfsplus_fs.h | 3 ++- fs/hfsplus/inode.c | 4 ++-- fs/hfsplus/super.c | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 12e899cd788..fec8f61227f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -199,16 +199,16 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, goto done; } - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - HFSPLUS_I(inode).cached_start); } else { - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); return -EIO; } - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); done: dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); @@ -355,7 +355,7 @@ int hfsplus_file_extend(struct inode *inode) return -ENOSPC; } - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); else { @@ -408,7 +408,7 @@ int hfsplus_file_extend(struct inode *inode) goto insert_extent; } out: - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); if (!res) { HFSPLUS_I(inode).alloc_blocks += len; mark_inode_dirty(inode); @@ -465,7 +465,7 @@ void hfsplus_file_truncate(struct inode *inode) if (blk_cnt == alloc_cnt) goto out; - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); while (1) { if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { @@ -492,7 +492,7 @@ void hfsplus_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).alloc_blocks = blk_cnt; out: diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9e59537b43d..f027a905225 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,6 +11,7 @@ #define _LINUX_HFSPLUS_FS_H #include +#include #include #include "hfsplus_raw.h" @@ -154,7 +155,7 @@ struct hfsplus_sb_info { struct hfsplus_inode_info { - struct semaphore extents_lock; + struct mutex extents_lock; u32 clump_blocks, alloc_blocks; sector_t fs_blocks; /* Allocation extents from catalog record or volume header */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 67e1c8b467c..cc3b5e24339 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -163,7 +163,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent inode->i_ino = dir->i_ino; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); 
HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); @@ -316,7 +316,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); atomic_set(&HFSPLUS_I(inode).opencnt, 0); HFSPLUS_I(inode).flags = 0; memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ce97a54518d..3859118531c 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -34,7 +34,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) return inode; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).flags = 0; HFSPLUS_I(inode).rsrc_inode = NULL; atomic_set(&HFSPLUS_I(inode).opencnt, 0); -- cgit v1.2.3-70-g09d2 From 5d4f7fddf8882b214e4aabb3bdb37f90a72b2537 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:36 -0700 Subject: reiserfs: fix synchronization of quota files in journal=data mode In journal=data mode, it is not enough to do write_inode_now() as done in vfs_quota_on() to write all data to their final location (which is needed for quota_read to work correctly). Calling journal_end_sync() before calling vfs_quota_on() does it's job because transactions are committed to the journal and data marked as dirty in memory so write_inode_now() writes them to their final locations. Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1d40f2bd197..0cbf4cd1114 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2026,6 +2026,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, int err; struct nameidata nd; struct inode *inode; + struct reiserfs_transaction_handle th; if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; @@ -2053,17 +2054,28 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, } mark_inode_dirty(inode); } - /* Not journalling quota? No more tests needed... */ - if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && - !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { - path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path, 0); - } - /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - reiserfs_warning(sb, + /* Journaling quota? */ + if (REISERFS_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. " "Journalled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... 
+ */ + if (reiserfs_file_data_log(inode)) { + /* Just start temporary transaction and finish it */ + err = journal_begin(&th, sb, 1); + if (err) + return err; + err = journal_end_sync(&th, sb, 1); + if (err) + return err; + } path_put(&nd.path); return vfs_quota_on(sb, type, format_id, path, 0); } -- cgit v1.2.3-70-g09d2 From 4506567b24d3ea707e46e8aad64caef539382f4b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:37 -0700 Subject: reiserfs: fix typos in messages and comments (journalled -> journaled) Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0cbf4cd1114..f723604c5d9 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -182,7 +182,7 @@ static int finish_unfinished(struct super_block *s) int ret = reiserfs_quota_on_mount(s, i); if (ret < 0) reiserfs_warning(s, - "reiserfs: cannot turn on journalled quota: error %d", + "reiserfs: cannot turn on journaled quota: error %d", ret); } } @@ -994,7 +994,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (sb_any_quota_enabled(s)) { reiserfs_warning(s, - "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); + "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); return 0; } if (*arg) { /* Some filename specified? */ @@ -1039,7 +1039,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin #else if (c == 'u' || c == 'g' || c == 'f') { reiserfs_warning(s, - "reiserfs_parse_options: journalled quota options not supported."); + "reiserfs_parse_options: journaled quota options not supported."); return 0; } #endif @@ -1050,7 +1050,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin && (REISERFS_SB(s)->s_qf_names[USRQUOTA] || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { reiserfs_warning(s, - "reiserfs_parse_options: journalled quota format not specified."); + "reiserfs_parse_options: journaled quota format not specified."); return 0; } /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ @@ -1980,7 +1980,7 @@ static int reiserfs_release_dquot(struct dquot *dquot) static int reiserfs_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); -- cgit v1.2.3-70-g09d2 From 00b441970a0ab48185244300ac7d4e4eb76df692 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:38 -0700 Subject: reiserfs: correct mount option parsing to detect when quota options can be changed We should not allow user to change quota mount options when quota is just suspended. It would make mount options and internal quota state inconsistent. Also we should not allow user to change quota format when quota is turned on. On the other hand we can just silently ignore when some option is set to the value it already has (some mount versions do this on remount). Finally, we should not discard current quota options if parsing of mount options fails. 
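The shape of the fix is to parse into local copies first and only commit them on success; condensed from the remount hunks below (a sketch, not the literal code):

	char *qf_names[MAXQUOTAS];
	unsigned int qfmt = 0;

	/* start from the quota options currently in effect */
	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));

	if (!reiserfs_parse_options(s, arg, &mount_options, &blocks, NULL,
				    &commit_max_age, qf_names, &qfmt)) {
		/* parsing failed: free only the copies we allocated and keep
		 * the quota options that are currently active */
		for (i = 0; i < MAXQUOTAS; i++)
			if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
				kfree(qf_names[i]);
		err = -EINVAL;
		goto out_err;
	}
	/* only a successful parse installs the new names and format */
	handle_quota_files(s, qf_names, &qfmt);

Inside the parser itself, a name or format that merely restates the current value is accepted silently, while a real change is rejected with "cannot change journaled quota options when quota turned on" whenever that quota type is enabled or suspended.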
Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 83 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f723604c5d9..a10a6d2a887 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -876,7 +876,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin mount options were selected. */ unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ char **jdev_name, - unsigned int *commit_max_age) + unsigned int *commit_max_age, + char **qf_names, + unsigned int *qfmt) { int c; char *arg = NULL; @@ -992,7 +994,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (c == 'u' || c == 'g') { int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; - if (sb_any_quota_enabled(s)) { + if ((sb_any_quota_enabled(s) || + sb_any_quota_suspended(s)) && + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { reiserfs_warning(s, "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); return 0; @@ -1011,30 +1015,39 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "reiserfs_parse_options: quotafile must be on filesystem root."); return 0; } - REISERFS_SB(s)->s_qf_names[qtype] = + qf_names[qtype] = kmalloc(strlen(arg) + 1, GFP_KERNEL); - if (!REISERFS_SB(s)->s_qf_names[qtype]) { + if (!qf_names[qtype]) { reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name."); return 0; } - strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); + strcpy(qf_names[qtype], arg); *mount_options |= 1 << REISERFS_QUOTA; } else { - kfree(REISERFS_SB(s)->s_qf_names[qtype]); - REISERFS_SB(s)->s_qf_names[qtype] = NULL; + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; } } if (c == 'f') { if (!strcmp(arg, "vfsold")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; + *qfmt = QFMT_VFS_OLD; else if (!strcmp(arg, "vfsv0")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; + *qfmt = QFMT_VFS_V0; else { reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified."); return 0; } + if ((sb_any_quota_enabled(s) || + sb_any_quota_suspended(s)) && + *qfmt != REISERFS_SB(s)->s_jquota_fmt) { + reiserfs_warning(s, + "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); + return 0; + } } #else if (c == 'u' || c == 'g' || c == 'f') { @@ -1046,9 +1059,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin } #ifdef CONFIG_QUOTA - if (!REISERFS_SB(s)->s_jquota_fmt - && (REISERFS_SB(s)->s_qf_names[USRQUOTA] - || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { reiserfs_warning(s, "reiserfs_parse_options: journaled quota format not specified."); return 0; @@ -1130,6 +1142,21 @@ static void handle_attrs(struct super_block *s) } } +#ifdef CONFIG_QUOTA +static void handle_quota_files(struct super_block *s, char **qf_names, + unsigned int *qfmt) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; + } + REISERFS_SB(s)->s_jquota_fmt = *qfmt; +} +#endif + static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) { struct reiserfs_super_block *rs; 
@@ -1141,23 +1168,30 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) struct reiserfs_journal *journal = SB_JOURNAL(s); char *new_opts = kstrdup(arg, GFP_KERNEL); int err; + char *qf_names[MAXQUOTAS]; + unsigned int qfmt = 0; #ifdef CONFIG_QUOTA int i; + + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif rs = SB_DISK_SUPER_BLOCK(s); if (!reiserfs_parse_options - (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) { + (s, arg, &mount_options, &blocks, NULL, &commit_max_age, + qf_names, &qfmt)) { #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) { - kfree(REISERFS_SB(s)->s_qf_names[i]); - REISERFS_SB(s)->s_qf_names[i] = NULL; - } + for (i = 0; i < MAXQUOTAS; i++) + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(qf_names[i]); #endif err = -EINVAL; goto out_err; } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif handle_attrs(s); @@ -1570,6 +1604,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) char *jdev_name; struct reiserfs_sb_info *sbi; int errval = -EINVAL; + char *qf_names[MAXQUOTAS] = {}; + unsigned int qfmt = 0; save_mount_options(s, data); @@ -1597,9 +1633,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, - &commit_max_age) == 0) { + &commit_max_age, qf_names, &qfmt) == 0) { goto error; } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif if (blocks) { SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " @@ -1819,7 +1858,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); - error: +error: if (jinit_done) { /* kill the commit thread, free journal ram */ journal_release_error(NULL, s); } @@ -1830,10 +1869,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) #ifdef CONFIG_QUOTA { int j; - for (j = 0; j < MAXQUOTAS; j++) { - kfree(sbi->s_qf_names[j]); - sbi->s_qf_names[j] = NULL; - } + for (j = 0; j < MAXQUOTAS; j++) + kfree(qf_names[j]); } #endif kfree(sbi); -- cgit v1.2.3-70-g09d2 From f68215c4640a38d66429014e524a627bf572d26a Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 25 Jul 2008 01:46:38 -0700 Subject: reiserfs: convert j_lock to mutex j_lock is a semaphore but uses it as if it were a mutex. This patch converts it to a mutex. 
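The conversion is mechanical: a struct semaphore initialised to 1 and taken with down()/up() maps one-for-one onto the mutex API, and the mutex additionally gets lockdep and debug checking. Schematically (a before/after sketch using this patch's names, not extra code in the patch):

	/* before: a semaphore used purely for mutual exclusion */
	struct semaphore j_lock;
	sema_init(&journal->j_lock, 1);
	down(&journal->j_lock);
	/* ...journal critical section... */
	up(&journal->j_lock);

	/* after: the dedicated mutex primitive from <linux/mutex.h> */
	struct mutex j_mutex;
	mutex_init(&journal->j_mutex);
	mutex_lock(&journal->j_mutex);
	/* ...journal critical section... */
	mutex_unlock(&journal->j_mutex);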
Signed-off-by: Jeff Mahoney Cc: Matthew Wilcox Cc: Chris Mason Cc: Edward Shishkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 6 +++--- include/linux/reiserfs_fs_sb.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e396b2fa474..0f7b1e807e6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -558,13 +558,13 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, static inline void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC(p_s_sb, journal.lock_journal); - down(&SB_JOURNAL(p_s_sb)->j_lock); + mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex); } /* unlock the current transaction */ static inline void unlock_journal(struct super_block *p_s_sb) { - up(&SB_JOURNAL(p_s_sb)->j_lock); + mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex); } static inline void get_journal_list(struct reiserfs_journal_list *jl) @@ -2837,7 +2837,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_last = NULL; journal->j_first = NULL; init_waitqueue_head(&(journal->j_join_wait)); - sema_init(&journal->j_lock, 1); + mutex_init(&journal->j_mutex); sema_init(&journal->j_flush_sem, 1); journal->j_trans_id = 10; diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 336ee43ed7d..49b639b88ba 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -193,7 +193,7 @@ struct reiserfs_journal { struct buffer_head *j_header_bh; time_t j_trans_start_time; /* time this transaction started */ - struct semaphore j_lock; + struct mutex j_mutex; struct semaphore j_flush_sem; wait_queue_head_t j_join_wait; /* wait for current transaction to finish before starting new one */ atomic_t j_jlock; /* lock for j_join_wait */ -- cgit v1.2.3-70-g09d2 From afe70259076fff0446001eaa1a287f615241a357 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 25 Jul 2008 01:46:39 -0700 Subject: reiserfs: convert j_flush_sem to mutex j_flush_sem is a semaphore but uses it as if it were a mutex. This patch converts it to a mutex. 
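One detail is not mechanical, and it is what the [akpm: fix mutex_trylock retval treatment] note that follows refers to: the trylock return conventions are inverted. down_trylock() returns 0 when it takes the semaphore, whereas mutex_trylock() returns 1 when it takes the mutex, so the negation disappears in the converted test. Roughly:

	if (flushall) {
		mutex_lock(&journal->j_flush_mutex);
	} else if (mutex_trylock(&journal->j_flush_mutex)) {
		/* was: if (!down_trylock(&journal->j_flush_sem)) BUG();
		 * either form fires when the lock was unexpectedly free even
		 * though the caller is supposed to already hold it */
		BUG();
	}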
[akpm@linux-foundation.org: fix mutex_trylock retval treatment] Signed-off-by: Jeff Mahoney Cc: Matthew Wilcox Cc: Chris Mason Cc: Edward Shishkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 14 +++++++------- include/linux/reiserfs_fs_sb.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 0f7b1e807e6..3cb4a562030 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1411,8 +1411,8 @@ static int flush_journal_list(struct super_block *s, /* if flushall == 0, the lock is already held */ if (flushall) { - down(&journal->j_flush_sem); - } else if (!down_trylock(&journal->j_flush_sem)) { + mutex_lock(&journal->j_flush_mutex); + } else if (mutex_trylock(&journal->j_flush_mutex)) { BUG(); } @@ -1642,7 +1642,7 @@ static int flush_journal_list(struct super_block *s, jl->j_state = 0; put_journal_list(s, jl); if (flushall) - up(&journal->j_flush_sem); + mutex_unlock(&journal->j_flush_mutex); put_fs_excl(); return err; } @@ -1772,12 +1772,12 @@ static int kupdate_transactions(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); chunk.nr = 0; - down(&journal->j_flush_sem); + mutex_lock(&journal->j_flush_mutex); if (!journal_list_still_alive(s, orig_trans_id)) { goto done; } - /* we've got j_flush_sem held, nobody is going to delete any + /* we've got j_flush_mutex held, nobody is going to delete any * of these lists out from underneath us */ while ((num_trans && transactions_flushed < num_trans) || @@ -1812,7 +1812,7 @@ static int kupdate_transactions(struct super_block *s, } done: - up(&journal->j_flush_sem); + mutex_unlock(&journal->j_flush_mutex); return ret; } @@ -2838,7 +2838,7 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_first = NULL; init_waitqueue_head(&(journal->j_join_wait)); mutex_init(&journal->j_mutex); - sema_init(&journal->j_flush_sem, 1); + mutex_init(&journal->j_flush_mutex); journal->j_trans_id = 10; journal->j_mount_id = 10; diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 49b639b88ba..c0751724ee6 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -194,7 +194,7 @@ struct reiserfs_journal { time_t j_trans_start_time; /* time this transaction started */ struct mutex j_mutex; - struct semaphore j_flush_sem; + struct mutex j_flush_mutex; wait_queue_head_t j_join_wait; /* wait for current transaction to finish before starting new one */ atomic_t j_jlock; /* lock for j_join_wait */ int j_list_bitmap_index; /* number of next list bitmap to use */ -- cgit v1.2.3-70-g09d2 From 90415deac75a761a25239af6f56381546f8d2201 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 25 Jul 2008 01:46:40 -0700 Subject: reiserfs: convert j_commit_lock to mutex j_commit_lock is a semaphore but uses it as if it were a mutex. This patch converts it to a mutex. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jeff Mahoney Cc: Matthew Wilcox Cc: Chris Mason Cc: Edward Shishkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 22 ++++++++++------------ include/linux/reiserfs_fs_sb.h | 2 +- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 3cb4a562030..c8f60ee183b 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -34,15 +34,10 @@ ** from within kupdate, it will ignore the immediate flag */ -#include -#include - #include #include - #include #include - #include #include #include @@ -54,6 +49,9 @@ #include #include #include +#include + +#include /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ @@ -1045,9 +1043,9 @@ static int flush_commit_list(struct super_block *s, } /* make sure nobody is trying to flush this one at the same time */ - down(&jl->j_commit_lock); + mutex_lock(&jl->j_commit_mutex); if (!journal_list_still_alive(s, trans_id)) { - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); goto put_jl; } BUG_ON(jl->j_trans_id == 0); @@ -1057,7 +1055,7 @@ static int flush_commit_list(struct super_block *s, if (flushall) { atomic_set(&(jl->j_older_commits_done), 1); } - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); goto put_jl; } @@ -1181,7 +1179,7 @@ static int flush_commit_list(struct super_block *s, if (flushall) { atomic_set(&(jl->j_older_commits_done), 1); } - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); put_jl: put_journal_list(s, jl); @@ -2556,7 +2554,7 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); INIT_LIST_HEAD(&jl->j_bh_list); - sema_init(&jl->j_commit_lock, 1); + mutex_init(&jl->j_commit_mutex); SB_JOURNAL(s)->j_num_lists++; get_journal_list(jl); return jl; @@ -4030,7 +4028,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * the new transaction is fully setup, and we've already flushed the * ordered bh list */ - down(&jl->j_commit_lock); + mutex_lock(&jl->j_commit_mutex); /* save the transaction id in case we need to commit it later */ commit_trans_id = jl->j_trans_id; @@ -4196,7 +4194,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, lock_kernel(); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); /* honor the flush wishes from the caller, simple commits can ** be done outside the journal lock, they are done below diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index c0751724ee6..315517e8bfa 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -152,7 +152,7 @@ struct reiserfs_journal_list { atomic_t j_nonzerolen; atomic_t j_commit_left; atomic_t j_older_commits_done; /* all commits older than this on disk */ - struct semaphore j_commit_lock; + struct mutex j_commit_mutex; unsigned long j_trans_id; time_t j_timestamp; struct reiserfs_list_bitmap *j_list_bitmap; -- cgit v1.2.3-70-g09d2 From 3264d4ded4d916d294d776b77b72d477c63ac3be Mon Sep 17 00:00:00 2001 From: Shen Feng Date: Fri, 25 Jul 2008 01:46:41 -0700 Subject: reiserfs: remove double definitions of xattr macros remove the definitions of macros: XATTR_SECURITY_PREFIX XATTR_TRUSTED_PREFIX XATTR_USER_PREFIX since they are defined in 
linux/xattr.h Signed-off-by: Shen Feng Signed-off-by: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr_security.c | 2 -- fs/reiserfs/xattr_trusted.c | 2 -- fs/reiserfs/xattr_user.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 5e90a95ad60..056008db137 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -6,8 +6,6 @@ #include #include -#define XATTR_SECURITY_PREFIX "security." - static int security_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 024a938ca60..60abe2bb1f9 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -7,8 +7,6 @@ #include #include -#define XATTR_TRUSTED_PREFIX "trusted." - static int trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 073f39364b1..1384efcb938 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -10,8 +10,6 @@ # include #endif -#define XATTR_USER_PREFIX "user." - static int user_get(struct inode *inode, const char *name, void *buffer, size_t size) { -- cgit v1.2.3-70-g09d2 From 8d44d9741f6808c107a144f469fb89e6fe7c55e3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 25 Jul 2008 01:46:41 -0700 Subject: fat: fix parse_options() Current parse_options() exits too early. We need to run the code of bottom in this function even if users doesn't specify options. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 46a4508ffd2..60deb5fd118 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -950,7 +950,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, *debug = 0; if (!options) - return 0; + goto out; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1104,10 +1104,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, return -EINVAL; } } + +out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" - " for FAT filesystems, filesystem will be case sensitive!\n"); + " for FAT filesystems, filesystem will be " + "case sensitive!\n"); } /* If user doesn't specify allow_utime, it's initialized from dmask. */ -- cgit v1.2.3-70-g09d2 From 531f710f8e68fd2bad7516a090bff372f5f9cf6d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:46:43 -0700 Subject: fat/dir.c: switch to struct __fat_dirent struct __fat_dirent is what was formerly the kernel struct dirent (that was different from the userspace struct dirent). Converting all fat users to struct __fat_dirent will allow us to get rid of the conflicting struct dirent definition. 
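The conflicting definition matters chiefly for the VFAT readdir ioctls, which copy a pair of these structures back to userspace. A hedged userspace sketch follows; it assumes struct __fat_dirent is exported via <linux/msdos_fs.h> with the old in-kernel dirent layout, and which array slot carries the 8.3 alias versus the long name is likewise an assumption, not taken from this patch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/msdos_fs.h>

	int main(void)
	{
		/* slot [0] is assumed to hold the short (8.3) name,
		 * slot [1] the long name of one directory entry */
		struct __fat_dirent de[2];
		int fd = open("/mnt/vfat", O_RDONLY);

		if (fd < 0)
			return 1;
		/* the ioctl is assumed to return > 0 while entries remain */
		while (ioctl(fd, VFAT_IOCTL_READDIR_BOTH, de) > 0)
			printf("%s | %s\n", de[0].d_name, de[1].d_name);
		close(fd);
		return 0;
	}
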
Signed-off-by: Adrian Bunk Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 34541d06e62..b57c4b1db63 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -715,7 +714,7 @@ efault: \ return -EFAULT; \ } -FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, dirent) +FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) static int fat_ioctl_readdir(struct inode *inode, struct file *filp, void __user *dirent, filldir_t filldir, @@ -741,7 +740,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, static int fat_dir_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - struct dirent __user *d1 = (struct dirent __user *)arg; + struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; switch (cmd) { @@ -757,7 +756,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp, return fat_generic_ioctl(inode, filp, cmd, arg); } - if (!access_ok(VERIFY_WRITE, d1, sizeof(struct dirent[2]))) + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old -- cgit v1.2.3-70-g09d2 From d688611674cc9c265ee67e89d2ea8bf060c17e8d Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 25 Jul 2008 01:46:43 -0700 Subject: fat: cleanup fs/fat/dir.c This is no logic changes, just cleans fs/fat/dir.c up. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 131 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 67 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index b57c4b1db63..a4410740627 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -123,10 +123,11 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, * but ignore that right now. * Ahem... Stack smashing in ring 0 isn't fun. Fixed. 
*/ -static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, +static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, int uni_xlate, struct nls_table *nls) { - wchar_t *ip, ec; + const wchar_t *ip; + wchar_t ec; unsigned char *op, nc; int charlen; int k; @@ -166,6 +167,16 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, return (op - ascii); } +static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, + unsigned char *buf, int size) +{ + if (sbi->options.utf8) + return utf8_wcstombs(buf, uni, size); + else + return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, + sbi->nls_io); +} + static inline int fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) { @@ -226,6 +237,19 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, return len; } +static inline int fat_name_match(struct msdos_sb_info *sbi, + const unsigned char *a, int a_len, + const unsigned char *b, int b_len) +{ + if (a_len != b_len) + return 0; + + if (sbi->options.name_check != 's') + return !nls_strnicmp(sbi->nls_io, a, b, a_len); + else + return !memcmp(a, b, a_len); +} + enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; /** @@ -311,29 +335,24 @@ int fat_search_long(struct inode *inode, const unsigned char *name, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh = NULL; struct msdos_dir_entry *de; - struct nls_table *nls_io = sbi->nls_io; struct nls_table *nls_disk = sbi->nls_disk; - wchar_t bufuname[14]; unsigned char nr_slots; - int xlate_len; + wchar_t bufuname[14]; wchar_t *unicode = NULL; unsigned char work[MSDOS_NAME]; unsigned char *bufname = NULL; - int uni_xlate = sbi->options.unicode_xlate; - int utf8 = sbi->options.utf8; - int anycase = (sbi->options.name_check != 's'); unsigned short opt_shortname = sbi->options.shortname; loff_t cpos = 0; - int chl, i, j, last_u, err; + int chl, i, j, last_u, err, len; bufname = __getname(); if (!bufname) return -ENOMEM; err = -ENOENT; - while(1) { + while (1) { if (fat_get_entry(inode, &cpos, &bh, &de) == -1) - goto EODir; + goto end_of_dir; parse_record: nr_slots = 0; if (de->name[0] == DELETED_FLAG) @@ -352,7 +371,7 @@ parse_record: else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) - goto EODir; + goto end_of_dir; } memcpy(work, de->name, sizeof(de->name)); @@ -393,30 +412,21 @@ parse_record: if (!last_u) continue; + /* Compare shortname */ bufuname[last_u] = 0x0000; - xlate_len = utf8 - ?utf8_wcstombs(bufname, bufuname, PATH_MAX) - :uni16_to_x8(bufname, bufuname, PATH_MAX, uni_xlate, nls_io); - if (xlate_len == name_len) - if ((!anycase && !memcmp(name, bufname, xlate_len)) || - (anycase && !nls_strnicmp(nls_io, name, bufname, - xlate_len))) - goto Found; + len = fat_uni_to_x8(sbi, bufuname, bufname, PATH_MAX); + if (fat_name_match(sbi, name, name_len, bufname, len)) + goto found; if (nr_slots) { - xlate_len = utf8 - ?utf8_wcstombs(bufname, unicode, PATH_MAX) - :uni16_to_x8(bufname, unicode, PATH_MAX, uni_xlate, nls_io); - if (xlate_len != name_len) - continue; - if ((!anycase && !memcmp(name, bufname, xlate_len)) || - (anycase && !nls_strnicmp(nls_io, name, bufname, - xlate_len))) - goto Found; + /* Compare longname */ + len = fat_uni_to_x8(sbi, unicode, bufname, PATH_MAX); + if (fat_name_match(sbi, name, name_len, bufname, len)) + goto found; } } -Found: +found: nr_slots++; /* include the de */ sinfo->slot_off = cpos - nr_slots * sizeof(*de); sinfo->nr_slots = nr_slots; @@ -424,7 
+434,7 @@ Found: sinfo->bh = bh; sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); err = 0; -EODir: +end_of_dir: if (bufname) __putname(bufname); if (unicode) @@ -452,23 +462,19 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *de; - struct nls_table *nls_io = sbi->nls_io; struct nls_table *nls_disk = sbi->nls_disk; - unsigned char long_slots; - const char *fill_name; - int fill_len; + unsigned char nr_slots; wchar_t bufuname[14]; wchar_t *unicode = NULL; unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname; - unsigned long lpos, dummy, *furrfu = &lpos; - int uni_xlate = sbi->options.unicode_xlate; + unsigned short opt_shortname = sbi->options.shortname; int isvfat = sbi->options.isvfat; - int utf8 = sbi->options.utf8; int nocase = sbi->options.nocase; - unsigned short opt_shortname = sbi->options.shortname; + const char *fill_name; unsigned long inum; - int chi, chl, i, i2, j, last, last_u, dotoffset = 0; + unsigned long lpos, dummy, *furrfu = &lpos; loff_t cpos; + int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len; int ret = 0; lock_super(sb); @@ -488,43 +494,43 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, cpos = 0; } } - if (cpos & (sizeof(struct msdos_dir_entry)-1)) { + if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { ret = -ENOENT; goto out; } bh = NULL; -GetNew: +get_new: if (fat_get_entry(inode, &cpos, &bh, &de) == -1) - goto EODir; + goto end_of_dir; parse_record: - long_slots = 0; + nr_slots = 0; /* Check for long filename entry */ if (isvfat) { if (de->name[0] == DELETED_FLAG) - goto RecEnd; + goto record_end; if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) - goto RecEnd; + goto record_end; if (de->attr != ATTR_EXT && IS_FREE(de->name)) - goto RecEnd; + goto record_end; } else { if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) - goto RecEnd; + goto record_end; } if (isvfat && de->attr == ATTR_EXT) { int status = fat_parse_long(inode, &cpos, &bh, &de, - &unicode, &long_slots); + &unicode, &nr_slots); if (status < 0) { filp->f_pos = cpos; ret = status; goto out; } else if (status == PARSE_INVALID) - goto RecEnd; + goto record_end; else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) - goto EODir; + goto end_of_dir; } if (sbi->options.dotsOK) { @@ -586,12 +592,12 @@ parse_record: } } if (!last) - goto RecEnd; + goto record_end; i = last + dotoffset; j = last_u; - lpos = cpos - (long_slots+1)*sizeof(struct msdos_dir_entry); + lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) inum = inode->i_ino; else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { @@ -608,20 +614,17 @@ parse_record: if (isvfat) { bufuname[j] = 0x0000; - i = utf8 ? utf8_wcstombs(bufname, bufuname, sizeof(bufname)) - : uni16_to_x8(bufname, bufuname, sizeof(bufname), uni_xlate, nls_io); + i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); } fill_name = bufname; fill_len = i; - if (!short_only && long_slots) { + if (!short_only && nr_slots) { /* convert the unicode long name. 261 is maximum size * of unicode buffer. (13 * slots + nul) */ void *longname = unicode + 261; int buf_size = PATH_MAX - (261 * sizeof(unicode[0])); - int long_len = utf8 - ? 
utf8_wcstombs(longname, unicode, buf_size) - : uni16_to_x8(longname, unicode, buf_size, uni_xlate, nls_io); + int long_len = fat_uni_to_x8(sbi, unicode, longname, buf_size); if (!both) { fill_name = longname; @@ -640,15 +643,15 @@ parse_record: } if (filldir(dirent, fill_name, fill_len, *furrfu, inum, (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) - goto FillFailed; + goto fill_failed; -RecEnd: +record_end: furrfu = &lpos; filp->f_pos = cpos; - goto GetNew; -EODir: + goto get_new; +end_of_dir: filp->f_pos = cpos; -FillFailed: +fill_failed: brelse(bh); if (unicode) __putname(unicode); -- cgit v1.2.3-70-g09d2 From 98a15160049fc1a0f822047f33ff513906a35567 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 25 Jul 2008 01:46:44 -0700 Subject: fat: use same logic in fat_search_long() and __fat_readdir() This uses uses stack for shortname, and uses __getname() for longname in fat_search_long() and __fat_readdir(). By this, it removes unneeded __getname() for shortname. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index a4410740627..96a1cad30da 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -324,6 +324,19 @@ parse_long: return 0; } +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + /* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. 
@@ -340,15 +353,11 @@ int fat_search_long(struct inode *inode, const unsigned char *name, wchar_t bufuname[14]; wchar_t *unicode = NULL; unsigned char work[MSDOS_NAME]; - unsigned char *bufname = NULL; + unsigned char bufname[FAT_MAX_SHORT_SIZE]; unsigned short opt_shortname = sbi->options.shortname; loff_t cpos = 0; int chl, i, j, last_u, err, len; - bufname = __getname(); - if (!bufname) - return -ENOMEM; - err = -ENOENT; while (1) { if (fat_get_entry(inode, &cpos, &bh, &de) == -1) @@ -414,14 +423,17 @@ parse_record: /* Compare shortname */ bufuname[last_u] = 0x0000; - len = fat_uni_to_x8(sbi, bufuname, bufname, PATH_MAX); + len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); if (fat_name_match(sbi, name, name_len, bufname, len)) goto found; if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + /* Compare longname */ - len = fat_uni_to_x8(sbi, unicode, bufname, PATH_MAX); - if (fat_name_match(sbi, name, name_len, bufname, len)) + len = fat_uni_to_x8(sbi, unicode, longname, size); + if (fat_name_match(sbi, name, name_len, longname, len)) goto found; } } @@ -435,8 +447,6 @@ found: sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); err = 0; end_of_dir: - if (bufname) - __putname(bufname); if (unicode) __putname(unicode); @@ -466,7 +476,8 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, unsigned char nr_slots; wchar_t bufuname[14]; wchar_t *unicode = NULL; - unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname; + unsigned char c, work[MSDOS_NAME]; + unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname; unsigned short opt_shortname = sbi->options.shortname; int isvfat = sbi->options.isvfat; int nocase = sbi->options.nocase; @@ -620,11 +631,10 @@ parse_record: fill_name = bufname; fill_len = i; if (!short_only && nr_slots) { - /* convert the unicode long name. 261 is maximum size - * of unicode buffer. (13 * slots + nul) */ - void *longname = unicode + 261; - int buf_size = PATH_MAX - (261 * sizeof(unicode[0])); - int long_len = fat_uni_to_x8(sbi, unicode, longname, buf_size); + void *longname = unicode + FAT_MAX_UNI_CHARS; + int long_len, size = PATH_MAX - FAT_MAX_UNI_SIZE; + + long_len = fat_uni_to_x8(sbi, unicode, longname, size); if (!both) { fill_name = longname; -- cgit v1.2.3-70-g09d2 From dcd8c53f13f068ee039589d84fbd0baf686abc41 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 25 Jul 2008 01:46:44 -0700 Subject: fat: small optimization to __fat_readdir() This removes unnecessary parsing for directory entries. If short_only, we don't need to parse longname. And if !both and it found the longname, we don't need shortname. 
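In sketch form, the readdir fast paths introduced here look roughly like the following; the helper names (parse_long_name(), build_short_name(), report_both_to_ioctl_callback()) are shorthand for code that is inlined in the real function, not actual helpers:

	if (isvfat && !short_only && is_long_name_entry(de)) {
		parse_long_name(de);			/* fills the unicode buffer */
		if (have_long_name && !both)
			goto start_filldir;		/* long name alone is enough */
	}
	build_short_name(de);				/* 8.3 alias */
	if (have_long_name && both)
		report_both_to_ioctl_callback();	/* fat_ioctl_filldir() hack */
start_filldir:
	/* call filldir() with whatever name(s) were prepared above */
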
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 71 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 96a1cad30da..4c35477bc94 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -481,11 +481,11 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, unsigned short opt_shortname = sbi->options.shortname; int isvfat = sbi->options.isvfat; int nocase = sbi->options.nocase; - const char *fill_name; + const char *fill_name = NULL; unsigned long inum; unsigned long lpos, dummy, *furrfu = &lpos; loff_t cpos; - int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len; + int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0; int ret = 0; lock_super(sb); @@ -516,8 +516,11 @@ get_new: goto end_of_dir; parse_record: nr_slots = 0; - /* Check for long filename entry */ - if (isvfat) { + /* + * Check for long filename entry, but if short_only, we don't + * need to parse long filename. + */ + if (isvfat && !short_only) { if (de->name[0] == DELETED_FLAG) goto record_end; if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) @@ -542,6 +545,18 @@ parse_record: goto parse_record; else if (status == PARSE_EOF) goto end_of_dir; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + int len = fat_uni_to_x8(sbi, unicode, longname, size); + + fill_name = longname; + fill_len = len; + /* !both && !short_only, so we don't need shortname. */ + if (!both) + goto start_filldir; + } } if (sbi->options.dotsOK) { @@ -608,6 +623,26 @@ parse_record: i = last + dotoffset; j = last_u; + if (isvfat) { + bufuname[j] = 0x0000; + i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + } + if (nr_slots) { + /* hack for fat_ioctl_filldir() */ + struct fat_ioctl_filldir_callback *p = dirent; + + p->longname = fill_name; + p->long_len = fill_len; + p->shortname = bufname; + p->short_len = i; + fill_name = NULL; + fill_len = 0; + } else { + fill_name = bufname; + fill_len = i; + } + +start_filldir: lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) inum = inode->i_ino; @@ -623,34 +658,6 @@ parse_record: inum = iunique(sb, MSDOS_ROOT_INO); } - if (isvfat) { - bufuname[j] = 0x0000; - i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); - } - - fill_name = bufname; - fill_len = i; - if (!short_only && nr_slots) { - void *longname = unicode + FAT_MAX_UNI_CHARS; - int long_len, size = PATH_MAX - FAT_MAX_UNI_SIZE; - - long_len = fat_uni_to_x8(sbi, unicode, longname, size); - - if (!both) { - fill_name = longname; - fill_len = long_len; - } else { - /* hack for fat_ioctl_filldir() */ - struct fat_ioctl_filldir_callback *p = dirent; - - p->longname = longname; - p->long_len = long_len; - p->shortname = bufname; - p->short_len = i; - fill_name = NULL; - fill_len = 0; - } - } if (filldir(dirent, fill_name, fill_len, *furrfu, inum, (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) goto fill_failed; -- cgit v1.2.3-70-g09d2 From 7557bc66be629d19a402e752673708bfbb8b5e86 Mon Sep 17 00:00:00 2001 From: Rene Scharfe Date: Fri, 25 Jul 2008 01:46:45 -0700 Subject: msdos fs: remove unsettable atari option It has been impossible to set the option 'atari' of the MSDOS filesystem for several years. Since nobody seems to have missed it, let's remove its remains. 
Signed-off-by: Rene Scharfe Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/msdos/namei.c | 18 ++++++------------ include/linux/msdos_fs.h | 1 - 2 files changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 1f7f2956412..e4ad6c6b753 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -14,12 +14,7 @@ /* Characters that are undesirable in an MS-DOS file name */ static unsigned char bad_chars[] = "*?<>|\""; -static unsigned char bad_if_strict_pc[] = "+=,; "; -/* GEMDOS is less restrictive */ -static unsigned char bad_if_strict_atari[] = " "; - -#define bad_if_strict(opts) \ - ((opts)->atari ? bad_if_strict_atari : bad_if_strict_pc) +static unsigned char bad_if_strict[] = "+=,; "; /***** Formats an MS-DOS file name. Rejects invalid names. */ static int msdos_format_name(const unsigned char *name, int len, @@ -40,21 +35,20 @@ static int msdos_format_name(const unsigned char *name, int len, /* Get rid of dot - test for it elsewhere */ name++; len--; - } else if (!opts->atari) + } else return -EINVAL; } /* - * disallow names that _really_ start with a dot for MS-DOS, - * GEMDOS does not care + * disallow names that _really_ start with a dot */ - space = !opts->atari; + space = 1; c = 0; for (walk = res; len && walk - res < 8; walk++) { c = *name++; len--; if (opts->name_check != 'r' && strchr(bad_chars, c)) return -EINVAL; - if (opts->name_check == 's' && strchr(bad_if_strict(opts), c)) + if (opts->name_check == 's' && strchr(bad_if_strict, c)) return -EINVAL; if (c >= 'A' && c <= 'Z' && opts->name_check == 's') return -EINVAL; @@ -94,7 +88,7 @@ static int msdos_format_name(const unsigned char *name, int len, if (opts->name_check != 'r' && strchr(bad_chars, c)) return -EINVAL; if (opts->name_check == 's' && - strchr(bad_if_strict(opts), c)) + strchr(bad_if_strict, c)) return -EINVAL; if (c < ' ' || c == ':' || c == '\\') return -EINVAL; diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index 5161394c789..3346c9c8f17 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -201,7 +201,6 @@ struct fat_mount_options { utf8:1, /* Use of UTF-8 character set (Default) */ unicode_xlate:1, /* create escape sequences for unhandled Unicode */ numtail:1, /* Does first alias have a numeric '~1' type tail? */ - atari:1, /* Use Atari GEMDOS variation of MS-DOS fs */ flush:1, /* write things quickly */ nocase:1, /* Does this need case conversion? 0=need case conversion*/ usefree:1; /* Use free_clusters for FAT32 */ -- cgit v1.2.3-70-g09d2 From e8938a62a85d1f487e02c3b01955b47c9598f6d2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:46:46 -0700 Subject: remove unused #include 's Remove some unused #include 's. 
Signed-off-by: Adrian Bunk Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/linux32.c | 1 - fs/compat_ioctl.c | 1 - fs/smbfs/cache.c | 1 - fs/smbfs/proc.c | 1 - include/linux/nfsd/nfsd.h | 1 - 5 files changed, 5 deletions(-) (limited to 'fs') diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index c266211ed65..2fefb14414b 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 18e2c548161..5235c67e759 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index 8182f0542a2..8c177eb7e34 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index d517a27b7f4..ee536e8a649 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index a2861d95ecc..108f47e5fd9 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -12,7 +12,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From b271e067c896ad4082b15e96077675d08db40625 Mon Sep 17 00:00:00 2001 From: Joe Peterson Date: Fri, 25 Jul 2008 01:46:47 -0700 Subject: fatfs: add UTC timestamp option Provide a new mount option ("tz=UTC") for DOS (vfat/msdos) filesystems, allowing timestamps to be in coordinated universal time (UTC) rather than local time in applications where doing this is advantageous. In particular, portable devices that use fat/vfat (such as digital cameras) can benefit from using UTC in their internal clocks, thus avoiding daylight saving time errors and general time ambiguity issues. The user of the device does not have to worry about changing the time when moving from place or when daylight saving changes. The new mount option, when set, disables the counter-adjustment that Linux currently makes to FAT timestamp info in anticipation of the normal userspace time zone correction. When used in this new mode, all daylight saving time and time zone handling is done in userspace as is normal for many other filesystems (like ext3). The default mode, which remains unchanged, is still appropriate when mounting volumes written in Windows (because of its use of local time). I originally based this patch on one submitted last year by Paul Collins, but I updated it to work with current source and changed variable/option naming. Ogawa Hirofumi (who maintains these filesystems) and I discussed this patch at length on lkml, and he suggested using the option name in the attached version of the patch. Barry Bouwsma pointed out a good addition to the patch as well. 
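Usage is a single mount option; for example, a camera card might be mounted as below so that its on-disk FAT timestamps are interpreted as UTC and all timezone/DST handling is left to userspace (device and mount point are placeholders):

	mount -t vfat -o tz=UTC /dev/sdb1 /mnt/camera
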
Signed-off-by: Joe Peterson Signed-off-by: Paul Collins Acked-by: OGAWA Hirofumi Cc: Barry Bouwsma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 2 +- fs/fat/inode.c | 27 ++++++++++++++++++++------- fs/fat/misc.c | 10 ++++++---- fs/msdos/namei.c | 3 ++- fs/vfat/namei.c | 2 +- include/linux/msdos_fs.h | 8 +++++--- 6 files changed, 35 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4c35477bc94..cd4a0162e10 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1101,7 +1101,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) goto error_free; } - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de = (struct msdos_dir_entry *)bhs[0]->b_data; /* filling the new directory slots ("." and ".." entries) */ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 60deb5fd118..23676f9d79c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -382,17 +382,20 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; inode->i_mtime.tv_sec = - date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date)); + date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date), + sbi->options.tz_utc); inode->i_mtime.tv_nsec = 0; if (sbi->options.isvfat) { int secs = de->ctime_cs / 100; int csecs = de->ctime_cs % 100; inode->i_ctime.tv_sec = date_dos2unix(le16_to_cpu(de->ctime), - le16_to_cpu(de->cdate)) + secs; + le16_to_cpu(de->cdate), + sbi->options.tz_utc) + secs; inode->i_ctime.tv_nsec = csecs * 10000000; inode->i_atime.tv_sec = - date_dos2unix(0, le16_to_cpu(de->adate)); + date_dos2unix(0, le16_to_cpu(de->adate), + sbi->options.tz_utc); inode->i_atime.tv_nsec = 0; } else inode->i_ctime = inode->i_atime = inode->i_mtime; @@ -591,11 +594,14 @@ retry: raw_entry->attr = fat_attr(inode); raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); - fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, &raw_entry->date); + fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, + &raw_entry->date, sbi->options.tz_utc); if (sbi->options.isvfat) { __le16 atime; - fat_date_unix2dos(inode->i_ctime.tv_sec,&raw_entry->ctime,&raw_entry->cdate); - fat_date_unix2dos(inode->i_atime.tv_sec,&atime,&raw_entry->adate); + fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime, + &raw_entry->cdate, sbi->options.tz_utc); + fat_date_unix2dos(inode->i_atime.tv_sec, &atime, + &raw_entry->adate, sbi->options.tz_utc); raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 + inode->i_ctime.tv_nsec / 10000000; } @@ -836,6 +842,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) } if (sbi->options.flush) seq_puts(m, ",flush"); + if (opts->tz_utc) + seq_puts(m, ",tz=UTC"); return 0; } @@ -848,7 +856,7 @@ enum { Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolate, Opt_flush, Opt_err, + Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, }; static match_table_t fat_tokens = { @@ -883,6 +891,7 @@ static match_table_t fat_tokens = { {Opt_obsolate, "cvf_options=%100s"}, {Opt_obsolate, "posix"}, {Opt_flush, "flush"}, + {Opt_tz_utc, "tz=UTC"}, {Opt_err, NULL}, }; static match_table_t msdos_tokens = { @@ -947,6 +956,7 @@ static int 
parse_options(char *options, int is_vfat, int silent, int *debug, opts->utf8 = opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; + opts->tz_utc = 0; *debug = 0; if (!options) @@ -1036,6 +1046,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, case Opt_flush: opts->flush = 1; break; + case Opt_tz_utc: + opts->tz_utc = 1; + break; /* msdos specific */ case Opt_dots: diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 61f23511eac..79fb98ad36d 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -142,7 +142,7 @@ static int day_n[] = { }; /* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ -int date_dos2unix(unsigned short time, unsigned short date) +int date_dos2unix(unsigned short time, unsigned short date, int tz_utc) { int month, year, secs; @@ -156,16 +156,18 @@ int date_dos2unix(unsigned short time, unsigned short date) ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 && month < 2 ? 1 : 0)+3653); /* days since 1.1.70 plus 80's leap day */ - secs += sys_tz.tz_minuteswest*60; + if (!tz_utc) + secs += sys_tz.tz_minuteswest*60; return secs; } /* Convert linear UNIX date to a MS-DOS time/date pair. */ -void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date) +void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc) { int day, year, nl_day, month; - unix_date -= sys_tz.tz_minuteswest*60; + if (!tz_utc) + unix_date -= sys_tz.tz_minuteswest*60; /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */ if (unix_date < 315532800) diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index e4ad6c6b753..e844b9809d2 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -237,6 +237,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, int is_dir, int is_hid, int cluster, struct timespec *ts, struct fat_slot_info *sinfo) { + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); struct msdos_dir_entry de; __le16 time, date; int err; @@ -246,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (is_hid) de.attr |= ATTR_HIDDEN; de.lcase = 0; - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de.cdate = de.adate = 0; de.ctime = 0; de.ctime_cs = 0; diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index b546ba69be8..155c10b4adb 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -621,7 +621,7 @@ shortname: memcpy(de->name, msdos_name, MSDOS_NAME); de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; de->lcase = lcase; - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de->time = de->ctime = time; de->date = de->cdate = de->adate = date; de->ctime_cs = 0; diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index 3346c9c8f17..ba63858056c 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -203,7 +203,8 @@ struct fat_mount_options { numtail:1, /* Does first alias have a numeric '~1' type tail? */ flush:1, /* write things quickly */ nocase:1, /* Does this need case conversion? 
0=need case conversion*/ - usefree:1; /* Use free_clusters for FAT32 */ + usefree:1, /* Use free_clusters for FAT32 */ + tz_utc:1; /* Filesystem timestamps are in UTC */ }; #define FAT_HASH_BITS 8 @@ -434,8 +435,9 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, extern void fat_fs_panic(struct super_block *s, const char *fmt, ...); extern void fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); -extern int date_dos2unix(unsigned short time, unsigned short date); -extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date); +extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc); +extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, + int tz_utc); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); -- cgit v1.2.3-70-g09d2 From b48d380541f634663b71766005838edbb7261685 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:49 -0700 Subject: quota: fix possible infinite loop in quota code When quota structure is going to be dropped and it is dirty, quota code tries to write it. If the write fails for some reason (e. g. transaction cannot be started because the journal is aborted), we try writing again and again and again... Fix the problem by clearing the dirty bit even if the write failed. (akpm: for 2.6.27, 2.6.26.x and 2.6.25.x) Signed-off-by: Jan Kara Reviewed-by: dingdinghua Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dquot.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dquot.c b/fs/dquot.c index 5ac77da1995..ad88cf6fcba 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -562,6 +562,8 @@ static struct shrinker dqcache_shrinker = { */ static void dqput(struct dquot *dquot) { + int ret; + if (!dquot) return; #ifdef __DQUOT_PARANOIA @@ -594,7 +596,19 @@ we_slept: if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ - dquot->dq_sb->dq_op->write_dquot(dquot); + ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + printk(KERN_ERR "VFS: cannot write quota structure on " + "device %s (error %d). Quota may get out of " + "sync!\n", dquot->dq_sb->s_id, ret); + /* + * We clear dirty bit anyway, so that we avoid + * infinite loop here + */ + spin_lock(&dq_list_lock); + clear_dquot_dirty(dquot); + spin_unlock(&dq_list_lock); + } goto we_slept; } /* Clear flag in case dquot was inactive (something bad happened) */ -- cgit v1.2.3-70-g09d2 From b85f4b87a511bea86dac68c4f0fabaee2cac6c4c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:50 -0700 Subject: quota: rename quota functions from upper case, make bigger ones non-inline Cleanup quotaops.h: Rename functions from uppercase to lowercase (and define backward compatibility macros), move larger functions to dquot.c and make them non-inline. 
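The renaming is mechanical; existing filesystems keep compiling through the compatibility macros added at the bottom of quotaops.h, so a call site can use either spelling for now. Illustrative only (the error-handling pattern is a typical caller, not code from this patch):

	DQUOT_INIT(inode);			/* old spelling... */
	vfs_dq_init(inode);			/* ...now simply expands to this */

	if (DQUOT_ALLOC_SPACE(inode, nr))	/* old */
		return -EDQUOT;
	if (vfs_dq_alloc_space(inode, nr))	/* new, equivalent */
		return -EDQUOT;
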
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dquot.c | 53 +++++++++++ include/linux/quotaops.h | 226 ++++++++++++++++++++++------------------------- 2 files changed, 160 insertions(+), 119 deletions(-) (limited to 'fs') diff --git a/fs/dquot.c b/fs/dquot.c index ad88cf6fcba..0bcaf970bbb 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -1153,6 +1153,28 @@ int dquot_drop(struct inode *inode) return 0; } +/* Wrapper to remove references to quota structures from inode */ +void vfs_dq_drop(struct inode *inode) +{ + /* Here we can get arbitrary inode from clear_inode() so we have + * to be careful. OTOH we don't need locking as quota operations + * are allowed to change only at mount time */ + if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op + && inode->i_sb->dq_op->drop) { + int cnt; + /* Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (inode->i_dquot[cnt] != NODQUOT) + break; + if (cnt < MAXQUOTAS) + inode->i_sb->dq_op->drop(inode); + } +} + /* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) @@ -1426,6 +1448,18 @@ warn_put_all: return ret; } +/* Wrapper for transferring ownership of an inode */ +int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +{ + if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { + vfs_dq_init(inode); + if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) + return 1; + } + return 0; +} + + /* * Write info of quota file to disk */ @@ -1766,6 +1800,22 @@ out: return error; } +/* Wrapper to turn on quotas when remounting rw */ +int vfs_dq_quota_on_remount(struct super_block *sb) +{ + int cnt; + int ret = 0, err; + + if (!sb->s_qcop || !sb->s_qcop->quota_on) + return -ENOSYS; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1); + if (err < 0 && !ret) + ret = err; + } + return ret; +} + /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) { @@ -2101,8 +2151,11 @@ EXPORT_SYMBOL(dquot_release); EXPORT_SYMBOL(dquot_mark_dquot_dirty); EXPORT_SYMBOL(dquot_initialize); EXPORT_SYMBOL(dquot_drop); +EXPORT_SYMBOL(vfs_dq_drop); EXPORT_SYMBOL(dquot_alloc_space); EXPORT_SYMBOL(dquot_alloc_inode); EXPORT_SYMBOL(dquot_free_space); EXPORT_SYMBOL(dquot_free_inode); EXPORT_SYMBOL(dquot_transfer); +EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(vfs_dq_quota_on_remount); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index f8670205385..0c8f9fe462a 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -19,34 +19,38 @@ /* * declaration of quota_function calls in kernel. 
*/ -extern void sync_dquots(struct super_block *sb, int type); - -extern int dquot_initialize(struct inode *inode, int type); -extern int dquot_drop(struct inode *inode); - -extern int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); -extern int dquot_alloc_inode(const struct inode *inode, unsigned long number); - -extern int dquot_free_space(struct inode *inode, qsize_t number); -extern int dquot_free_inode(const struct inode *inode, unsigned long number); - -extern int dquot_transfer(struct inode *inode, struct iattr *iattr); -extern int dquot_commit(struct dquot *dquot); -extern int dquot_acquire(struct dquot *dquot); -extern int dquot_release(struct dquot *dquot); -extern int dquot_commit_info(struct super_block *sb, int type); -extern int dquot_mark_dquot_dirty(struct dquot *dquot); - -extern int vfs_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount); -extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name, - int format_id, int type); -extern int vfs_quota_off(struct super_block *sb, int type, int remount); -extern int vfs_quota_sync(struct super_block *sb, int type); -extern int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); -extern int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); -extern int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); -extern int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); +void sync_dquots(struct super_block *sb, int type); + +int dquot_initialize(struct inode *inode, int type); +int dquot_drop(struct inode *inode); + +int dquot_alloc_space(struct inode *inode, qsize_t number, int prealloc); +int dquot_alloc_inode(const struct inode *inode, unsigned long number); + +int dquot_free_space(struct inode *inode, qsize_t number); +int dquot_free_inode(const struct inode *inode, unsigned long number); + +int dquot_transfer(struct inode *inode, struct iattr *iattr); +int dquot_commit(struct dquot *dquot); +int dquot_acquire(struct dquot *dquot); +int dquot_release(struct dquot *dquot); +int dquot_commit_info(struct super_block *sb, int type); +int dquot_mark_dquot_dirty(struct dquot *dquot); + +int vfs_quota_on(struct super_block *sb, int type, int format_id, + char *path, int remount); +int vfs_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type); +int vfs_quota_off(struct super_block *sb, int type, int remount); +int vfs_quota_sync(struct super_block *sb, int type); +int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); +int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); +int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); +int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di); + +void vfs_dq_drop(struct inode *inode); +int vfs_dq_transfer(struct inode *inode, struct iattr *iattr); +int vfs_dq_quota_on_remount(struct super_block *sb); /* * Operations supported for diskquotas. @@ -59,38 +63,16 @@ extern struct quotactl_ops vfs_quotactl_ops; /* It is better to call this function outside of any transaction as it might * need a lot of space in journal for dquot structure allocation. 
*/ -static inline void DQUOT_INIT(struct inode *inode) +static inline void vfs_dq_init(struct inode *inode) { BUG_ON(!inode->i_sb); if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) inode->i_sb->dq_op->initialize(inode, -1); } -/* The same as with DQUOT_INIT */ -static inline void DQUOT_DROP(struct inode *inode) -{ - /* Here we can get arbitrary inode from clear_inode() so we have - * to be careful. OTOH we don't need locking as quota operations - * are allowed to change only at mount time */ - if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op - && inode->i_sb->dq_op->drop) { - int cnt; - /* Test before calling to rule out calls from proc and such - * where we are not allowed to block. Note that this is - * actually reliable test even without the lock - the caller - * must assure that nobody can come after the DQUOT_DROP and - * add quota pointers back anyway */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt] != NODQUOT) - break; - if (cnt < MAXQUOTAS) - inode->i_sb->dq_op->drop(inode); - } -} - /* The following allocation/freeing/transfer functions *must* be called inside * a transaction (deadlocks possible otherwise) */ -static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) +static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) { if (sb_any_quota_enabled(inode->i_sb)) { /* Used space is updated in alloc_space() */ @@ -102,15 +84,15 @@ static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) return 0; } -static inline int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr) +static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) { int ret; - if (!(ret = DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr))) + if (!(ret = vfs_dq_prealloc_space_nodirty(inode, nr))) mark_inode_dirty(inode); return ret; } -static inline int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) +static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) { if (sb_any_quota_enabled(inode->i_sb)) { /* Used space is updated in alloc_space() */ @@ -122,25 +104,25 @@ static inline int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) return 0; } -static inline int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr) +static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) { int ret; - if (!(ret = DQUOT_ALLOC_SPACE_NODIRTY(inode, nr))) + if (!(ret = vfs_dq_alloc_space_nodirty(inode, nr))) mark_inode_dirty(inode); return ret; } -static inline int DQUOT_ALLOC_INODE(struct inode *inode) +static inline int vfs_dq_alloc_inode(struct inode *inode) { if (sb_any_quota_enabled(inode->i_sb)) { - DQUOT_INIT(inode); + vfs_dq_init(inode); if (inode->i_sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) return 1; } return 0; } -static inline void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) +static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) { if (sb_any_quota_enabled(inode->i_sb)) inode->i_sb->dq_op->free_space(inode, nr); @@ -148,35 +130,25 @@ static inline void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) inode_sub_bytes(inode, nr); } -static inline void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) +static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr) { - DQUOT_FREE_SPACE_NODIRTY(inode, nr); + vfs_dq_free_space_nodirty(inode, nr); mark_inode_dirty(inode); } -static inline void DQUOT_FREE_INODE(struct inode *inode) +static inline void vfs_dq_free_inode(struct inode *inode) { if 
(sb_any_quota_enabled(inode->i_sb)) inode->i_sb->dq_op->free_inode(inode, 1); } -static inline int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr) -{ - if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { - DQUOT_INIT(inode); - if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) - return 1; - } - return 0; -} - /* The following two functions cannot be called inside a transaction */ -static inline void DQUOT_SYNC(struct super_block *sb) +static inline void vfs_dq_sync(struct super_block *sb) { sync_dquots(sb, -1); } -static inline int DQUOT_OFF(struct super_block *sb, int remount) +static inline int vfs_dq_off(struct super_block *sb, int remount) { int ret = -ENOSYS; @@ -185,21 +157,6 @@ static inline int DQUOT_OFF(struct super_block *sb, int remount) return ret; } -static inline int DQUOT_ON_REMOUNT(struct super_block *sb) -{ - int cnt; - int ret = 0, err; - - if (!sb->s_qcop || !sb->s_qcop->quota_on) - return -ENOSYS; - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1); - if (err < 0 && !ret) - ret = err; - } - return ret; -} - #else /* @@ -208,113 +165,144 @@ static inline int DQUOT_ON_REMOUNT(struct super_block *sb) #define sb_dquot_ops (NULL) #define sb_quotactl_ops (NULL) -static inline void DQUOT_INIT(struct inode *inode) +static inline void vfs_dq_init(struct inode *inode) { } -static inline void DQUOT_DROP(struct inode *inode) +static inline void vfs_dq_drop(struct inode *inode) { } -static inline int DQUOT_ALLOC_INODE(struct inode *inode) +static inline int vfs_dq_alloc_inode(struct inode *inode) { return 0; } -static inline void DQUOT_FREE_INODE(struct inode *inode) +static inline void vfs_dq_free_inode(struct inode *inode) { } -static inline void DQUOT_SYNC(struct super_block *sb) +static inline void vfs_dq_sync(struct super_block *sb) { } -static inline int DQUOT_OFF(struct super_block *sb, int remount) +static inline int vfs_dq_off(struct super_block *sb, int remount) { return 0; } -static inline int DQUOT_ON_REMOUNT(struct super_block *sb) +static inline int vfs_dq_quota_on_remount(struct super_block *sb) { return 0; } -static inline int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr) +static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) { return 0; } -static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) +static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr) { inode_add_bytes(inode, nr); return 0; } -static inline int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr) +static inline int vfs_dq_prealloc_space(struct inode *inode, qsize_t nr) { - DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr); + vfs_dq_prealloc_space_nodirty(inode, nr); mark_inode_dirty(inode); return 0; } -static inline int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) +static inline int vfs_dq_alloc_space_nodirty(struct inode *inode, qsize_t nr) { inode_add_bytes(inode, nr); return 0; } -static inline int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr) +static inline int vfs_dq_alloc_space(struct inode *inode, qsize_t nr) { - DQUOT_ALLOC_SPACE_NODIRTY(inode, nr); + vfs_dq_alloc_space_nodirty(inode, nr); mark_inode_dirty(inode); return 0; } -static inline void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) +static inline void vfs_dq_free_space_nodirty(struct inode *inode, qsize_t nr) { inode_sub_bytes(inode, nr); } -static inline void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) +static inline void vfs_dq_free_space(struct inode 
*inode, qsize_t nr) { - DQUOT_FREE_SPACE_NODIRTY(inode, nr); + vfs_dq_free_space_nodirty(inode, nr); mark_inode_dirty(inode); } #endif /* CONFIG_QUOTA */ -static inline int DQUOT_PREALLOC_BLOCK_NODIRTY(struct inode *inode, qsize_t nr) +static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { - return DQUOT_PREALLOC_SPACE_NODIRTY(inode, + return vfs_dq_prealloc_space_nodirty(inode, nr << inode->i_sb->s_blocksize_bits); } -static inline int DQUOT_PREALLOC_BLOCK(struct inode *inode, qsize_t nr) +static inline int vfs_dq_prealloc_block(struct inode *inode, qsize_t nr) { - return DQUOT_PREALLOC_SPACE(inode, + return vfs_dq_prealloc_space(inode, nr << inode->i_sb->s_blocksize_bits); } -static inline int DQUOT_ALLOC_BLOCK_NODIRTY(struct inode *inode, qsize_t nr) +static inline int vfs_dq_alloc_block_nodirty(struct inode *inode, qsize_t nr) { - return DQUOT_ALLOC_SPACE_NODIRTY(inode, + return vfs_dq_alloc_space_nodirty(inode, nr << inode->i_sb->s_blocksize_bits); } -static inline int DQUOT_ALLOC_BLOCK(struct inode *inode, qsize_t nr) +static inline int vfs_dq_alloc_block(struct inode *inode, qsize_t nr) { - return DQUOT_ALLOC_SPACE(inode, + return vfs_dq_alloc_space(inode, nr << inode->i_sb->s_blocksize_bits); } -static inline void DQUOT_FREE_BLOCK_NODIRTY(struct inode *inode, qsize_t nr) +static inline void vfs_dq_free_block_nodirty(struct inode *inode, qsize_t nr) { - DQUOT_FREE_SPACE_NODIRTY(inode, nr << inode->i_sb->s_blocksize_bits); + vfs_dq_free_space_nodirty(inode, nr << inode->i_sb->s_blocksize_bits); } -static inline void DQUOT_FREE_BLOCK(struct inode *inode, qsize_t nr) +static inline void vfs_dq_free_block(struct inode *inode, qsize_t nr) { - DQUOT_FREE_SPACE(inode, nr << inode->i_sb->s_blocksize_bits); + vfs_dq_free_space(inode, nr << inode->i_sb->s_blocksize_bits); } +/* + * Define uppercase equivalents for compatibility with old function names + * Can go away when we think all users have been converted (15/04/2008) + */ +#define DQUOT_INIT(inode) vfs_dq_init(inode) +#define DQUOT_DROP(inode) vfs_dq_drop(inode) +#define DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr) \ + vfs_dq_prealloc_space_nodirty(inode, nr) +#define DQUOT_PREALLOC_SPACE(inode, nr) vfs_dq_prealloc_space(inode, nr) +#define DQUOT_ALLOC_SPACE_NODIRTY(inode, nr) \ + vfs_dq_alloc_space_nodirty(inode, nr) +#define DQUOT_ALLOC_SPACE(inode, nr) vfs_dq_alloc_space(inode, nr) +#define DQUOT_PREALLOC_BLOCK_NODIRTY(inode, nr) \ + vfs_dq_prealloc_block_nodirty(inode, nr) +#define DQUOT_PREALLOC_BLOCK(inode, nr) vfs_dq_prealloc_block(inode, nr) +#define DQUOT_ALLOC_BLOCK_NODIRTY(inode, nr) \ + vfs_dq_alloc_block_nodirty(inode, nr) +#define DQUOT_ALLOC_BLOCK(inode, nr) vfs_dq_alloc_block(inode, nr) +#define DQUOT_ALLOC_INODE(inode) vfs_dq_alloc_inode(inode) +#define DQUOT_FREE_SPACE_NODIRTY(inode, nr) \ + vfs_dq_free_space_nodirty(inode, nr) +#define DQUOT_FREE_SPACE(inode, nr) vfs_dq_free_space(inode, nr) +#define DQUOT_FREE_BLOCK_NODIRTY(inode, nr) \ + vfs_dq_free_block_nodirty(inode, nr) +#define DQUOT_FREE_BLOCK(inode, nr) vfs_dq_free_block(inode, nr) +#define DQUOT_FREE_INODE(inode) vfs_dq_free_inode(inode) +#define DQUOT_TRANSFER(inode, iattr) vfs_dq_transfer(inode, iattr) +#define DQUOT_SYNC(sb) vfs_dq_sync(sb) +#define DQUOT_OFF(sb, remount) vfs_dq_off(sb, remount) +#define DQUOT_ON_REMOUNT(sb) vfs_dq_quota_on_remount(sb) + #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3-70-g09d2 From 02a55ca87185e114e5d298a8d00608501dbabf67 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:50 -0700 
Subject: quota: cleanup loop in sync_dquots() Make loop in sync_dquots() checking whether there's something to write more readable, remove useless variable and macro info_any_dirty() which is used only in this place. Signed-off-by: Jan Kara Cc: "Vegard Nossum" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/quota.c | 18 ++++++++++++------ include/linux/quota.h | 2 -- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/quota.c b/fs/quota.c index db1cc9f3c7a..7f4386ebc23 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -186,7 +186,7 @@ static void quota_sync_sb(struct super_block *sb, int type) void sync_dquots(struct super_block *sb, int type) { - int cnt, dirty; + int cnt; if (sb) { if (sb->s_qcop->quota_sync) @@ -198,11 +198,17 @@ void sync_dquots(struct super_block *sb, int type) restart: list_for_each_entry(sb, &super_blocks, s_list) { /* This test just improves performance so it needn't be reliable... */ - for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) - if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_any_dirty(&sb_dqopt(sb)->info[cnt])) - dirty = 1; - if (!dirty) + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && type != cnt) + continue; + if (!sb_has_quota_enabled(sb, cnt)) + continue; + if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && + list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) + continue; + break; + } + if (cnt == MAXQUOTAS) continue; sb->s_count++; spin_unlock(&sb_lock); diff --git a/include/linux/quota.h b/include/linux/quota.h index dcddfb20094..6f1d97ddf82 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -224,8 +224,6 @@ struct super_block; extern void mark_info_dirty(struct super_block *sb, int type); #define info_dirty(info) test_bit(DQF_INFO_DIRTY_B, &(info)->dqi_flags) -#define info_any_dquot_dirty(info) (!list_empty(&(info)->dqi_dirty_list)) -#define info_any_dirty(info) (info_dirty(info) || info_any_dquot_dirty(info)) #define sb_dqopt(sb) (&(sb)->s_dquot) #define sb_dqinfo(sb, type) (sb_dqopt(sb)->info+(type)) -- cgit v1.2.3-70-g09d2 From 74abb9890dafb12a50dc140de215ed477beb1b88 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:51 -0700 Subject: quota: move function-macros from quota.h to quotaops.h Move declarations of some macros, which should be in fact functions to quotaops.h. This way they can be later converted to inline functions because we can now use declarations from quota.h. Also add necessary includes of quotaops.h to a few files. 
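To make the motivation concrete, a minimal sketch of the later conversion this move enables (not part of this patch) is turning a status macro such as sb_has_quota_enabled() into a type-checked inline, which only works once the needed quota declarations are visible where the helper lives:

	static inline int sb_has_quota_enabled(struct super_block *sb, int type)
	{
		if (type == USRQUOTA)
			return sb_dqopt(sb)->flags & DQUOT_USR_ENABLED;
		return sb_dqopt(sb)->flags & DQUOT_GRP_ENABLED;
	}
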
[akpm@linux-foundation.org: fix JFS build] [akpm@linux-foundation.org: fix UFS build] [vegard.nossum@gmail.com: fix QUOTA=n build] Signed-off-by: Jan Kara Cc: Vegard Nossum Cc: Arjen Pool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 1 + fs/jfs/super.c | 1 + fs/quota_v1.c | 1 + fs/quota_v2.c | 1 + fs/reiserfs/super.c | 1 + fs/ufs/super.c | 1 + include/linux/quota.h | 22 +++------------------- include/linux/quotaops.h | 26 ++++++++++++++++++++++++++ 8 files changed, 35 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ef50cbc792d..31308a3b0b8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "ext2.h" #include "xattr.h" diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0288e6d7936..359c091d896 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/quota_v1.c b/fs/quota_v1.c index a6cf9269105..5ae15b13eeb 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/quota_v2.c b/fs/quota_v2.c index 234ada90363..b53827dc02d 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -11,6 +11,7 @@ #include #include #include +#include #include diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index a10a6d2a887..2ec748ba0bd 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 506f724055c..227c9d70004 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -76,6 +76,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/quota.h b/include/linux/quota.h index 6f1d97ddf82..f9983ea0ff8 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -41,9 +41,6 @@ #define __DQUOT_VERSION__ "dquot_6.5.1" #define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 -typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ -typedef __u64 qsize_t; /* Type in which we store sizes */ - /* Size of blocks in which are counted size limits */ #define QUOTABLOCK_BITS 10 #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) @@ -172,6 +169,9 @@ enum { #include +typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ +typedef __u64 qsize_t; /* Type in which we store sizes */ + extern spinlock_t dq_data_lock; /* Maximal numbers of writes for quota operation (insert/delete/update) @@ -225,9 +225,6 @@ struct super_block; extern void mark_info_dirty(struct super_block *sb, int type); #define info_dirty(info) test_bit(DQF_INFO_DIRTY_B, &(info)->dqi_flags) -#define sb_dqopt(sb) (&(sb)->s_dquot) -#define sb_dqinfo(sb, type) (sb_dqopt(sb)->info+(type)) - struct dqstats { int lookups; int drops; @@ -335,19 +332,6 @@ struct quota_info { struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ }; -#define sb_has_quota_enabled(sb, type) ((type)==USRQUOTA ? \ - (sb_dqopt(sb)->flags & DQUOT_USR_ENABLED) : (sb_dqopt(sb)->flags & DQUOT_GRP_ENABLED)) - -#define sb_any_quota_enabled(sb) (sb_has_quota_enabled(sb, USRQUOTA) | \ - sb_has_quota_enabled(sb, GRPQUOTA)) - -#define sb_has_quota_suspended(sb, type) \ - ((type) == USRQUOTA ? 
(sb_dqopt(sb)->flags & DQUOT_USR_SUSPENDED) : \ - (sb_dqopt(sb)->flags & DQUOT_GRP_SUSPENDED)) - -#define sb_any_quota_suspended(sb) (sb_has_quota_suspended(sb, USRQUOTA) | \ - sb_has_quota_suspended(sb, GRPQUOTA)) - int register_quota_format(struct quota_format_type *fmt); void unregister_quota_format(struct quota_format_type *fmt); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 0c8f9fe462a..38218c1334b 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -14,6 +14,8 @@ #include +#define sb_dqopt(sb) (&(sb)->s_dquot) + #if defined(CONFIG_QUOTA) /* @@ -52,6 +54,25 @@ void vfs_dq_drop(struct inode *inode); int vfs_dq_transfer(struct inode *inode, struct iattr *iattr); int vfs_dq_quota_on_remount(struct super_block *sb); +#define sb_dqinfo(sb, type) (sb_dqopt(sb)->info+(type)) + +/* + * Functions for checking status of quota + */ + +#define sb_has_quota_enabled(sb, type) ((type)==USRQUOTA ? \ + (sb_dqopt(sb)->flags & DQUOT_USR_ENABLED) : (sb_dqopt(sb)->flags & DQUOT_GRP_ENABLED)) + +#define sb_any_quota_enabled(sb) (sb_has_quota_enabled(sb, USRQUOTA) | \ + sb_has_quota_enabled(sb, GRPQUOTA)) + +#define sb_has_quota_suspended(sb, type) \ + ((type) == USRQUOTA ? (sb_dqopt(sb)->flags & DQUOT_USR_SUSPENDED) : \ + (sb_dqopt(sb)->flags & DQUOT_GRP_SUSPENDED)) + +#define sb_any_quota_suspended(sb) (sb_has_quota_suspended(sb, USRQUOTA) | \ + sb_has_quota_suspended(sb, GRPQUOTA)) + /* * Operations supported for diskquotas. */ @@ -159,6 +180,11 @@ static inline int vfs_dq_off(struct super_block *sb, int remount) #else +#define sb_has_quota_enabled(sb, type) 0 +#define sb_any_quota_enabled(sb) 0 +#define sb_has_quota_suspended(sb, type) 0 +#define sb_any_quota_suspended(sb) 0 + /* * NO-OP when quota not configured. */ -- cgit v1.2.3-70-g09d2 From 657d3bfa98e542271b449f8cd84c7501ae2b2255 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Jul 2008 01:46:52 -0700 Subject: quota: implement sending information via netlink about user below quota Sometimes it may be useful for userspace to know (e.g. for some hosting guys) that some user stopped exceeding his hardlimit or softlimit in quotas. Implement sending of such events to userspace via quota netlink protocol so that they don't have to poll for such events. Based on idea and initial implementation by Vladislav Bogdanov. 
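When exactly such an event fires is easiest to see in isolation. Below is a condensed restatement of the block-quota case, mirroring the info_bdq_free() helper added in the diff that follows; it is a sketch for orientation, not a separate implementation:

	/* condensed from info_bdq_free(): does freeing 'freed' quota blocks
	 * take the current usage 'cur' back under a limit? */
	static int bdq_free_warntype(qsize_t cur, qsize_t freed,
				     qsize_t softlimit, qsize_t hardlimit)
	{
		if (cur <= softlimit)
			return QUOTA_NL_NOWARN;		/* was not above the softlimit */
		if (cur - freed <= softlimit)
			return QUOTA_NL_BSOFTBELOW;	/* dropped back under the softlimit */
		if (cur >= hardlimit && cur - freed < hardlimit)
			return QUOTA_NL_BHARDBELOW;	/* dropped back under the hardlimit */
		return QUOTA_NL_NOWARN;
	}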
Cc: Vladislav Bogdanov Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dquot.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------ include/linux/quota.h | 4 ++++ 2 files changed, 58 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dquot.c b/fs/dquot.c index 0bcaf970bbb..1346eebe74c 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -889,7 +889,10 @@ static void print_warning(struct dquot *dquot, const int warntype) char *msg = NULL; struct tty_struct *tty; - if (!need_print_warning(dquot)) + if (warntype == QUOTA_NL_IHARDBELOW || + warntype == QUOTA_NL_ISOFTBELOW || + warntype == QUOTA_NL_BHARDBELOW || + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) return; mutex_lock(&tty_mutex); @@ -1097,6 +1100,35 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war return QUOTA_OK; } +static int info_idq_free(struct dquot *dquot, ulong inodes) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_NOWARN; + + if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_ISOFTBELOW; + if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && + dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit) + return QUOTA_NL_IHARDBELOW; + return QUOTA_NL_NOWARN; +} + +static int info_bdq_free(struct dquot *dquot, qsize_t space) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_NOWARN; + + if (toqb(dquot->dq_dqb.dqb_curspace - space) <= + dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_BSOFTBELOW; + if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && + toqb(dquot->dq_dqb.dqb_curspace - space) < + dquot->dq_dqb.dqb_bhardlimit) + return QUOTA_NL_BHARDBELOW; + return QUOTA_NL_NOWARN; +} /* * Initialize quota pointers in inode * Transaction must be started at entry @@ -1284,6 +1316,7 @@ warn_put_all: int dquot_free_space(struct inode *inode, qsize_t number) { unsigned int cnt; + char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1292,6 +1325,7 @@ out_sub: inode_sub_bytes(inode, number); return QUOTA_OK; } + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { @@ -1302,6 +1336,7 @@ out_sub: for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] == NODQUOT) continue; + warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); dquot_decr_space(inode->i_dquot[cnt], number); } inode_sub_bytes(inode, number); @@ -1310,6 +1345,7 @@ out_sub: for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); + flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } @@ -1320,11 +1356,13 @@ out_sub: int dquot_free_inode(const struct inode *inode, unsigned long number) { unsigned int cnt; + char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return QUOTA_OK; + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { @@ -1335,6 +1373,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if 
(inode->i_dquot[cnt] == NODQUOT) continue; + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); dquot_decr_inodes(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); @@ -1342,6 +1381,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); + flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } @@ -1359,7 +1399,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) struct dquot *transfer_to[MAXQUOTAS]; int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; - char warntype[MAXQUOTAS]; + char warntype_to[MAXQUOTAS]; + char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1368,7 +1409,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) /* Clear the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_to[cnt] = transfer_from[cnt] = NODQUOT; - warntype[cnt] = QUOTA_NL_NOWARN; + warntype_to[cnt] = QUOTA_NL_NOWARN; } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ @@ -1400,8 +1441,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (transfer_to[cnt] == NODQUOT) continue; transfer_from[cnt] = inode->i_dquot[cnt]; - if (check_idq(transfer_to[cnt], 1, warntype+cnt) == NO_QUOTA || - check_bdq(transfer_to[cnt], space, 0, warntype+cnt) == NO_QUOTA) + if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == + NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, + warntype_to + cnt) == NO_QUOTA) goto warn_put_all; } @@ -1417,6 +1459,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { + warntype_from_inodes[cnt] = + info_idq_free(transfer_from[cnt], 1); + warntype_from_space[cnt] = + info_bdq_free(transfer_from[cnt], space); dquot_decr_inodes(transfer_from[cnt], 1); dquot_decr_space(transfer_from[cnt], space); } @@ -1436,7 +1482,9 @@ warn_put_all: if (transfer_to[cnt]) mark_dquot_dirty(transfer_to[cnt]); } - flush_warnings(transfer_to, warntype); + flush_warnings(transfer_to, warntype_to); + flush_warnings(transfer_from, warntype_from_inodes); + flush_warnings(transfer_from, warntype_from_space); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) diff --git a/include/linux/quota.h b/include/linux/quota.h index 4e004fef813..376a05048bc 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -135,6 +135,10 @@ struct if_dqinfo { #define QUOTA_NL_BHARDWARN 4 /* Block hardlimit reached */ #define QUOTA_NL_BSOFTLONGWARN 5 /* Block grace time expired */ #define QUOTA_NL_BSOFTWARN 6 /* Block softlimit reached */ +#define QUOTA_NL_IHARDBELOW 7 /* Usage got below inode hardlimit */ +#define QUOTA_NL_ISOFTBELOW 8 /* Usage got below inode softlimit */ +#define QUOTA_NL_BHARDBELOW 9 /* Usage got below block hardlimit */ +#define QUOTA_NL_BSOFTBELOW 10 /* Usage got below block softlimit */ enum { QUOTA_NL_C_UNSPEC, -- cgit v1.2.3-70-g09d2 From e4901f92a8dbe843e76651a50f7a2a6dd3d53474 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:31 -0700 Subject: coredump: zap_threads: comments && use while_each_thread() 
No changes in fs/exec.o The for_each_process() loop in zap_threads() is very subtle, it is not clear why we don't race with fork/exit/exec. Add the fat comment. Also, change the code to use while_each_thread(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e41aef0fb35..af249af4cca 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1517,7 +1517,7 @@ static void zap_process(struct task_struct *start) sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } - } while ((t = next_thread(t)) != start); + } while_each_thread(start, t); } static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, @@ -1539,7 +1539,36 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) goto done; - + /* + * We should find and kill all tasks which use this mm, and we should + * count them correctly into mm->core_waiters. We don't take tasklist + * lock, but this is safe wrt: + * + * fork: + * None of sub-threads can fork after zap_process(leader). All + * processes which were created before this point should be + * visible to zap_threads() because copy_process() adds the new + * process to the tail of init_task.tasks list, and lock/unlock + * of ->siglock provides a memory barrier. + * + * do_exit: + * The caller holds mm->mmap_sem. This means that the task which + * uses this mm can't pass exit_mm(), so it can't exit or clear + * its ->mm. + * + * de_thread: + * It does list_replace_rcu(&leader->tasks, ¤t->tasks), + * we must see either old or new leader, this does not matter. + * However, it can change p->sighand, so lock_task_sighand(p) + * must be used. Since p->mm != NULL and we hold ->mmap_sem + * it can't fail. + * + * Note also that "g" can be the old leader with ->mm == NULL + * and already unhashed and thus removed from ->thread_group. + * This is OK, __unhash_process()->list_del_rcu() does not + * clear the ->next pointer, we will find the new leader via + * next_thread(). + */ rcu_read_lock(); for_each_process(g) { if (g == tsk->group_leader) @@ -1549,17 +1578,13 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, do { if (p->mm) { if (p->mm == mm) { - /* - * p->sighand can't disappear, but - * may be changed by de_thread() - */ lock_task_sighand(p, &flags); zap_process(p); unlock_task_sighand(p, &flags); } break; } - } while ((p = next_thread(p)) != g); + } while_each_thread(g, p); } rcu_read_unlock(); done: -- cgit v1.2.3-70-g09d2 From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:37 -0700 Subject: introduce PF_KTHREAD flag Introduce the new PF_KTHREAD flag to mark the kernel threads. It is set by INIT_TASK() and copied to the forked childs (we could set it in kthreadd() along with PF_NOFREEZE instead). daemonize() was changed as well. In that case testing of PF_KTHREAD is racy, but daemonize() is hopeless anyway. This flag is cleared in do_execve(), before search_binary_handler(). Probably not the best place, we can do this in exec_mmap() or in start_thread(), or clear it along with PF_FORKNOEXEC. But I think this doesn't matter in practice, and if do_execve() fails kthread should die soon. 
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + include/linux/init_task.h | 2 +- include/linux/sched.h | 1 + kernel/exit.c | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index af249af4cca..cd2e8c9b124 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1326,6 +1326,7 @@ int do_execve(char * filename, if (retval < 0) goto out; + current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 93c45acf249..021d8e720c7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -122,7 +122,7 @@ extern struct group_info init_groups; .state = 0, \ .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ - .flags = 0, \ + .flags = PF_KTHREAD, \ .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 79e749dbf81..eec64a4adb9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1483,6 +1483,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_KTHREAD 0x00000020 /* I am a kernel thread */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff --git a/kernel/exit.c b/kernel/exit.c index a7799d8a640..28a44a2612d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -430,7 +430,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); -- cgit v1.2.3-70-g09d2 From 246bb0b1deb29726990620d8b5e55ca29f331362 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:38 -0700 Subject: kill PF_BORROWED_MM in favour of PF_KTHREAD Kill PF_BORROWED_MM. Change use_mm/unuse_mm to not play with ->flags, and do s/PF_BORROWED_MM/PF_KTHREAD/ for a couple of other users. No functional changes yet. But this allows us to do further fixes/cleanups. oom_kill/ptrace/etc often check "p->mm != NULL" to filter out the kthreads, this is wrong because of use_mm(). The problem with PF_BORROWED_MM is that we need task_lock() to avoid races. With this patch we can check PF_KTHREAD directly, or use a simple lockless helper: /* The result must not be dereferenced !!! */ struct mm_struct *__get_task_mm(struct task_struct *tsk) { if (tsk->flags & PF_KTHREAD) return NULL; return tsk->mm; } Note also ecard_task(). It runs with ->mm != NULL, but it's the kernel thread without PF_BORROWED_MM. 
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 2 -- include/linux/sched.h | 3 +-- kernel/fork.c | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 0fb3117ddd9..0051fd94b44 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); tsk->mm = mm; @@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags &= ~PF_BORROWED_MM; tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); diff --git a/include/linux/sched.h b/include/linux/sched.h index eec64a4adb9..0560999eb1d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1483,7 +1483,6 @@ static inline void put_task_struct(struct task_struct *t) #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ -#define PF_KTHREAD 0x00000020 /* I am a kernel thread */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ @@ -1497,7 +1496,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_KSWAPD 0x00040000 /* I am kswapd */ #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ -#define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ diff --git a/kernel/fork.c b/kernel/fork.c index 228f80c9155..eeaec6893b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(mmput); /** * get_task_mm - acquire a reference to the task's mm * - * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() @@ -487,7 +487,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) task_lock(task); mm = task->mm; if (mm) { - if (task->flags & PF_BORROWED_MM) + if (task->flags & PF_KTHREAD) mm = NULL; else atomic_inc(&mm->mm_users); -- cgit v1.2.3-70-g09d2 From 15b9f360c0316c06d37c09b02d85565edbaf9dd3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:39 -0700 Subject: coredump: zap_threads() must skip kernel threads The main loop in zap_threads() must skip kthreads which may use the same mm. Otherwise we "kill" this thread erroneously (for example, it can not fork or exec after that), and the coredumping task stucks in the TASK_UNINTERRUPTIBLE state forever because of the wrong ->core_waiters count. 
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index cd2e8c9b124..e347e6ed161 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1574,11 +1574,12 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, for_each_process(g) { if (g == tsk->group_leader) continue; - + if (g->flags & PF_KTHREAD) + continue; p = g; do { if (p->mm) { - if (p->mm == mm) { + if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); zap_process(p); unlock_task_sighand(p, &flags); -- cgit v1.2.3-70-g09d2 From 24d5288f06ed8b3a368eba967d587cdb012dfdf7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:40 -0700 Subject: coredump: elf_core_dump: skip kernel threads linux_binfmt->core_dump() runs before the process does exit_aio(), this means that we can hit the kernel thread which shares the same ->mm. Afaics, nothing really bad can happen, but perhaps it makes sense to fix this minor bug. It is sad we have to iterate over all threads in system and use GFP_ATOMIC. Hopefully we can kill theses ugly do_each_thread()s, but this needs some nontrivial changes in mm_struct and do_coredump. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 6 ++++++ fs/binfmt_elf_fdpic.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 639d2d8b571..bad7d8770d7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1520,6 +1520,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, rcu_read_lock(); do_each_thread(g, p) if (p->mm == dump_task->mm) { + if (p->flags & PF_KTHREAD) + continue; + t = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_ATOMIC); @@ -1724,6 +1727,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, rcu_read_lock(); do_each_thread(g, p) if (current->mm == p->mm && current != p) { + if (p->flags & PF_KTHREAD) + continue; + ets = kzalloc(sizeof(*ets), GFP_ATOMIC); if (!ets) { rcu_read_unlock(); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d051a32e627..71bcc4b4d08 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1626,6 +1626,9 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, rcu_read_lock(); do_each_thread(g,p) if (current->mm == p->mm && current != p) { + if (p->flags & PF_KTHREAD) + continue; + tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (!tmp) { rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 32ecb1f26dd50eeaac4e3f4dea4541c97848e459 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: turn mm->core_startup_done into the pointer to struct core_state mm->core_startup_done points to "struct completion startup_done" allocated on the coredump_wait()'s stack. Introduce the new structure, core_state, which holds this "struct completion". This way we can add more info visible to the threads participating in coredump without enlarging mm_struct. No changes in affected .o files. 
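For orientation, the arrangement after this patch, condensed from the hunks below (a sketch, not additional code):

	struct core_state {
		struct completion startup;	/* later patches in this series add more fields */
	};

	/* dumper side, in coredump_wait(): the structure lives on its stack */
	struct core_state core_state;
	init_completion(&core_state.startup);
	mm->core_state = &core_state;		/* was: mm->core_startup_done = &startup_done */
	...
	wait_for_completion(&core_state.startup);

	/* exiting-thread side, in exit_mm(): unchanged in spirit */
	if (!--mm->core_waiters)
		complete(&mm->core_state->startup);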
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 ++++---- include/linux/mm_types.h | 7 ++++++- kernel/exit.c | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e347e6ed161..71734568f01 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1597,13 +1597,13 @@ static int coredump_wait(int exit_code) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion startup_done; + struct core_state core_state; struct completion *vfork_done; int core_waiters; init_completion(&mm->core_done); - init_completion(&startup_done); - mm->core_startup_done = &startup_done; + init_completion(&core_state.startup); + mm->core_state = &core_state; core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); @@ -1622,7 +1622,7 @@ static int coredump_wait(int exit_code) } if (core_waiters) - wait_for_completion(&startup_done); + wait_for_completion(&core_state.startup); fail: BUG_ON(mm->core_waiters); return core_waiters; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 02a27ae7853..97819efd233 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,6 +159,10 @@ struct vm_area_struct { #endif }; +struct core_state { + struct completion startup; +}; + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -220,7 +224,8 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ /* coredumping support */ - struct completion *core_startup_done, core_done; + struct core_state *core_state; + struct completion core_done; /* aio bits */ rwlock_t ioctx_list_lock; /* aio lock */ diff --git a/kernel/exit.c b/kernel/exit.c index 28a44a2612d..f7fa21dbced 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -680,7 +680,7 @@ static void exit_mm(struct task_struct * tsk) up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); if (!--mm->core_waiters) - complete(mm->core_startup_done); + complete(&mm->core_state->startup); up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); -- cgit v1.2.3-70-g09d2 From 999d9fc1670bc082928b93b11d1f2e0e417d973c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: move mm->core_waiters into struct core_state Move mm->core_waiters into "struct core_state" allocated on stack. This shrinks mm_struct a little bit and allows further changes. This patch mostly does s/core_waiters/core_state. The only essential change is that coredump_wait() must clear mm->core_state before return. The coredump_wait()'s path is uglified and .text grows by 30 bytes, this is fixed by the next patch. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 21 +++++++++++---------- include/linux/mm_types.h | 2 +- kernel/exit.c | 8 ++++---- kernel/fork.c | 2 +- kernel/signal.c | 4 ++-- 5 files changed, 19 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 71734568f01..50de3aaff4d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -722,12 +722,10 @@ static int exec_mmap(struct mm_struct *mm) * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_sem around - * checking core_waiters and changing tsk->mm. The - * core-inducing thread will increment core_waiters for - * each thread whose ->mm == old_mm. 
+ * checking core_state and changing tsk->mm. */ down_read(&old_mm->mmap_sem); - if (unlikely(old_mm->core_waiters)) { + if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); return -EINTR; } @@ -1514,7 +1512,7 @@ static void zap_process(struct task_struct *start) t = start; do { if (t != current && t->mm) { - t->mm->core_waiters++; + t->mm->core_state->nr_threads++; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } @@ -1538,11 +1536,11 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (err) return err; - if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + if (atomic_read(&mm->mm_users) == mm->core_state->nr_threads + 1) goto done; /* * We should find and kill all tasks which use this mm, and we should - * count them correctly into mm->core_waiters. We don't take tasklist + * count them correctly into ->nr_threads. We don't take tasklist * lock, but this is safe wrt: * * fork: @@ -1590,7 +1588,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } rcu_read_unlock(); done: - return mm->core_waiters; + return mm->core_state->nr_threads; } static int coredump_wait(int exit_code) @@ -1603,9 +1601,12 @@ static int coredump_wait(int exit_code) init_completion(&mm->core_done); init_completion(&core_state.startup); + core_state.nr_threads = 0; mm->core_state = &core_state; core_waiters = zap_threads(tsk, mm, exit_code); + if (core_waiters < 0) + mm->core_state = NULL; up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1623,8 +1624,8 @@ static int coredump_wait(int exit_code) if (core_waiters) wait_for_completion(&core_state.startup); + mm->core_state = NULL; fail: - BUG_ON(mm->core_waiters); return core_waiters; } @@ -1702,7 +1703,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_waiters || !get_dumpable(mm)) { + if (mm->core_state || !get_dumpable(mm)) { up_write(&mm->mmap_sem); goto fail; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 97819efd233..c0b1747b61a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -160,6 +160,7 @@ struct vm_area_struct { }; struct core_state { + int nr_threads; struct completion startup; }; @@ -179,7 +180,6 @@ struct mm_struct { atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ int map_count; /* number of VMAs */ - int core_waiters; struct rw_semaphore mmap_sem; spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/exit.c b/kernel/exit.c index f7fa21dbced..988e232254e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -670,16 +670,16 @@ static void exit_mm(struct task_struct * tsk) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. 
*/ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + if (mm->core_state) { up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); - if (!--mm->core_waiters) + if (!--mm->core_state->nr_threads) complete(&mm->core_state->startup); up_write(&mm->mmap_sem); diff --git a/kernel/fork.c b/kernel/fork.c index eeaec6893b0..813d5c89b9d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,7 +400,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : MMF_DUMP_FILTER_DEFAULT; - mm->core_waiters = 0; + mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); diff --git a/kernel/signal.c b/kernel/signal.c index 39c1706edf0..5c7b7eaa0dc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1480,10 +1480,10 @@ static inline int may_ptrace_stop(void) * is a deadlock situation, and pointless because our tracer * is dead so don't allow us to stop. * If SIGKILL was already sent before the caller unlocked - * ->siglock we must see ->core_waiters != 0. Otherwise it + * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). */ - if (unlikely(current->mm->core_waiters) && + if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) return 0; -- cgit v1.2.3-70-g09d2 From 8cd9c249128a59e8e833d454a784b0cbd338d468 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:42 -0700 Subject: coredump: simplify core_state->nr_threads calculation Change zap_process() to return int instead of incrementing mm->core_state->nr_threads directly. Change zap_threads() to set mm->core_state only on success. This patch restores the original size of .text, and more importantly now ->nr_threads is used in two places only. 
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 50de3aaff4d..c74bb34eeef 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1502,9 +1502,10 @@ out: return ispipe; } -static void zap_process(struct task_struct *start) +static int zap_process(struct task_struct *start) { struct task_struct *t; + int nr = 0; start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_stop_count = 0; @@ -1512,31 +1513,33 @@ static void zap_process(struct task_struct *start) t = start; do { if (t != current && t->mm) { - t->mm->core_state->nr_threads++; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); + nr++; } } while_each_thread(start, t); + + return nr; } static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - int exit_code) + struct core_state *core_state, int exit_code) { struct task_struct *g, *p; unsigned long flags; - int err = -EAGAIN; + int nr = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { + mm->core_state = core_state; tsk->signal->group_exit_code = exit_code; - zap_process(tsk); - err = 0; + nr = zap_process(tsk); } spin_unlock_irq(&tsk->sighand->siglock); - if (err) - return err; + if (unlikely(nr < 0)) + return nr; - if (atomic_read(&mm->mm_users) == mm->core_state->nr_threads + 1) + if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* * We should find and kill all tasks which use this mm, and we should @@ -1579,7 +1582,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (p->mm) { if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - zap_process(p); + nr += zap_process(p); unlock_task_sighand(p, &flags); } break; @@ -1588,7 +1591,8 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } rcu_read_unlock(); done: - return mm->core_state->nr_threads; + core_state->nr_threads = nr; + return nr; } static int coredump_wait(int exit_code) @@ -1601,12 +1605,7 @@ static int coredump_wait(int exit_code) init_completion(&mm->core_done); init_completion(&core_state.startup); - core_state.nr_threads = 0; - mm->core_state = &core_state; - - core_waiters = zap_threads(tsk, mm, exit_code); - if (core_waiters < 0) - mm->core_state = NULL; + core_waiters = zap_threads(tsk, mm, &core_state, exit_code); up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) -- cgit v1.2.3-70-g09d2 From c5f1cc8c1828486a61ab3e575da6e2c62b34d399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:42 -0700 Subject: coredump: turn core_state->nr_threads into atomic_t Turn core_state->nr_threads into atomic_t and kill now unneeded down_write(&mm->mmap_sem) in exit_mm(). 
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm_types.h | 2 +- kernel/exit.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index c74bb34eeef..15d493fe8aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1591,7 +1591,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } rcu_read_unlock(); done: - core_state->nr_threads = nr; + atomic_set(&core_state->nr_threads, nr); return nr; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c0b1747b61a..ae99a28ba6a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -160,7 +160,7 @@ struct vm_area_struct { }; struct core_state { - int nr_threads; + atomic_t nr_threads; struct completion startup; }; diff --git a/kernel/exit.c b/kernel/exit.c index 988e232254e..63d82957baa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -678,10 +678,9 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); if (mm->core_state) { up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_state->nr_threads) + + if (atomic_dec_and_test(&mm->core_state->nr_threads)) complete(&mm->core_state->startup); - up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.2.3-70-g09d2 From 9d5b327bf198d2720666de958dcc2ae219d86952 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:43 -0700 Subject: coredump: make mm->core_state visible to ->core_dump() Move the "struct core_state core_state" from coredump_wait() to do_coredump(), this makes mm->core_state visible to binfmt->core_dump(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 15d493fe8aa..b8ee842d93c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1595,17 +1595,16 @@ done: return nr; } -static int coredump_wait(int exit_code) +static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct core_state core_state; struct completion *vfork_done; int core_waiters; init_completion(&mm->core_done); - init_completion(&core_state.startup); - core_waiters = zap_threads(tsk, mm, &core_state, exit_code); + init_completion(&core_state->startup); + core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1622,8 +1621,7 @@ static int coredump_wait(int exit_code) } if (core_waiters) - wait_for_completion(&core_state.startup); - mm->core_state = NULL; + wait_for_completion(&core_state->startup); fail: return core_waiters; } @@ -1679,6 +1677,7 @@ int get_dumpable(struct mm_struct *mm) int do_coredump(long signr, int exit_code, struct pt_regs * regs) { + struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; @@ -1717,7 +1716,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) current->fsuid = 0; /* Dump root private */ } - retval = coredump_wait(exit_code); + retval = coredump_wait(exit_code, &core_state); if (retval < 0) goto fail; @@ -1812,6 +1811,7 @@ fail_unlock: current->fsuid = fsuid; complete_all(&mm->core_done); + mm->core_state = NULL; fail: return retval; } -- cgit v1.2.3-70-g09d2 
From b564daf806d492dd4f7afe9b6c83b8d35d137669 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:44 -0700 Subject: coredump: construct the list of coredumping threads at startup time binfmt->core_dump() has to iterate over the all threads in system in order to find the coredumping threads and construct the list using the GFP_ATOMIC allocations. With this patch each thread allocates the list node on exit_mm()'s stack and adds itself to the list. This allows us to do further changes: - simplify ->core_dump() - change exit_mm() to clear ->mm first, then wait for ->core_done. this makes the coredumping process visible to oom_kill - kill mm->core_done Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ include/linux/mm_types.h | 6 ++++++ kernel/exit.c | 15 ++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index b8ee842d93c..fe2873b8037 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1604,6 +1604,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) init_completion(&mm->core_done); init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ae99a28ba6a..4d0d0abc79f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,8 +159,14 @@ struct vm_area_struct { #endif }; +struct core_thread { + struct task_struct *task; + struct core_thread *next; +}; + struct core_state { atomic_t nr_threads; + struct core_thread dumper; struct completion startup; }; diff --git a/kernel/exit.c b/kernel/exit.c index 63d82957baa..b66f0d55c79 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -664,6 +664,7 @@ assign_new_owner: static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) @@ -676,11 +677,19 @@ static void exit_mm(struct task_struct * tsk) * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_state) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - if (atomic_dec_and_test(&mm->core_state->nr_threads)) - complete(&mm->core_state->startup); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.2.3-70-g09d2 From 83914441f94c6f2cd468ca97365f6c34f418706e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:45 -0700 Subject: coredump: elf_core_dump: use core_state->dumper list Kill the nasty rcu_read_lock() + do_each_thread() loop, use the list encoded in mm->core_state instead, s/GFP_ATOMIC/GFP_KERNEL/. This patch allows futher cleanups in binfmt_elf.c, in particular we can kill the parallel info->threads list. 
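The structural change is easiest to see as the two loop shapes; add_thread_note() below is a made-up stand-in for the kzalloc-and-link code in the actual hunk:

	/* old: scan every task in the system under RCU, allocate with GFP_ATOMIC */
	rcu_read_lock();
	do_each_thread(g, p)
		if (p->mm == dump_task->mm)
			add_thread_note(p);
	while_each_thread(g, p);
	rcu_read_unlock();

	/* new: walk the list built at coredump startup, allocate with GFP_KERNEL */
	struct core_thread *ct;

	for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next)
		add_thread_note(ct->task);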
Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 77 ++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bad7d8770d7..88d180306cf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1478,7 +1478,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, const struct user_regset_view *view = task_user_regset_view(dump_task); struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; - struct task_struct *g, *p; + struct core_thread *ct; unsigned int i; info->size = 0; @@ -1517,34 +1517,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* * Allocate a structure for each thread. */ - rcu_read_lock(); - do_each_thread(g, p) - if (p->mm == dump_task->mm) { - if (p->flags & PF_KTHREAD) - continue; - - t = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), - GFP_ATOMIC); - if (unlikely(!t)) { - rcu_read_unlock(); - return 0; - } - t->task = p; - if (p == dump_task || !info->thread) { - t->next = info->thread; - info->thread = t; - } else { - /* - * Make sure to keep the original task at - * the head of the list. - */ - t->next = info->thread->next; - info->thread->next = t; - } + for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) { + t = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!t)) + return 0; + + t->task = ct->task; + if (ct->task == dump_task || !info->thread) { + t->next = info->thread; + info->thread = t; + } else { + /* + * Make sure to keep the original task at + * the head of the list. + */ + t->next = info->thread->next; + info->thread->next = t; } - while_each_thread(g, p); - rcu_read_unlock(); + } /* * Now fill in each thread's information. @@ -1691,7 +1683,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, { #define NUM_NOTES 6 struct list_head *t; - struct task_struct *g, *p; info->notes = NULL; info->prstatus = NULL; @@ -1723,23 +1714,19 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread_status_size = 0; if (signr) { + struct core_thread *ct; struct elf_thread_status *ets; - rcu_read_lock(); - do_each_thread(g, p) - if (current->mm == p->mm && current != p) { - if (p->flags & PF_KTHREAD) - continue; - - ets = kzalloc(sizeof(*ets), GFP_ATOMIC); - if (!ets) { - rcu_read_unlock(); - return 0; - } - ets->thread = p; - list_add(&ets->list, &info->thread_list); - } - while_each_thread(g, p); - rcu_read_unlock(); + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + ets = kzalloc(sizeof(*ets), GFP_KERNEL); + if (!ets) + return 0; + + ets->thread = ct->task; + list_add(&ets->list, &info->thread_list); + } + list_for_each(t, &info->thread_list) { int sz; -- cgit v1.2.3-70-g09d2 From 182c515fd2a942623aed4e4e0e0b37fe96571b05 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:45 -0700 Subject: coredump: elf_fdpic_core_dump: use core_state->dumper list Kill the nasty rcu_read_lock() + do_each_thread() loop, use the list encoded in mm->core_state instead, s/GFP_ATOMIC/GFP_KERNEL/. This patch allows futher cleanups in binfmt_elf_fdpic.c. 
Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 71bcc4b4d08..1b59b1edf26 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1573,7 +1573,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ - struct task_struct *g, *p; LIST_HEAD(thread_list); struct list_head *t; elf_fpregset_t *fpu = NULL; @@ -1622,23 +1621,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, #endif if (signr) { + struct core_thread *ct; struct elf_thread_status *tmp; - rcu_read_lock(); - do_each_thread(g,p) - if (current->mm == p->mm && current != p) { - if (p->flags & PF_KTHREAD) - continue; - - tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); - if (!tmp) { - rcu_read_unlock(); - goto cleanup; - } - tmp->thread = p; - list_add(&tmp->list, &thread_list); - } - while_each_thread(g,p); - rcu_read_unlock(); + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + goto cleanup; + + tmp->thread = ct->task; + list_add(&tmp->list, &thread_list); + } + list_for_each(t, &thread_list) { struct elf_thread_status *tmp; int sz; -- cgit v1.2.3-70-g09d2 From a94e2d408eaedbd85aae259621d46fafc10479a2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:46 -0700 Subject: coredump: kill mm->core_done Now that we have core_state->dumper list we can use it to wake up the sub-threads waiting for the coredump completion. This uglifies the code and .text grows by 47 bytes, but otoh mm_struct lessens by sizeof(struct completion). Also, with this change we can decouple exit_mm() from the coredumping code. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 25 ++++++++++++++++++++++--- include/linux/mm_types.h | 4 +--- kernel/exit.c | 8 +++++++- 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index fe2873b8037..bff43aeb235 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1602,7 +1602,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) struct completion *vfork_done; int core_waiters; - init_completion(&mm->core_done); init_completion(&core_state->startup); core_state->dumper.task = tsk; core_state->dumper.next = NULL; @@ -1628,6 +1627,27 @@ fail: return core_waiters; } +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + /* * set_dumpable converts traditional three-value dumpable to two flags and * stores them into mm->flags. 
It modifies lower two bits of mm->flags, but @@ -1812,8 +1832,7 @@ fail_unlock: argv_free(helper_argv); current->fsuid = fsuid; - complete_all(&mm->core_done); - mm->core_state = NULL; + coredump_finish(mm); fail: return retval; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4d0d0abc79f..746f975b58e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -229,9 +229,7 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ - /* coredumping support */ - struct core_state *core_state; - struct completion core_done; + struct core_state *core_state; /* coredumping support */ /* aio bits */ rwlock_t ioctx_list_lock; /* aio lock */ diff --git a/kernel/exit.c b/kernel/exit.c index b66f0d55c79..8a4d4d12e29 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -691,7 +691,13 @@ static void exit_mm(struct task_struct * tsk) if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); - wait_for_completion(&mm->core_done); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); -- cgit v1.2.3-70-g09d2 From 565b9b14e7f48131bca58840aa404bbef058fa89 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:47 -0700 Subject: coredump: format_corename: fix the "core_uses_pid" logic I don't understand why the multi-thread coredump implies the core_uses_pid behaviour, but we shouldn't use mm->mm_users for that. This counter can be incremented by get_task_mm(). Use the valued returned by coredump_wait() instead. Also, remove the "const char *pattern" argument, format_corename() can use core_pattern directly. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Alan Cox Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index bff43aeb235..5e559013e30 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1379,17 +1379,14 @@ EXPORT_SYMBOL(set_binfmt); * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, const char *pattern, long signr) +static int format_corename(char *corename, int nr_threads, long signr) { - const char *pat_ptr = pattern; + const char *pat_ptr = core_pattern; + int ispipe = (*pat_ptr == '|'); char *out_ptr = corename; char *const out_end = corename + CORENAME_MAX_SIZE; int rc; int pid_in_pattern = 0; - int ispipe = 0; - - if (*pattern == '|') - ispipe = 1; /* Repeat as long as we have more pattern to process and more output space */ @@ -1490,7 +1487,7 @@ static int format_corename(char *corename, const char *pattern, long signr) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. 
*/ if (!ispipe && !pid_in_pattern - && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { + && (core_uses_pid || nr_threads)) { rc = snprintf(out_ptr, out_end - out_ptr, ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) @@ -1753,7 +1750,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * uses lock_kernel() */ lock_kernel(); - ispipe = format_corename(corename, core_pattern, signr); + ispipe = format_corename(corename, retval, signr); unlock_kernel(); /* * Don't bother to check the RLIMIT_CORE value if core_pattern points -- cgit v1.2.3-70-g09d2 From 79885b227740b9c7d3057f2de556f4098d37cc8f Mon Sep 17 00:00:00 2001 From: "Edgar E. Iglesias" Date: Fri, 25 Jul 2008 01:48:10 -0700 Subject: elf: use ELF_CORE_EFLAGS for kcore ELF header flags ELF_CORE_EFLAGS is already used by the binfmt_elf coredumper to set correct arch specific ELF header flags on coredumps. Use it for kcore dumps as well. At the moment, this affects the CRIS and the H8300 arch. Signed-off-by: Edgar E. Iglesias Cc: Mikael Starvik Cc: Yoshinori Sato Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 10 +++++----- include/asm-h8300/elf.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e78c81fcf54..c2370c76fb7 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,6 +23,10 @@ #define CORE_STR "CORE" +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + static int open_kcore(struct inode * inode, struct file * filp) { return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; @@ -164,11 +168,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) elf->e_entry = 0; elf->e_phoff = sizeof(struct elfhdr); elf->e_shoff = 0; -#if defined(CONFIG_H8300) - elf->e_flags = ELF_FLAGS; -#else - elf->e_flags = 0; -#endif + elf->e_flags = ELF_CORE_EFLAGS; elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize= sizeof(struct elf_phdr); elf->e_phnum = nphdr; diff --git a/include/asm-h8300/elf.h b/include/asm-h8300/elf.h index 26bfc7e641d..a8b57d1f412 100644 --- a/include/asm-h8300/elf.h +++ b/include/asm-h8300/elf.h @@ -26,10 +26,10 @@ typedef unsigned long elf_fpregset_t; #define ELF_DATA ELFDATA2MSB #define ELF_ARCH EM_H8_300 #if defined(__H8300H__) -#define ELF_FLAGS 0x810000 +#define ELF_CORE_EFLAGS 0x810000 #endif #if defined(__H8300S__) -#define ELF_FLAGS 0x820000 +#define ELF_CORE_EFLAGS 0x820000 #endif #define ELF_PLAT_INIT(_r) _r->er1 = 0 -- cgit v1.2.3-70-g09d2 From d805dda412346225a50af2d399d958a4bc676c38 Mon Sep 17 00:00:00 2001 From: Abdel Benamrouche Date: Fri, 25 Jul 2008 01:48:25 -0700 Subject: fs/partition/check.c: fix return value warning fs/partitions/check.c:381: warning: ignoring return value of ___device_add___, declared with attribute warn_unused_result [akpm@linux-foundation.org: multiple-return-statements-per-function are evil] Signed-off-by: Abdel Benamrouche Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/check.c | 28 ++++++++++++++++++++++------ include/linux/genhd.h | 2 +- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index efef715135d..2e6413fbd2d 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -344,18 +344,18 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); -void add_partition(struct gendisk *disk, int part, 
sector_t start, sector_t len, int flags) +int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) { struct hd_struct *p; int err; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return; + return -ENOMEM; if (!init_part_stats(p)) { - kfree(p); - return; + err = -ENOMEM; + goto out0; } p->start_sect = start; p->nr_sects = len; @@ -378,15 +378,31 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, /* delay uevent until 'holders' subdir is created */ p->dev.uevent_suppress = 1; - device_add(&p->dev); + err = device_add(&p->dev); + if (err) + goto out1; partition_sysfs_add_subdir(p); p->dev.uevent_suppress = 0; - if (flags & ADDPART_FLAG_WHOLEDISK) + if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(&p->dev, &dev_attr_whole_disk); + if (err) + goto out2; + } /* suppress uevent if the disk supresses it */ if (!disk->dev.uevent_suppress) kobject_uevent(&p->dev.kobj, KOBJ_ADD); + + return 0; + +out2: + device_del(&p->dev); +out1: + put_device(&p->dev); + free_part_stats(p); +out0: + kfree(p); + return err; } /* Not exported, helper to add_disk(). */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e8787417f65..118216f1bd3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -541,7 +541,7 @@ extern dev_t blk_lookup_devt(const char *name, int part); extern char *disk_name (struct gendisk *hd, int part, char *buf); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); -extern void add_partition(struct gendisk *, int, sector_t, sector_t, int); +extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); -- cgit v1.2.3-70-g09d2 From 04ebd4aee52b06a2c38127d9208546e5b96f3a19 Mon Sep 17 00:00:00 2001 From: Abdel Benamrouche Date: Fri, 25 Jul 2008 01:48:26 -0700 Subject: block/ioctl.c and fs/partition/check.c: check value returned by add_partition() Now that add_partition() has been aught to propagate errors, let's check them. 
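Condensed from the two hunks that follow, the callers now handle the result in two different ways: the ioctl path propagates the error to userspace, while rescan_partitions() logs it and keeps scanning:

	/* block/ioctl.c, BLKPG_ADD_PARTITION */
	err = add_partition(disk, part, start, length, ADDPART_FLAG_NONE);
	mutex_unlock(&bdev->bd_mutex);
	return err;

	/* fs/partitions/check.c, rescan_partitions() */
	res = add_partition(disk, p, from, size, state->parts[p].flags);
	if (res) {
		printk(KERN_ERR " %s: p%d could not be added: %d\n",
		       disk->disk_name, p, -res);
		continue;
	}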
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Abdel Benamrouche Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/ioctl.c | 5 +++-- fs/partitions/check.c | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/block/ioctl.c b/block/ioctl.c index 52d6385216a..77185e5c026 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -17,6 +17,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user long long start, length; int part; int i; + int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -61,9 +62,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } /* all seems OK */ - add_partition(disk, part, start, length, ADDPART_FLAG_NONE); + err = add_partition(disk, part, start, length, ADDPART_FLAG_NONE); mutex_unlock(&bdev->bd_mutex); - return 0; + return err; case BLKPG_DEL_PARTITION: if (!disk->part[part-1]) return -ENXIO; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 2e6413fbd2d..7d6b34e201d 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -499,10 +499,16 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) if (!size) continue; if (from + size > get_capacity(disk)) { - printk(" %s: p%d exceeds device capacity\n", + printk(KERN_ERR " %s: p%d exceeds device capacity\n", disk->disk_name, p); + continue; + } + res = add_partition(disk, p, from, size, state->parts[p].flags); + if (res) { + printk(KERN_ERR " %s: p%d could not be added: %d\n", + disk->disk_name, p, -res); + continue; } - add_partition(disk, p, from, size, state->parts[p].flags); #ifdef CONFIG_BLK_DEV_MD if (state->parts[p].flags & ADDPART_FLAG_RAID) md_autodetect_dev(bdev->bd_dev+p); -- cgit v1.2.3-70-g09d2 From d991696263a704be7f41ac186f1a0ed17963c260 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Jul 2008 01:48:26 -0700 Subject: fs/partitions/efi: convert to pr_debug convert the local Dprintk() compile time debug printk wrappers to the generic pr_debug() wrapper. Signed-off-by: Thomas Gleixner Cc: Matt Domsch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/efi.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index e7b07006bc4..038a6022152 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -95,13 +95,6 @@ #include "check.h" #include "efi.h" -#undef EFI_DEBUG -#ifdef EFI_DEBUG -#define Dprintk(x...) printk(KERN_DEBUG x) -#else -#define Dprintk(x...) -#endif - /* This allows a kernel command line option 'gpt' to override * the test for invalid PMBR. Not __initdata because reloading * the partition tables happens after init too. 
@@ -305,10 +298,10 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check the GUID Partition Table signature */ if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { - Dprintk("GUID Partition Table Header signature is wrong:" - "%lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->signature), - (unsigned long long)GPT_HEADER_SIGNATURE); + pr_debug("GUID Partition Table Header signature is wrong:" + "%lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->signature), + (unsigned long long)GPT_HEADER_SIGNATURE); goto fail; } @@ -318,9 +311,8 @@ is_gpt_valid(struct block_device *bdev, u64 lba, crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); if (crc != origcrc) { - Dprintk - ("GUID Partition Table Header CRC is wrong: %x != %x\n", - crc, origcrc); + pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", + crc, origcrc); goto fail; } (*gpt)->header_crc32 = cpu_to_le32(origcrc); @@ -328,9 +320,9 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check that the my_lba entry points to the LBA that contains * the GUID Partition Table */ if (le64_to_cpu((*gpt)->my_lba) != lba) { - Dprintk("GPT my_lba incorrect: %lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->my_lba), - (unsigned long long)lba); + pr_debug("GPT my_lba incorrect: %lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->my_lba), + (unsigned long long)lba); goto fail; } @@ -339,15 +331,15 @@ is_gpt_valid(struct block_device *bdev, u64 lba, */ lastlba = last_lba(bdev); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { - Dprintk("GPT: first_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), - (unsigned long long)lastlba); + pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), + (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { - Dprintk("GPT: last_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), - (unsigned long long)lastlba); + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)lastlba); goto fail; } @@ -360,7 +352,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, le32_to_cpu((*gpt)->sizeof_partition_entry)); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { - Dprintk("GUID Partitition Entry Array CRC check failed.\n"); + pr_debug("GUID Partitition Entry Array CRC check failed.\n"); goto fail_ptes; } @@ -616,7 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } - Dprintk("GUID Partition Table is valid! Yea!\n"); + pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { if (!is_pte_valid(&ptes[i], last_lba(bdev))) -- cgit v1.2.3-70-g09d2 From 6e644c3126149b65460610fe5a00d8a162092abe Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 25 Jul 2008 01:48:28 -0700 Subject: move proc_kmsg_operations to fs/proc/internal.h This patch moves the extern of struct proc_kmsg_operations to fs/proc/internal.h and adds an #include "internal.h" to fs/proc/kmsg.c so that the latter sees the former. 
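The practical benefit of a move like this is that the translation unit defining the symbol now sees its own declaration, so the compiler can flag any mismatch between the two. A tiny self-contained illustration with invented names (struct ops, kmsg_ops), not the actual proc declarations:

/* What would live in a header such as internal.h: */
struct ops {
	int (*open)(void);
};
extern const struct ops kmsg_ops;

/* What would live in the .c file defining the symbol.  Because that file
 * now "includes" the declaration above, a change in type or qualifiers on
 * either side becomes a compile error instead of a silent divergence. */
static int kmsg_open(void)
{
	return 0;
}

const struct ops kmsg_ops = {
	.open = kmsg_open,
};

int main(void)
{
	return kmsg_ops.open();
}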
Signed-off-by: Adrian Bunk Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 1 + fs/proc/kmsg.c | 2 ++ include/linux/proc_fs.h | 1 - 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 28cbca80590..8d67616e7bb 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -63,6 +63,7 @@ extern const struct file_operations proc_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; +extern const struct file_operations proc_kmsg_operations; extern const struct inode_operations proc_net_inode_operations; void free_proc_entry(struct proc_dir_entry *de); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ff3b90b56e9..9fd5df3f40c 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -15,6 +15,8 @@ #include #include +#include "internal.h" + extern wait_queue_head_t log_wait; extern int do_syslog(int type, char __user *bug, int count); diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 15a9eaf4a80..cdabc2fc02f 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -138,7 +138,6 @@ extern int proc_readdir(struct file *, void *, filldir_t); extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); extern const struct file_operations proc_kcore_operations; -extern const struct file_operations proc_kmsg_operations; extern const struct file_operations ppc_htab_operations; extern int pid_ns_prepare_proc(struct pid_namespace *ns); -- cgit v1.2.3-70-g09d2 From 881adb85358309ea9c6f707394002719982ec607 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 01:48:29 -0700 Subject: proc: always do ->release Current two-stage scheme of removing PDE emphasizes one bug in proc: in the sequence open, rmmod, remove_proc_entry, close, ->release won't be called because ->proc_fops were cleared. In simple cases it's a small memory leak. For every ->open, ->release has to be done. List of openers is introduced which is traversed at remove_proc_entry() if needed. Discussions with Al long ago (sigh). 
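Stripped of the locking and proc-specific details, the bookkeeping introduced below amounts to: remember every successful ->open together with its ->release, drop the record again on a normal close, and drain whatever is left when the entry is removed, calling the saved ->release while it is still safe to do so. A simplified userspace sketch of that idea (illustrative only; the real list is per proc_dir_entry and protected by pde_unload_lock):

#include <stdio.h>
#include <stdlib.h>

struct opener {
	void *file;			/* which open this record belongs to */
	void (*release)(void *file);	/* the saved ->release               */
	struct opener *next;
};

static struct opener *openers;		/* per-entry list in the real code */

static int record_open(void *file, void (*release)(void *))
{
	struct opener *o = malloc(sizeof(*o));

	if (!o)
		return -1;
	o->file = file;
	o->release = release;
	o->next = openers;
	openers = o;
	return 0;
}

/* Normal close: the VFS calls ->release itself, so just forget the record. */
static void forget_opener(void *file)
{
	struct opener **p = &openers;

	while (*p && (*p)->file != file)
		p = &(*p)->next;
	if (*p) {
		struct opener *o = *p;

		*p = o->next;
		free(o);
	}
}

/* Entry removal: call ->release for files that are still open. */
static void drain_openers(void)
{
	while (openers) {
		struct opener *o = openers;

		openers = o->next;
		o->release(o->file);
		free(o);
	}
}

static void dummy_release(void *file)
{
	printf("release(%p)\n", file);
}

int main(void)
{
	int f1, f2;

	record_open(&f1, dummy_release);
	record_open(&f2, dummy_release);
	forget_opener(&f1);	/* f1 closed normally */
	drain_openers();	/* entry removed while f2 is still open */
	return 0;
}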
Signed-off-by: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 14 ++++++++++ fs/proc/inode.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++--- fs/proc/internal.h | 7 +++++ include/linux/proc_fs.h | 1 + 4 files changed, 92 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 43e54e86cef..bc0a0dd2d84 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -597,6 +597,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->pde_users = 0; spin_lock_init(&ent->pde_unload_lock); ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); out: return ent; } @@ -789,6 +790,19 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) spin_unlock(&de->pde_unload_lock); continue_removing: + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + list_del(&pdeo->lh); + spin_unlock(&de->pde_unload_lock); + pdeo->release(pdeo->inode, pdeo->file); + kfree(pdeo); + spin_lock(&de->pde_unload_lock); + } + spin_unlock(&de->pde_unload_lock); + if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b08d1001791..354c0848582 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -126,12 +126,17 @@ static const struct super_operations proc_sops = { .remount_fs = proc_remount, }; -static void pde_users_dec(struct proc_dir_entry *pde) +static void __pde_users_dec(struct proc_dir_entry *pde) { - spin_lock(&pde->pde_unload_lock); pde->pde_users--; if (pde->pde_unload_completion && pde->pde_users == 0) complete(pde->pde_unload_completion); +} + +static void pde_users_dec(struct proc_dir_entry *pde) +{ + spin_lock(&pde->pde_unload_lock); + __pde_users_dec(pde); spin_unlock(&pde->pde_unload_lock); } @@ -318,36 +323,97 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct proc_dir_entry *pde = PDE(inode); int rv = 0; int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + /* + * What for, you ask? Well, we can have open, rmmod, remove_proc_entry + * sequence. ->release won't be called because ->proc_fops will be + * cleared. Depending on complexity of ->release, consequences vary. + * + * We can't wait for mercy when close will be done for real, it's + * deadlockable: rmmod foo release + * by hand in remove_proc_entry(). For this, save opener's credentials + * for later. + */ + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) + return -ENOMEM; spin_lock(&pde->pde_unload_lock); if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); + kfree(pdeo); return rv; } pde->pde_users++; open = pde->proc_fops->open; + release = pde->proc_fops->release; spin_unlock(&pde->pde_unload_lock); if (open) rv = open(inode, file); - pde_users_dec(pde); + spin_lock(&pde->pde_unload_lock); + if (rv == 0 && release) { + /* To know what to release. */ + pdeo->inode = inode; + pdeo->file = file; + /* Strictly for "too late" ->release in proc_reg_release(). 
*/ + pdeo->release = release; + list_add(&pdeo->lh, &pde->pde_openers); + } else + kfree(pdeo); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); return rv; } +static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, + struct inode *inode, struct file *file) +{ + struct pde_opener *pdeo; + + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->inode == inode && pdeo->file == file) + return pdeo; + } + return NULL; +} + static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; spin_lock(&pde->pde_unload_lock); + pdeo = find_pde_opener(pde, inode, file); if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + /* + * Can't simply exit, __fput() will think that everything is OK, + * and move on to freeing struct file. remove_proc_entry() will + * find slacker in opener's list and will try to do non-trivial + * things with struct file. Therefore, remove opener from list. + * + * But if opener is removed from list, who will ->release it? + */ + if (pdeo) { + list_del(&pdeo->lh); + spin_unlock(&pde->pde_unload_lock); + rv = pdeo->release(inode, file); + kfree(pdeo); + } else + spin_unlock(&pde->pde_unload_lock); return rv; } pde->pde_users++; release = pde->proc_fops->release; + if (pdeo) { + list_del(&pdeo->lh); + kfree(pdeo); + } spin_unlock(&pde->pde_unload_lock); if (release) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 8d67616e7bb..442202314d5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -89,3 +89,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, struct dentry *dentry); int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, filldir_t filldir); + +struct pde_opener { + struct inode *inode; + struct file *file; + int (*release)(struct inode *, struct file *); + struct list_head lh; +}; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index cdabc2fc02f..f560d1705af 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -79,6 +79,7 @@ struct proc_dir_entry { int pde_users; /* number of callers into module in progress */ spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ struct completion *pde_unload_completion; + struct list_head pde_openers; /* who did ->open, but not ->release */ }; struct kcore_list { -- cgit v1.2.3-70-g09d2 From a9bd4a3e070ba7494f154e1a11687a8a957d88dc Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 01:48:30 -0700 Subject: proc: remove pathetic remount code MS_RMT_MASK will unmask changes in do_remount_sb() anyway. 
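For context on why the hook was dead weight: when a remount reaches the VFS, do_remount_sb() only lets the bits in MS_RMT_MASK through to sb->s_flags, so a flag OR-ed into *flags by a filesystem's ->remount_fs has no lasting effect unless it is in that mask. The sketch below paraphrases that masking step from memory of the era's fs/super.c; values and mask membership are illustrative, not a quote.

#include <stdio.h>

#define MS_RDONLY	1UL
#define MS_NODIRATIME	2048UL
/* Only a handful of flags may actually change on remount. */
#define MS_RMT_MASK	(MS_RDONLY /* | a few others */)

static unsigned long apply_remount(unsigned long s_flags, unsigned long flags)
{
	/* Bits outside MS_RMT_MASK keep their old value in s_flags. */
	return (s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
}

int main(void)
{
	/* What the removed proc_remount() tried to force: */
	unsigned long requested = MS_NODIRATIME;

	/* Prints 0: the forced bit never reaches the superblock flags. */
	printf("%lx\n", apply_remount(0UL, requested));
	return 0;
}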
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 354c0848582..02eca2ed9dd 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -111,19 +111,12 @@ int __init proc_init_inodecache(void) return 0; } -static int proc_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_NODIRATIME; - return 0; -} - static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .delete_inode = proc_delete_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, }; static void __pde_users_dec(struct proc_dir_entry *pde) -- cgit v1.2.3-70-g09d2 From 6eedf8d30d2b48e86fbcee1a32fb2fa5f42219ee Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 01:48:30 -0700 Subject: proc: move Kconfig to fs/proc/Kconfig Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 60 +-------------------------------------------------------- fs/proc/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 59 deletions(-) create mode 100644 fs/proc/Kconfig (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index ed563b9e352..97e3bdedb1e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -902,65 +902,7 @@ endif # BLOCK menu "Pseudo filesystems" -config PROC_FS - bool "/proc file system support" if EMBEDDED - default y - help - This is a virtual file system providing information about the status - of the system. "Virtual" means that it doesn't take up any space on - your hard disk: the files are created on the fly by the kernel when - you try to access them. Also, you cannot read the files with older - version of the program less: you need to use more or cat. - - It's totally cool; for example, "cat /proc/interrupts" gives - information about what the different IRQs are used for at the moment - (there is a small number of Interrupt ReQuest lines in your computer - that are used by the attached devices to gain the CPU's attention -- - often a source of trouble if two devices are mistakenly configured - to use the same IRQ). The program procinfo to display some - information about your system gathered from the /proc file system. - - Before you can use the /proc file system, it has to be mounted, - meaning it has to be given a location in the directory hierarchy. - That location should be /proc. A command such as "mount -t proc proc - /proc" or the equivalent line in /etc/fstab does the job. - - The /proc file system is explained in the file - and on the proc(5) manpage - ("man 5 proc"). - - This option will enlarge your kernel by about 67 KB. Several - programs depend on this, so everyone should say Y here. - -config PROC_KCORE - bool "/proc/kcore support" if !ARM - depends on PROC_FS && MMU - -config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && CRASH_DUMP - default y - help - Exports the dump image of crashed kernel in ELF format. - -config PROC_SYSCTL - bool "Sysctl support (/proc/sys)" if EMBEDDED - depends on PROC_FS - select SYSCTL - default y - ---help--- - The sysctl interface provides a means of dynamically changing - certain kernel parameters and variables on the fly without requiring - a recompile of the kernel or reboot of the system. The primary - interface is through /proc/sys. 
If you say Y here a tree of - modifiable sysctl entries will be generated beneath the - /proc/sys directory. They are explained in the files - in . Note that enabling this - option will enlarge the kernel by at least 8 KB. - - As it is generally a good thing, you should say Y here unless - building a kernel for install/rescue disks or your system is very - limited in memory. +source "fs/proc/Kconfig" config SYSFS bool "sysfs file system support" if EMBEDDED diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 00000000000..73cd7a418f0 --- /dev/null +++ b/fs/proc/Kconfig @@ -0,0 +1,59 @@ +config PROC_FS + bool "/proc file system support" if EMBEDDED + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + +config PROC_VMCORE + bool "/proc/vmcore support (EXPERIMENTAL)" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EMBEDDED + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in . Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. -- cgit v1.2.3-70-g09d2 From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 25 Jul 2008 01:48:49 -0700 Subject: task IO accounting: provide distinct tgid/tid I/O statistics Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate parent I/O statistics in /proc/pid/io. This approach follows the same model used to account per-process and per-thread CPU times. 
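The aggregation model, stripped to its essentials: each thread keeps its own counters, the counters of threads that have already exited are folded into a per-process accumulator, and the tgid-level file reports the sum over all live threads plus that accumulator. A compact, self-contained illustration with invented names (struct thread, struct process); the real code in do_io_accounting() in the diff below additionally deals with RCU and the siglock:

#include <stdio.h>

struct io_counters {
	unsigned long long rchar, wchar;
};

struct thread {
	struct io_counters io;
	struct thread *next_in_group;	/* circular list of live threads */
};

struct process {
	struct thread *leader;
	struct io_counters dead;	/* folded in as threads exit */
};

/* /proc/<pid>/task/<tid>/io: just this thread. */
static struct io_counters tid_io(const struct thread *t)
{
	return t->io;
}

/* /proc/<pid>/io: every live thread plus the already-exited ones. */
static struct io_counters tgid_io(const struct process *p)
{
	struct io_counters sum = p->dead;
	const struct thread *t = p->leader;

	do {
		sum.rchar += t->io.rchar;
		sum.wchar += t->io.wchar;
		t = t->next_in_group;
	} while (t != p->leader);

	return sum;
}

int main(void)
{
	struct thread t1 = { { 100, 10 }, NULL };
	struct thread t2 = { { 200, 20 }, &t1 };
	struct process p = { &t1, { 50, 5 } };
	struct io_counters total;

	t1.next_in_group = &t2;
	total = tgid_io(&p);
	printf("tid rchar=%llu, tgid rchar=%llu wchar=%llu\n",
	       tid_io(&t1).rchar, total.rchar, total.wchar);
	return 0;
}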
As a practial application, this allows for example to quickly find the top I/O consumer when a process spawns many child threads that perform the actual I/O work, because the aggregated I/O statistics can always be found in /proc/pid/io. [ Oleg Nesterov points out that we should check that the task is still alive before we iterate over the threads, but also says that we can do that fixup on top of this later. - Linus ] Acked-by: Balbir Singh Signed-off-by: Andrea Righi Cc: Matt Heaton Cc: Shailabh Nagar Acked-by-with-comments: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 86 ++++++++++++++++++++++++++++++++++++++++++--------- include/linux/sched.h | 4 +++ kernel/exit.c | 27 ++++++++++++++++ kernel/fork.c | 6 ++++ 4 files changed, 108 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 58c3e6a8e15..a891fe4cb43 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2376,29 +2376,82 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, } #ifdef CONFIG_TASK_IO_ACCOUNTING -static int proc_pid_io_accounting(struct task_struct *task, char *buffer) -{ +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + u64 rchar, wchar, syscr, syscw; + struct task_io_accounting ioac; + + if (!whole) { + rchar = task->rchar; + wchar = task->wchar; + syscr = task->syscr; + syscw = task->syscw; + memcpy(&ioac, &task->ioac, sizeof(ioac)); + } else { + unsigned long flags; + struct task_struct *t = task; + rchar = wchar = syscr = syscw = 0; + memset(&ioac, 0, sizeof(ioac)); + + rcu_read_lock(); + do { + rchar += t->rchar; + wchar += t->wchar; + syscr += t->syscr; + syscw += t->syscw; + + ioac.read_bytes += t->ioac.read_bytes; + ioac.write_bytes += t->ioac.write_bytes; + ioac.cancelled_write_bytes += + t->ioac.cancelled_write_bytes; + t = next_thread(t); + } while (t != task); + rcu_read_unlock(); + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + rchar += sig->rchar; + wchar += sig->wchar; + syscr += sig->syscr; + syscw += sig->syscw; + + ioac.read_bytes += sig->ioac.read_bytes; + ioac.write_bytes += sig->ioac.write_bytes; + ioac.cancelled_write_bytes += + sig->ioac.cancelled_write_bytes; + + unlock_task_sighand(task, &flags); + } + } + return sprintf(buffer, -#ifdef CONFIG_TASK_XACCT "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" -#endif "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", -#ifdef CONFIG_TASK_XACCT - (unsigned long long)task->rchar, - (unsigned long long)task->wchar, - (unsigned long long)task->syscr, - (unsigned long long)task->syscw, -#endif - (unsigned long long)task->ioac.read_bytes, - (unsigned long long)task->ioac.write_bytes, - (unsigned long long)task->ioac.cancelled_write_bytes); + (unsigned long long)rchar, + (unsigned long long)wchar, + (unsigned long long)syscr, + (unsigned long long)syscw, + (unsigned long long)ioac.read_bytes, + (unsigned long long)ioac.write_bytes, + (unsigned long long)ioac.cancelled_write_bytes); +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); } -#endif + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ /* * Thread groups @@ -2470,7 +2523,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), #endif 
#ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, pid_io_accounting), + INF("io", S_IRUGO, tgid_io_accounting), #endif }; @@ -2797,6 +2850,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, tid_io_accounting), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/include/linux/sched.h b/include/linux/sched.h index af780f299c7..d22ffe06d0e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -506,6 +506,10 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; +#ifdef CONFIG_TASK_XACCT + u64 rchar, wchar, syscr, syscw; +#endif + struct task_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the diff --git a/kernel/exit.c b/kernel/exit.c index 8a4d4d12e29..ad933bb29ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); +#ifdef CONFIG_TASK_XACCT + sig->rchar += tsk->rchar; + sig->wchar += tsk->wchar; + sig->syscr += tsk->syscr; + sig->syscw += tsk->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + sig->ioac.read_bytes += tsk->ioac.read_bytes; + sig->ioac.write_bytes += tsk->ioac.write_bytes; + sig->ioac.cancelled_write_bytes += + tsk->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; +#ifdef CONFIG_TASK_XACCT + psig->rchar += p->rchar + sig->rchar; + psig->wchar += p->wchar + sig->wchar; + psig->syscr += p->syscr + sig->syscr; + psig->syscw += p->syscw + sig->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + psig->ioac.read_bytes += + p->ioac.read_bytes + sig->ioac.read_bytes; + psig->ioac.write_bytes += + p->ioac.write_bytes + sig->ioac.write_bytes; + psig->ioac.cancelled_write_bytes += + p->ioac.cancelled_write_bytes + + sig->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 813d5c89b9d..b99d73e971a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -812,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; +#ifdef CONFIG_TASK_XACCT + sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + memset(&sig->ioac, 0, sizeof(sig->ioac)); +#endif sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); -- cgit v1.2.3-70-g09d2 From cc77b1521d06be07c9bb1a4a3e1f775dcaa15093 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:48:55 -0700 Subject: lockd: dont return EAGAIN for a permanent error Fix nlm_fopen() to return NLM_FAILED (or NLM_LCK_DENIED_NOLOCKS) instead of NLM_LCK_DENIED. 
The latter means the lock request failed because of a conflicting lock (i.e. a temporary error), which is wrong in this case. Also fix the client to return ENOLCK instead of EAGAIN if a blocking lock request returns with NLM_LOCK_DENIED. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Cc: "J. Bruce Fields" Cc: Matthew Wilcox Cc: David Teigland Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/clntproc.c | 10 +++++++++- fs/nfsd/lockd.c | 13 +++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 1f6dc518505..31668b690e0 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -582,7 +582,15 @@ again: } if (status < 0) goto out_unlock; - status = nlm_stat_to_errno(resp->status); + /* + * EAGAIN doesn't make sense for sleeping locks, and in some + * cases NLM_LCK_DENIED is returned for a permanent error. So + * turn it into an ENOLCK. + */ + if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + status = -ENOLCK; + else + status = nlm_stat_to_errno(resp->status); out_unblock: nlmclnt_finish_block(block); out: diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 6b6225ac492..15c6faeec77 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -19,6 +19,13 @@ #define NFSDDBG_FACILITY NFSDDBG_LOCKD +#ifdef CONFIG_LOCKD_V4 +#define nlm_stale_fh nlm4_stale_fh +#define nlm_failed nlm4_failed +#else +#define nlm_stale_fh nlm_lck_denied_nolocks +#define nlm_failed nlm_lck_denied_nolocks +#endif /* * Note: we hold the dentry use count while the file is open. */ @@ -47,12 +54,10 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) return 0; case nfserr_dropit: return nlm_drop_reply; -#ifdef CONFIG_LOCKD_V4 case nfserr_stale: - return nlm4_stale_fh; -#endif + return nlm_stale_fh; default: - return nlm_lck_denied; + return nlm_failed; } } -- cgit v1.2.3-70-g09d2 From bde74e4bc64415b142e556a34d295a52a1b7da9d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:48:57 -0700 Subject: locks: add special return value for asynchronous locks Use a special error value FILE_LOCK_DEFERRED to mean that a locking operation returned asynchronously. This is returned by posix_lock_file() for sleeping locks to mean that the lock has been queued on the block list, and will be woken up when it might become available and needs to be retried (either fl_lmops->fl_notify() is called or fl_wait is woken up). f_op->lock() to mean either the above, or that the filesystem will call back with fl_lmops->fl_grant() when the result of the locking operation is known. The filesystem can do this for sleeping as well as non-sleeping locks. This is to make sure, that return values of -EAGAIN and -EINPROGRESS by filesystems are not mistaken to mean an asynchronous locking. This also makes error handling in fs/locks.c and lockd/svclock.c slightly cleaner. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Cc: "J. 
Bruce Fields" Cc: Matthew Wilcox Cc: David Teigland Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dlm/plock.c | 2 +- fs/lockd/svclock.c | 13 ++++--------- fs/locks.c | 28 ++++++++++++++-------------- include/linux/fs.h | 6 ++++++ 4 files changed, 25 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 78878c5781c..eba87ff3177 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -116,7 +116,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (xop->callback == NULL) wait_event(recv_wq, (op->done != 0)); else { - rv = -EINPROGRESS; + rv = FILE_LOCK_DEFERRED; goto out; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 821b9acdfb6..cf0d5c2c318 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -418,8 +418,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EAGAIN: ret = nlm_lck_denied; - break; - case -EINPROGRESS: + goto out; + case FILE_LOCK_DEFERRED: if (wait) break; /* Filesystem lock operation is in progress @@ -434,10 +434,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - ret = nlm_lck_denied; - if (!wait) - goto out; - ret = nlm_lck_blocked; /* Append to list of blocked */ @@ -507,7 +503,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, } error = vfs_test_lock(file->f_file, &lock->fl); - if (error == -EINPROGRESS) { + if (error == FILE_LOCK_DEFERRED) { ret = nlmsvc_defer_lock_rqst(rqstp, block); goto out; } @@ -731,8 +727,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) switch (error) { case 0: break; - case -EAGAIN: - case -EINPROGRESS: + case FILE_LOCK_DEFERRED: dprintk("lockd: lock still blocked error %d\n", error); nlmsvc_insert_block(block, NLM_NEVER); nlmsvc_release_block(block); diff --git a/fs/locks.c b/fs/locks.c index dce8c747371..1ce57b4b362 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -779,8 +779,10 @@ find_conflict: if (!flock_locks_conflict(request, fl)) continue; error = -EAGAIN; - if (request->fl_flags & FL_SLEEP) - locks_insert_block(fl, request); + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); goto out; } if (request->fl_flags & FL_ACCESS) @@ -836,7 +838,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str error = -EDEADLK; if (posix_locks_deadlock(request, fl)) goto out; - error = -EAGAIN; + error = FILE_LOCK_DEFERRED; locks_insert_block(fl, request); goto out; } @@ -1035,7 +1037,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) might_sleep (); for (;;) { error = posix_lock_file(filp, fl, NULL); - if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); if (!error) @@ -1107,9 +1109,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, for (;;) { error = __posix_lock_file(inode, &fl, NULL); - if (error != -EAGAIN) - break; - if (!(fl.fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); if (!error) { @@ -1531,7 +1531,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) might_sleep(); for (;;) { error = flock_lock_file(filp, fl); - if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); if (!error) @@ -1716,17 +1716,17 @@ out: * fl_grant is 
set. Callers expecting ->lock() to return asynchronously * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) * the request is for a blocking lock. When ->lock() does return asynchronously, - * it must return -EINPROGRESS, and call ->fl_grant() when the lock + * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock * request completes. * If the request is for non-blocking lock the file system should return - * -EINPROGRESS then try to get the lock and call the callback routine with - * the result. If the request timed out the callback routine will return a + * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine + * with the result. If the request timed out the callback routine will return a * nonzero return code and the file system should release the lock. The file * system is also responsible to keep a corresponding posix lock when it * grants a lock so the VFS can find out which locks are locally held and do * the correct lock cleanup when required. * The underlying filesystem must not drop the kernel lock or call - * ->fl_grant() before returning to the caller with a -EINPROGRESS + * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED * return code. */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) @@ -1804,7 +1804,7 @@ again: else { for (;;) { error = posix_lock_file(filp, file_lock, NULL); - if (error != -EAGAIN || cmd == F_SETLK) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(file_lock->fl_wait, !file_lock->fl_next); @@ -1941,7 +1941,7 @@ again: else { for (;;) { error = posix_lock_file(filp, file_lock, NULL); - if (error != -EAGAIN || cmd == F_SETLK64) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(file_lock->fl_wait, !file_lock->fl_next); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b86f806014..49d8eb7a71b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -885,6 +885,12 @@ static inline int file_check_writeable(struct file *filp) #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ +/* + * Special return value from posix_lock_file() and vfs_lock_file() for + * asynchronous locking. + */ +#define FILE_LOCK_DEFERRED 1 + /* * The POSIX file lock owner is determined by * the "struct files_struct" in the thread group -- cgit v1.2.3-70-g09d2 From b648a6de00770cc325c22f43bdd4e935f6a2ee55 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:48:57 -0700 Subject: locks: cleanup code duplication Extract common code into a function. Signed-off-by: Miklos Szeredi Cc: "J. 
Bruce Fields" Cc: Trond Myklebust Cc: Matthew Wilcox Cc: David Teigland Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 71 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 31 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 1ce57b4b362..6222e4b580e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1738,6 +1738,35 @@ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, str } EXPORT_SYMBOL_GPL(vfs_lock_file); +static int do_lock_file_wait(struct file *filp, unsigned int cmd, + struct file_lock *fl) +{ + int error; + + error = security_file_lock(filp, fl->fl_type); + if (error) + return error; + + if (filp->f_op && filp->f_op->lock != NULL) + error = filp->f_op->lock(filp, cmd, fl); + else { + for (;;) { + error = posix_lock_file(filp, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, + !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + } + + return error; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -1795,26 +1824,7 @@ again: goto out; } - error = security_file_lock(filp, file_lock->fl_type); - if (error) - goto out; - - if (filp->f_op && filp->f_op->lock != NULL) - error = filp->f_op->lock(filp, cmd, file_lock); - else { - for (;;) { - error = posix_lock_file(filp, file_lock, NULL); - if (error != FILE_LOCK_DEFERRED) - break; - error = wait_event_interruptible(file_lock->fl_wait, - !file_lock->fl_next); - if (!error) - continue; - - locks_delete_block(file_lock); - break; - } - } + error = do_lock_file_wait(filp, cmd, file_lock); /* * Attempt to detect a close/fcntl race and recover by @@ -1932,26 +1942,7 @@ again: goto out; } - error = security_file_lock(filp, file_lock->fl_type); - if (error) - goto out; - - if (filp->f_op && filp->f_op->lock != NULL) - error = filp->f_op->lock(filp, cmd, file_lock); - else { - for (;;) { - error = posix_lock_file(filp, file_lock, NULL); - if (error != FILE_LOCK_DEFERRED) - break; - error = wait_event_interruptible(file_lock->fl_wait, - !file_lock->fl_next); - if (!error) - continue; - - locks_delete_block(file_lock); - break; - } - } + error = do_lock_file_wait(filp, cmd, file_lock); /* * Attempt to detect a close/fcntl race and recover by -- cgit v1.2.3-70-g09d2 From 764c76b371722e0cba5c24d91225f0f954b69d44 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:48:58 -0700 Subject: locks: allow ->lock() to return FILE_LOCK_DEFERRED Allow filesystem's ->lock() method to call posix_lock_file() instead of posix_lock_file_wait(), and return FILE_LOCK_DEFERRED. This makes it possible to implement a such a ->lock() function, that works with the lock manager, which needs the call to be asynchronous. Now the vfs_lock_file() helper can be used, so this is a cleanup as well. Signed-off-by: Miklos Szeredi Cc: "J. 
Bruce Fields" Cc: Trond Myklebust Cc: Matthew Wilcox Cc: David Teigland Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 6222e4b580e..01490300f7c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1747,21 +1747,16 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, if (error) return error; - if (filp->f_op && filp->f_op->lock != NULL) - error = filp->f_op->lock(filp, cmd, fl); - else { - for (;;) { - error = posix_lock_file(filp, fl, NULL); - if (error != FILE_LOCK_DEFERRED) - break; - error = wait_event_interruptible(fl->fl_wait, - !fl->fl_next); - if (!error) - continue; - - locks_delete_block(fl); + for (;;) { + error = vfs_lock_file(filp, cmd, fl, NULL); + if (error != FILE_LOCK_DEFERRED) break; - } + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; } return error; -- cgit v1.2.3-70-g09d2 From 0de6256daafa3a97a269995e9b29f956bd419bbf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:48:59 -0700 Subject: fuse: prepare lookup for nfs export Use d_splice_alias() instead of d_add() in fuse lookup code, to allow NFS exporting. Signed-off-by: Miklos Szeredi Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2060bf06b90..e5217b213b4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -239,18 +239,20 @@ int fuse_valid_type(int m) * Add a directory inode to a dentry, ensuring that no other dentry * refers to this inode. Called with fc->inst_mutex. */ -static int fuse_d_add_directory(struct dentry *entry, struct inode *inode) +static struct dentry *fuse_d_add_directory(struct dentry *entry, + struct inode *inode) { struct dentry *alias = d_find_alias(inode); - if (alias) { + if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { /* This tries to shrink the subtree below alias */ fuse_invalidate_entry(alias); dput(alias); if (!list_empty(&inode->i_dentry)) - return -EBUSY; + return ERR_PTR(-EBUSY); + } else { + dput(alias); } - d_add(entry, inode); - return 0; + return d_splice_alias(inode, entry); } static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, @@ -259,6 +261,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, int err; struct fuse_entry_out outarg; struct inode *inode = NULL; + struct dentry *newent; struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_req *req; struct fuse_req *forget_req; @@ -303,21 +306,22 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (inode && S_ISDIR(inode->i_mode)) { mutex_lock(&fc->inst_mutex); - err = fuse_d_add_directory(entry, inode); + newent = fuse_d_add_directory(entry, inode); mutex_unlock(&fc->inst_mutex); - if (err) { + if (IS_ERR(newent)) { iput(inode); - return ERR_PTR(err); + return newent; } } else - d_add(entry, inode); + newent = d_splice_alias(inode, entry); + entry = newent ? 
newent : entry; entry->d_op = &fuse_dentry_operations; if (!err) fuse_change_entry_timeout(entry, &outarg); else fuse_invalidate_entry_cache(entry); - return NULL; + return newent; } /* -- cgit v1.2.3-70-g09d2 From dbd561d236ff16f8143bc727d91758ddd190e8cb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:49:00 -0700 Subject: fuse: add export operations Implement export_operations, to allow fuse filesystems to be exported to NFS. This feature has been in the out-of-tree fuse module, and is widely used and tested. It has not been originally merged into mainline, because doing the NFS export in userspace was thought to be a cleaner and more efficient way of doing it, than through the kernel. While that is true, it would also have involved a lot of duplicated effort at reimplementing NFS exporting (all the different versions of the protocol). This effort was unfortunately not undertaken by anyone, so we are left with doing it the easy but less efficient way. If this feature goes in, the out-of-tree fuse module can go away, which would have several advantages: - not having to maintain two versions - less confusion for users - no bugs due to kernel API changes Comment from hch: - Use the same fh_type values as XFS, since we use the same fh encoding. Signed-off-by: Miklos Szeredi Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 4 +- fs/fuse/fuse_i.h | 4 ++ fs/fuse/inode.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e5217b213b4..be5450dd638 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -97,7 +97,7 @@ void fuse_invalidate_attr(struct inode *inode) * timeout is unknown (unlink, rmdir, rename and in some cases * lookup) */ -static void fuse_invalidate_entry_cache(struct dentry *entry) +void fuse_invalidate_entry_cache(struct dentry *entry) { fuse_dentry_settime(entry, 0); } @@ -225,7 +225,7 @@ static int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } -static struct dentry_operations fuse_dentry_operations = { +struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bae948657c4..5d3146da64e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -464,6 +464,8 @@ static inline u64 get_node_id(struct inode *inode) /** Device operations */ extern const struct file_operations fuse_dev_operations; +extern struct dentry_operations fuse_dentry_operations; + /** * Get a filled in inode */ @@ -604,6 +606,8 @@ void fuse_abort_conn(struct fuse_conn *fc); */ void fuse_invalidate_attr(struct inode *inode); +void fuse_invalidate_entry_cache(struct dentry *entry); + /** * Acquire reference to fuse_conn */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3141690558c..71fa76a48a3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -552,6 +553,119 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) return fuse_iget(sb, 1, 0, &attr, 0, 0); } +struct fuse_inode_handle +{ + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = 
ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) + goto out_err; + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_alloc_anon(inode); + err = -ENOMEM; + if (!entry) + goto out_iput; + + if (get_node_id(inode) != FUSE_ROOT_ID) { + entry->d_op = &fuse_dentry_operations; + fuse_invalidate_entry_cache(entry); + } + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + bool encode_parent = connectable && !S_ISDIR(inode->i_mode); + int len = encode_parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) + return 255; + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (encode_parent) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + spin_unlock(&dentry->d_lock); + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return encode_parent ? 0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, +}; + static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, @@ -652,6 +766,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_export_op = &fuse_export_operations; file = fget(d.fd); if (!file) -- cgit v1.2.3-70-g09d2 From c180eebe1390c2076ead6a9bc95a02efb994edb7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:49:01 -0700 Subject: fuse: add fuse_lookup_name() helper Add a new helper function which sends a LOOKUP request with the supplied name. This will be used by the next patch to send special LOOKUP requests with "." and ".." as the name. 
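To see where this is headed: in the follow-up export patches the helper is called with a literal "." (and "..") to revive an inode, or find a parent, purely by node ID. The fragment below is condensed from the fuse_get_dentry() hunk that appears later in the series; it is kernel context rather than a standalone program, and the error handling is abbreviated.

	struct fuse_entry_out outarg;
	struct inode *inode;
	struct qstr name;
	int err;

	name.len = 1;
	name.name = ".";	/* "look up the node itself" */

	/* nodeid was stored in the NFS file handle */
	err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, &inode);
	if (err && err != -ENOENT)
		return ERR_PTR(err);
	if (err || !inode)
		return ERR_PTR(-ESTALE);	/* server no longer knows the node */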
Signed-off-by: Miklos Szeredi Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dir.c | 117 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index be5450dd638..51d0035ff07 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -112,18 +112,16 @@ static void fuse_invalidate_entry(struct dentry *entry) fuse_invalidate_entry_cache(entry); } -static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, - struct dentry *entry, +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, + u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg) { - struct fuse_conn *fc = get_fuse_conn(dir); - memset(outarg, 0, sizeof(struct fuse_entry_out)); req->in.h.opcode = FUSE_LOOKUP; - req->in.h.nodeid = get_node_id(dir); + req->in.h.nodeid = nodeid; req->in.numargs = 1; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; + req->in.args[0].size = name->len + 1; + req->in.args[0].value = name->name; req->out.numargs = 1; if (fc->minor < 9) req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; @@ -189,7 +187,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(req, parent->d_inode, entry, &outarg); + fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + &entry->d_name, &outarg); request_send(fc, req); dput(parent); err = req->out.h.error; @@ -255,73 +254,111 @@ static struct dentry *fuse_d_add_directory(struct dentry *entry, return d_splice_alias(inode, entry); } -static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, - struct nameidata *nd) +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) { - int err; - struct fuse_entry_out outarg; - struct inode *inode = NULL; - struct dentry *newent; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_req *req; struct fuse_req *forget_req; u64 attr_version; + int err; - if (entry->d_name.len > FUSE_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; req = fuse_get_req(fc); + err = PTR_ERR(req); if (IS_ERR(req)) - return ERR_CAST(req); + goto out; forget_req = fuse_get_req(fc); + err = PTR_ERR(forget_req); if (IS_ERR(forget_req)) { fuse_put_request(fc, req); - return ERR_CAST(forget_req); + goto out; } attr_version = fuse_get_attr_version(fc); - fuse_lookup_init(req, dir, entry, &outarg); + fuse_lookup_init(fc, req, nodeid, name, outarg); request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT, but with valid timeout */ - if (!err && outarg.nodeid && - (invalid_nodeid(outarg.nodeid) || - !fuse_valid_type(outarg.attr.mode))) - err = -EIO; - if (!err && outarg.nodeid) { - inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, entry_attr_timeout(&outarg), - attr_version); - if (!inode) { - fuse_send_forget(fc, forget_req, outarg.nodeid, 1); - return ERR_PTR(-ENOMEM); - } + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (!fuse_valid_type(outarg->attr.mode)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, 
entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_send_forget(fc, forget_req, outarg->nodeid, 1); + goto out; } + err = 0; + + out_put_forget: fuse_put_request(fc, forget_req); - if (err && err != -ENOENT) - return ERR_PTR(err); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + struct fuse_conn *fc = get_fuse_conn(dir); + bool outarg_valid = true; + + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; if (inode && S_ISDIR(inode->i_mode)) { mutex_lock(&fc->inst_mutex); newent = fuse_d_add_directory(entry, inode); mutex_unlock(&fc->inst_mutex); - if (IS_ERR(newent)) { - iput(inode); - return newent; - } - } else + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_iput; + } else { newent = d_splice_alias(inode, entry); + } entry = newent ? newent : entry; entry->d_op = &fuse_dentry_operations; - if (!err) + if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else fuse_invalidate_entry_cache(entry); + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); } /* -- cgit v1.2.3-70-g09d2 From 33670fa296860283f04a7975b8c790f101e43a6e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:49:02 -0700 Subject: fuse: nfs export special lookups Implement the get_parent export operation by sending a LOOKUP request with ".." as the name. Implement looking up an inode by node ID after it has been evicted from the cache. This is done by seding a LOOKUP request with "." as the name (for all file types, not just directories). The filesystem can set the FUSE_EXPORT_SUPPORT flag in the INIT reply, to indicate that it supports these special lookups. Thanks to John Muir for the original implementation of this feature. Signed-off-by: Miklos Szeredi Cc: "J. Bruce Fields" Cc: Trond Myklebust Cc: Matthew Wilcox Cc: David Teigland Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/fuse_i.h | 6 +++++ fs/fuse/inode.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/fuse.h | 3 +++ 3 files changed, 72 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5d3146da64e..3a876076bdd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -363,6 +363,9 @@ struct fuse_conn { /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc : 1; + /** Filesystem supports NFS exporting. 
Only set in INIT */ + unsigned export_support : 1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -473,6 +476,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + /** * Send FORGET command */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 71fa76a48a3..7d2f7d6e22e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -562,6 +562,7 @@ struct fuse_inode_handle static struct dentry *fuse_get_dentry(struct super_block *sb, struct fuse_inode_handle *handle) { + struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *inode; struct dentry *entry; int err = -ESTALE; @@ -570,8 +571,27 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, goto out_err; inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); - if (!inode) - goto out_err; + if (!inode) { + struct fuse_entry_out outarg; + struct qstr name; + + if (!fc->export_support) + goto out_err; + + name.len = 1; + name.name = "."; + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } err = -ESTALE; if (inode->i_generation != handle->generation) goto out_iput; @@ -659,11 +679,46 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, return fuse_get_dentry(sb, &parent); } +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = child->d_inode; + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + struct qstr name; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + name.len = 2; + name.name = ".."; + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &name, &outarg, &inode); + if (err && err != -ENOENT) + return ERR_PTR(err); + if (err || !inode) + return ERR_PTR(-ESTALE); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + if (get_node_id(inode) != FUSE_ROOT_ID) { + parent->d_op = &fuse_dentry_operations; + fuse_invalidate_entry_cache(parent); + } + + return parent; +} static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, }; static const struct super_operations fuse_super_operations = { @@ -695,6 +750,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->no_lock = 1; if (arg->flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (arg->flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } if (arg->flags & FUSE_BIG_WRITES) fc->big_writes = 1; } else { @@ -721,7 +781,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | - FUSE_BIG_WRITES; + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); 
diff --git a/include/linux/fuse.h b/include/linux/fuse.h index d4828219769..265635dc990 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -104,11 +104,14 @@ struct fuse_file_lock { /** * INIT request/reply flags + * + * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) #define FUSE_FILE_OPS (1 << 2) #define FUSE_ATOMIC_O_TRUNC (1 << 3) +#define FUSE_EXPORT_SUPPORT (1 << 4) #define FUSE_BIG_WRITES (1 << 5) /** -- cgit v1.2.3-70-g09d2 From 48e90761b570ff57f58b726229d229729949c5bb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:49:02 -0700 Subject: fuse: lockd support If a fuse filesystem doesn't define its own lock operations, then allow the lock manager to work with fuse. Adding lockd support for remote locking is also possible, but more rarely used, so leave it till later. Signed-off-by: Miklos Szeredi Cc: "J. Bruce Fields" Cc: Trond Myklebust Cc: Matthew Wilcox Cc: David Teigland Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8092f0d9fd1..67ff2c6a8f6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1341,6 +1341,11 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; int err; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + /* Unlock on close is handled by the flush method */ if (fl->fl_flags & FL_CLOSE) return 0; @@ -1365,7 +1370,9 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (cmd == F_GETLK) { + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { if (fc->no_lock) { posix_test_lock(file, fl); err = 0; @@ -1373,7 +1380,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) err = fuse_getlk(file, fl); } else { if (fc->no_lock) - err = posix_lock_file_wait(file, fl); + err = posix_lock_file(file, fl, NULL); else err = fuse_setlk(file, fl, 0); } -- cgit v1.2.3-70-g09d2 From f3c6ba986ab4527b6dfacf9f3b9e40f72466a8b2 Mon Sep 17 00:00:00 2001 From: Kentaro Makita Date: Fri, 25 Jul 2008 19:44:40 -0700 Subject: vfs: add cond_resched_lock while scanning dentry LRU lists Add cond_resched_lock(&dcache_lock) while scanning LRU lists on superblocks in __shrink_dcache_sb() Signed-off-by: Kentaro Makita Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 3818d6ab76c..f2584d22cb4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -487,6 +487,7 @@ restart: if (!cnt) break; } + cond_resched_lock(&dcache_lock); } } while (!list_empty(&tmp)) { -- cgit v1.2.3-70-g09d2 From 7d135a5d50a08bbc53b189d79d8bdb03136f5303 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 25 Jul 2008 19:44:51 -0700 Subject: affs: convert s_bmlock into a mutex The semaphore s_bmlock is used as a mutex. Convert it to the mutex API.
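The change is a mechanical application of the usual semaphore-to-mutex conversion: init_MUTEX() becomes mutex_init(), and down()/up() pairs become mutex_lock()/mutex_unlock(). A rough sketch of that pattern, using a hypothetical foo_sb_info rather than the affs structures touched below:

/* Minimal sketch of the conversion pattern (hypothetical structure). */
#include <linux/mutex.h>

struct foo_sb_info {
        struct mutex s_lock;            /* was: struct semaphore s_lock; */
        unsigned long s_free_blocks;
};

static void foo_init_sb(struct foo_sb_info *sbi)
{
        mutex_init(&sbi->s_lock);       /* was: init_MUTEX(&sbi->s_lock); */
}

static unsigned long foo_count_free(struct foo_sb_info *sbi)
{
        unsigned long free;

        mutex_lock(&sbi->s_lock);       /* was: down(&sbi->s_lock); */
        free = sbi->s_free_blocks;
        mutex_unlock(&sbi->s_lock);     /* was: up(&sbi->s_lock); */
        return free;
}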
Signed-off-by: Matthias Kaehlcke Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/affs.h | 3 ++- fs/affs/bitmap.c | 18 +++++++++--------- fs/affs/super.c | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 223b1917093..e9ec915f755 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -2,6 +2,7 @@ #include #include #include +#include /* AmigaOS allows file names with up to 30 characters length. * Names longer than that will be silently truncated. If you @@ -98,7 +99,7 @@ struct affs_sb_info { gid_t s_gid; /* gid to override */ umode_t s_mode; /* mode to override */ struct buffer_head *s_root_bh; /* Cached root block. */ - struct semaphore s_bmlock; /* Protects bitmap access. */ + struct mutex s_bmlock; /* Protects bitmap access. */ struct affs_bm_info *s_bitmap; /* Bitmap infos. */ u32 s_bmap_count; /* # of bitmap blocks. */ u32 s_bmap_bits; /* # of bits in one bitmap blocks */ diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index c4a5ad09ddf..dc5ef14bdc1 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -45,14 +45,14 @@ affs_count_free_blocks(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - down(&AFFS_SB(sb)->s_bmlock); + mutex_lock(&AFFS_SB(sb)->s_bmlock); bm = AFFS_SB(sb)->s_bitmap; free = 0; for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--) free += bm->bm_free; - up(&AFFS_SB(sb)->s_bmlock); + mutex_unlock(&AFFS_SB(sb)->s_bmlock); return free; } @@ -76,7 +76,7 @@ affs_free_block(struct super_block *sb, u32 block) bit = blk % sbi->s_bmap_bits; bm = &sbi->s_bitmap[bmap]; - down(&sbi->s_bmlock); + mutex_lock(&sbi->s_bmlock); bh = sbi->s_bmap_bh; if (sbi->s_last_bmap != bmap) { @@ -105,19 +105,19 @@ affs_free_block(struct super_block *sb, u32 block) sb->s_dirt = 1; bm->bm_free++; - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); return; err_free: affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block); - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); return; err_bh_read: affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key); sbi->s_bmap_bh = NULL; sbi->s_last_bmap = ~0; - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); return; err_range: @@ -168,7 +168,7 @@ affs_alloc_block(struct inode *inode, u32 goal) bmap = blk / sbi->s_bmap_bits; bm = &sbi->s_bitmap[bmap]; - down(&sbi->s_bmlock); + mutex_lock(&sbi->s_bmlock); if (bm->bm_free) goto find_bmap_bit; @@ -249,7 +249,7 @@ find_bit: mark_buffer_dirty(bh); sb->s_dirt = 1; - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); pr_debug("%d\n", blk); return blk; @@ -259,7 +259,7 @@ err_bh_read: sbi->s_bmap_bh = NULL; sbi->s_last_bmap = ~0; err_full: - up(&sbi->s_bmlock); + mutex_unlock(&sbi->s_bmlock); pr_debug("failed\n"); return 0; } diff --git a/fs/affs/super.c b/fs/affs/super.c index d214837d5e4..4e030956640 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -290,7 +290,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - init_MUTEX(&sbi->s_bmlock); + mutex_init(&sbi->s_bmlock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, -- cgit v1.2.3-70-g09d2 From 75b25b4cabb7ce956c36442bf8225659b1864866 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Fri, 25 Jul 2008 19:44:52 -0700 Subject: bfs: assorted cleanups This patch makes the following cleanups: o removing an unused variable from bfs_fill_super(); o removing unneeded 
blank spaces from pointer definitions. Signed-off-by: Dmitri Vorobiev Cc: Tigran Aivazian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bfs/bfs.h | 4 ++-- fs/bfs/inode.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index 70f5d3a8eed..d1a6cd10afc 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -16,8 +16,8 @@ struct bfs_sb_info { unsigned long si_freei; unsigned long si_lf_eblk; unsigned long si_lasti; - unsigned long * si_imap; - struct buffer_head * si_sbh; /* buffer header w/superblock */ + unsigned long *si_imap; + struct buffer_head *si_sbh; /* buffer header w/superblock */ }; /* diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8db623838b5..c8b3982e11c 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -380,7 +380,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct bfs_inode *di; int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; - unsigned long sblock, eblock; + unsigned long eblock; if (!off) { brelse(bh); @@ -399,7 +399,6 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) set_bit(i, info->si_imap); info->si_freeb -= BFS_FILEBLOCKS(di); - sblock = le32_to_cpu(di->i_sblock); eblock = le32_to_cpu(di->i_eblock); if (eblock > info->si_lf_eblk) info->si_lf_eblk = eblock; -- cgit v1.2.3-70-g09d2 From 3f165e4cf2af042af7d2440d688299c0d2a48b1f Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Fri, 25 Jul 2008 19:44:54 -0700 Subject: bfs: kill BKL Replace the BKL-based locking scheme used in the bfs driver by a private filesystem-wide mutex. Signed-off-by: Dmitri Vorobiev Cc: Tigran Aivazian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bfs/bfs.h | 1 + fs/bfs/dir.c | 46 ++++++++++++++++++++++++++-------------------- fs/bfs/file.c | 4 ++-- fs/bfs/inode.c | 24 +++++++++++++++--------- 4 files changed, 44 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index d1a6cd10afc..7109e451abf 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -18,6 +18,7 @@ struct bfs_sb_info { unsigned long si_lasti; unsigned long *si_imap; struct buffer_head *si_sbh; /* buffer header w/superblock */ + struct mutex bfs_lock; }; /* diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 034950cb3cb..87ee5ccee34 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -32,16 +32,17 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) struct inode *dir = f->f_path.dentry->d_inode; struct buffer_head *bh; struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); unsigned int offset; int block; - lock_kernel(); + mutex_lock(&info->bfs_lock); if (f->f_pos & (BFS_DIRENT_SIZE - 1)) { printf("Bad f_pos=%08lx for %s:%08lx\n", (unsigned long)f->f_pos, dir->i_sb->s_id, dir->i_ino); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return -EBADF; } @@ -61,7 +62,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) le16_to_cpu(de->ino), DT_UNKNOWN) < 0) { brelse(bh); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } } @@ -71,7 +72,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) brelse(bh); } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } @@ -95,10 +96,10 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, inode = new_inode(s); if (!inode) return -ENOSPC; - lock_kernel(); + mutex_lock(&info->bfs_lock); ino = find_first_zero_bit(info->si_imap, 
info->si_lasti); if (ino > info->si_lasti) { - unlock_kernel(); + mutex_unlock(&info->bfs_lock); iput(inode); return -ENOSPC; } @@ -125,10 +126,10 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, if (err) { inode_dec_link_count(inode); iput(inode); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return err; } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); d_instantiate(dentry, inode); return 0; } @@ -139,22 +140,23 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct buffer_head *bh; struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(dir->i_sb); if (dentry->d_name.len > BFS_NAMELEN) return ERR_PTR(-ENAMETOOLONG); - lock_kernel(); + mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); if (bh) { unsigned long ino = (unsigned long)le16_to_cpu(de->ino); brelse(bh); inode = bfs_iget(dir->i_sb, ino); if (IS_ERR(inode)) { - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return ERR_CAST(inode); } } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); d_add(dentry, inode); return NULL; } @@ -163,13 +165,14 @@ static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { struct inode *inode = old->d_inode; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; - lock_kernel(); + mutex_lock(&info->bfs_lock); err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, inode->i_ino); if (err) { - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return err; } inc_nlink(inode); @@ -177,19 +180,19 @@ static int bfs_link(struct dentry *old, struct inode *dir, mark_inode_dirty(inode); atomic_inc(&inode->i_count); d_instantiate(new, inode); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; - struct inode *inode; + struct inode *inode = dentry->d_inode; struct buffer_head *bh; struct bfs_dirent *de; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); - inode = dentry->d_inode; - lock_kernel(); + mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de); if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) goto out_brelse; @@ -210,7 +213,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) out_brelse: brelse(bh); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return error; } @@ -220,6 +223,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; struct bfs_dirent *old_de, *new_de; + struct bfs_sb_info *info; int error = -ENOENT; old_bh = new_bh = NULL; @@ -227,7 +231,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode)) return -EINVAL; - lock_kernel(); + info = BFS_SB(old_inode->i_sb); + + mutex_lock(&info->bfs_lock); old_bh = bfs_find_entry(old_dir, old_dentry->d_name.name, old_dentry->d_name.len, &old_de); @@ -264,7 +270,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, error = 0; end_rename: - unlock_kernel(); + mutex_unlock(&info->bfs_lock); brelse(old_bh); brelse(new_bh); return error; diff --git a/fs/bfs/file.c b/fs/bfs/file.c index b11e63e8fbc..6a021265f01 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -99,7 +99,7 @@ static int bfs_get_block(struct inode *inode, sector_t block, return -ENOSPC; /* The rest has to be protected against itself. 
*/ - lock_kernel(); + mutex_lock(&info->bfs_lock); /* * If the last data block for this file is the last allocated @@ -151,7 +151,7 @@ static int bfs_get_block(struct inode *inode, sector_t block, mark_buffer_dirty(sbh); map_bh(bh_result, sb, phys); out: - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return err; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index c8b3982e11c..053e690ec9e 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -104,6 +104,7 @@ static int bfs_write_inode(struct inode *inode, int unused) struct bfs_inode *di; struct buffer_head *bh; int block, off; + struct bfs_sb_info *info = BFS_SB(inode->i_sb); dprintf("ino=%08x\n", ino); @@ -112,13 +113,13 @@ static int bfs_write_inode(struct inode *inode, int unused) return -EIO; } - lock_kernel(); + mutex_lock(&info->bfs_lock); block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; bh = sb_bread(inode->i_sb, block); if (!bh) { printf("Unable to read inode %s:%08x\n", inode->i_sb->s_id, ino); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return -EIO; } @@ -145,7 +146,7 @@ static int bfs_write_inode(struct inode *inode, int unused) mark_buffer_dirty(bh); brelse(bh); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return 0; } @@ -170,7 +171,7 @@ static void bfs_delete_inode(struct inode *inode) inode->i_size = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - lock_kernel(); + mutex_lock(&info->bfs_lock); mark_inode_dirty(inode); block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; @@ -178,7 +179,7 @@ static void bfs_delete_inode(struct inode *inode) if (!bh) { printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino); - unlock_kernel(); + mutex_unlock(&info->bfs_lock); return; } off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; @@ -204,14 +205,16 @@ static void bfs_delete_inode(struct inode *inode) info->si_lf_eblk = bi->i_sblock - 1; mark_buffer_dirty(info->si_sbh); } - unlock_kernel(); + mutex_unlock(&info->bfs_lock); clear_inode(inode); } static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); + brelse(info->si_sbh); + mutex_destroy(&info->bfs_lock); kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; @@ -236,11 +239,13 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void bfs_write_super(struct super_block *s) { - lock_kernel(); + struct bfs_sb_info *info = BFS_SB(s); + + mutex_lock(&info->bfs_lock); if (!(s->s_flags & MS_RDONLY)) - mark_buffer_dirty(BFS_SB(s)->si_sbh); + mark_buffer_dirty(info->si_sbh); s->s_dirt = 0; - unlock_kernel(); + mutex_unlock(&info->bfs_lock); } static struct kmem_cache *bfs_inode_cachep; @@ -409,6 +414,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_dirt = 1; } dump_imap("read_super", s); + mutex_init(&info->bfs_lock); return 0; out: -- cgit v1.2.3-70-g09d2 From 1b002d7b173ae7cc15ed90d3c07f6d106babc510 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Fri, 25 Jul 2008 19:45:15 -0700 Subject: omfs: define filesystem structures Add header files containing OMFS on-disk and memory structures. 
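The on-disk structures below keep every multi-byte field in big-endian form (__be32/__be64), so in-kernel code converts on each access with the byteorder helpers. A minimal sketch of that access pattern, using a hypothetical record type rather than the OMFS structures themselves:

/* Hypothetical on-disk record; fields are fixed-width and big-endian. */
#include <linux/types.h>
#include <asm/byteorder.h>

struct example_disk_record {
        __be64 r_self;          /* block number where this record lives */
        __be32 r_body_size;     /* payload bytes following the header */
};

static u64 example_record_self(const struct example_disk_record *r)
{
        /* on-disk (big-endian) -> CPU byte order before use */
        return be64_to_cpu(r->r_self);
}

static void example_record_set_size(struct example_disk_record *r, u32 size)
{
        /* CPU byte order -> on-disk (big-endian) before writing back */
        r->r_body_size = cpu_to_be32(size);
}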
Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/omfs.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++ fs/omfs/omfs_fs.h | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 fs/omfs/omfs.h create mode 100644 fs/omfs/omfs_fs.h (limited to 'fs') diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h new file mode 100644 index 00000000000..2bc0f067040 --- /dev/null +++ b/fs/omfs/omfs.h @@ -0,0 +1,67 @@ +#ifndef _OMFS_H +#define _OMFS_H + +#include +#include + +#include "omfs_fs.h" + +/* In-memory structures */ +struct omfs_sb_info { + u64 s_num_blocks; + u64 s_bitmap_ino; + u64 s_root_ino; + u32 s_blocksize; + u32 s_mirrors; + u32 s_sys_blocksize; + u32 s_clustersize; + int s_block_shift; + unsigned long **s_imap; + int s_imap_size; + struct mutex s_bitmap_lock; + int s_uid; + int s_gid; + int s_dmask; + int s_fmask; +}; + +/* convert a cluster number to a scaled block number */ +static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block) +{ + return block << sbi->s_block_shift; +} + +static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* bitmap.c */ +extern unsigned long omfs_count_free(struct super_block *sb); +extern int omfs_allocate_block(struct super_block *sb, u64 block); +extern int omfs_allocate_range(struct super_block *sb, int min_request, + int max_request, u64 *return_block, int *return_size); +extern int omfs_clear_range(struct super_block *sb, u64 block, int count); + +/* dir.c */ +extern struct file_operations omfs_dir_operations; +extern struct inode_operations omfs_dir_inops; +extern int omfs_make_empty(struct inode *inode, struct super_block *sb); +extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock); + +/* file.c */ +extern struct file_operations omfs_file_operations; +extern struct inode_operations omfs_file_inops; +extern struct address_space_operations omfs_aops; +extern void omfs_make_empty_table(struct buffer_head *bh, int offset); +extern int omfs_shrink_inode(struct inode *inode); + +/* inode.c */ +extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); +extern struct inode *omfs_new_inode(struct inode *dir, int mode); +extern int omfs_reserve_block(struct super_block *sb, sector_t block); +extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); +extern int omfs_sync_inode(struct inode *inode); + +#endif diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h new file mode 100644 index 00000000000..12cca245d6e --- /dev/null +++ b/fs/omfs/omfs_fs.h @@ -0,0 +1,80 @@ +#ifndef _OMFS_FS_H +#define _OMFS_FS_H + +/* OMFS On-disk structures */ + +#define OMFS_MAGIC 0xC2993D87 +#define OMFS_IMAGIC 0xD2 + +#define OMFS_DIR 'D' +#define OMFS_FILE 'F' +#define OMFS_INODE_NORMAL 'e' +#define OMFS_INODE_CONTINUATION 'c' +#define OMFS_INODE_SYSTEM 's' +#define OMFS_NAMELEN 256 +#define OMFS_DIR_START 0x1b8 +#define OMFS_EXTENT_START 0x1d0 +#define OMFS_EXTENT_CONT 0x40 +#define OMFS_XOR_COUNT 19 +#define OMFS_MAX_BLOCK_SIZE 8192 + +struct omfs_super_block { + char s_fill1[256]; + __be64 s_root_block; /* block number of omfs_root_block */ + __be64 s_num_blocks; /* total number of FS blocks */ + __be32 s_magic; /* OMFS_MAGIC */ + __be32 s_blocksize; /* size of a block */ + __be32 s_mirrors; /* # of mirrors of system blocks */ + __be32 s_sys_blocksize; /* size of non-data blocks */ +}; + +struct omfs_header { + __be64 h_self; /* FS block where 
this is located */ + __be32 h_body_size; /* size of useful data after header */ + __be16 h_crc; /* crc-ccitt of body_size bytes */ + char h_fill1[2]; + u8 h_version; /* version, always 1 */ + char h_type; /* OMFS_INODE_X */ + u8 h_magic; /* OMFS_IMAGIC */ + u8 h_check_xor; /* XOR of header bytes before this */ + __be32 h_fill2; +}; + +struct omfs_root_block { + struct omfs_header r_head; /* header */ + __be64 r_fill1; + __be64 r_num_blocks; /* total number of FS blocks */ + __be64 r_root_dir; /* block # of root directory */ + __be64 r_bitmap; /* block # of free space bitmap */ + __be32 r_blocksize; /* size of a block */ + __be32 r_clustersize; /* size allocated for data blocks */ + __be64 r_mirrors; /* # of mirrors of system blocks */ + char r_name[OMFS_NAMELEN]; /* partition label */ +}; + +struct omfs_inode { + struct omfs_header i_head; /* header */ + __be64 i_parent; /* parent containing this inode */ + __be64 i_sibling; /* next inode in hash bucket */ + __be64 i_ctime; /* ctime, in milliseconds */ + char i_fill1[35]; + char i_type; /* OMFS_[DIR,FILE] */ + __be32 i_fill2; + char i_fill3[64]; + char i_name[OMFS_NAMELEN]; /* filename */ + __be64 i_size; /* size of file, in bytes */ +}; + +struct omfs_extent_entry { + __be64 e_cluster; /* start location of a set of blocks */ + __be64 e_blocks; /* number of blocks after e_cluster */ +}; + +struct omfs_extent { + __be64 e_next; /* next extent table location */ + __be32 e_extent_count; /* total # extents in this table */ + __be32 e_fill; + struct omfs_extent_entry e_entry; /* start of extent entries */ +}; + +#endif -- cgit v1.2.3-70-g09d2 From 555e3775ced1d05203934fc6529bbf0560dd8733 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Fri, 25 Jul 2008 19:45:15 -0700 Subject: omfs: add inode routines Add basic superblock and inode handling routines for OMFS Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/inode.c | 553 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 fs/omfs/inode.c (limited to 'fs') diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c new file mode 100644 index 00000000000..d865f553543 --- /dev/null +++ b/fs/omfs/inode.c @@ -0,0 +1,553 @@ +/* + * Optimized MPEG FS - inode and super operations. + * Copyright (C) 2006 Bob Copeland + * Released under GPL v2. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "omfs.h" + +MODULE_AUTHOR("Bob Copeland "); +MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); +MODULE_LICENSE("GPL"); + +struct inode *omfs_new_inode(struct inode *dir, int mode) +{ + struct inode *inode; + u64 new_block; + int err; + int len; + struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, + &new_block, &len); + if (err) + goto fail; + + inode->i_ino = new_block; + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blocks = 0; + inode->i_mapping->a_ops = &omfs_aops; + + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = sbi->s_sys_blocksize; + inc_nlink(inode); + break; + case S_IFREG: + inode->i_op = &omfs_file_inops; + inode->i_fop = &omfs_file_operations; + inode->i_size = 0; + break; + } + + insert_inode_hash(inode); + mark_inode_dirty(inode); + return inode; +fail: + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); +} + +/* + * Update the header checksums for a dirty inode based on its contents. + * Caller is expected to hold the buffer head underlying oi and mark it + * dirty. + */ +static void omfs_update_checksums(struct omfs_inode *oi) +{ + int xor, i, ofs = 0, count; + u16 crc = 0; + unsigned char *ptr = (unsigned char *) oi; + + count = be32_to_cpu(oi->i_head.h_body_size); + ofs = sizeof(struct omfs_header); + + crc = crc_itu_t(crc, ptr + ofs, count); + oi->i_head.h_crc = cpu_to_be16(crc); + + xor = ptr[0]; + for (i = 1; i < OMFS_XOR_COUNT; i++) + xor ^= ptr[i]; + + oi->i_head.h_check_xor = xor; +} + +static int omfs_write_inode(struct inode *inode, int wait) +{ + struct omfs_inode *oi; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct buffer_head *bh, *bh2; + unsigned int block; + u64 ctime; + int i; + int ret = -EIO; + int sync_failed = 0; + + /* get current inode since we may have written sibling ptrs etc. 
*/ + block = clus_to_blk(sbi, inode->i_ino); + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + if (S_ISDIR(inode->i_mode)) + oi->i_type = OMFS_DIR; + else if (S_ISREG(inode->i_mode)) + oi->i_type = OMFS_FILE; + else { + printk(KERN_WARNING "omfs: unknown file type: %d\n", + inode->i_mode); + goto out_brelse; + } + + oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - + sizeof(struct omfs_header)); + oi->i_head.h_version = 1; + oi->i_head.h_type = OMFS_INODE_NORMAL; + oi->i_head.h_magic = OMFS_IMAGIC; + oi->i_size = cpu_to_be64(inode->i_size); + + ctime = inode->i_ctime.tv_sec * 1000LL + + ((inode->i_ctime.tv_nsec + 999)/1000); + oi->i_ctime = cpu_to_be64(ctime); + + omfs_update_checksums(oi); + + mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + sync_failed = 1; + } + + /* if mirroring writes, copy to next fsblock */ + for (i = 1; i < sbi->s_mirrors; i++) { + bh2 = sb_bread(inode->i_sb, block + i * + (sbi->s_blocksize / sbi->s_sys_blocksize)); + if (!bh2) + goto out_brelse; + + memcpy(bh2->b_data, bh->b_data, bh->b_size); + mark_buffer_dirty(bh2); + if (wait) { + sync_dirty_buffer(bh2); + if (buffer_req(bh2) && !buffer_uptodate(bh2)) + sync_failed = 1; + } + brelse(bh2); + } + ret = (sync_failed) ? -EIO : 0; +out_brelse: + brelse(bh); +out: + return ret; +} + +int omfs_sync_inode(struct inode *inode) +{ + return omfs_write_inode(inode, 1); +} + +/* + * called when an entry is deleted, need to clear the bits in the + * bitmaps. + */ +static void omfs_delete_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + + if (S_ISREG(inode->i_mode)) { + inode->i_size = 0; + omfs_shrink_inode(inode); + } + + omfs_clear_range(inode->i_sb, inode->i_ino, 2); + clear_inode(inode); +} + +struct inode *omfs_iget(struct super_block *sb, ino_t ino) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct omfs_inode *oi; + struct buffer_head *bh; + unsigned int block; + u64 ctime; + unsigned long nsecs; + struct inode *inode; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + block = clus_to_blk(sbi, ino); + bh = sb_bread(inode->i_sb, block); + if (!bh) + goto iget_failed; + + oi = (struct omfs_inode *)bh->b_data; + + /* check self */ + if (ino != be64_to_cpu(oi->i_head.h_self)) + goto fail_bh; + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + + ctime = be64_to_cpu(oi->i_ctime); + nsecs = do_div(ctime, 1000) * 1000L; + + inode->i_atime.tv_sec = ctime; + inode->i_mtime.tv_sec = ctime; + inode->i_ctime.tv_sec = ctime; + inode->i_atime.tv_nsec = nsecs; + inode->i_mtime.tv_nsec = nsecs; + inode->i_ctime.tv_nsec = nsecs; + + inode->i_mapping->a_ops = &omfs_aops; + + switch (oi->i_type) { + case OMFS_DIR: + inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); + inode->i_op = &omfs_dir_inops; + inode->i_fop = &omfs_dir_operations; + inode->i_size = be32_to_cpu(oi->i_head.h_body_size) + + sizeof(struct omfs_header); + inc_nlink(inode); + break; + case OMFS_FILE: + inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); + inode->i_fop = &omfs_file_operations; + inode->i_size = be64_to_cpu(oi->i_size); + break; + } + brelse(bh); + unlock_new_inode(inode); + return inode; +fail_bh: + brelse(bh); +iget_failed: + iget_failed(inode); + return ERR_PTR(-EIO); +} + +static void omfs_put_super(struct super_block *sb) +{ + struct omfs_sb_info *sbi = 
OMFS_SB(sb); + kfree(sbi->s_imap); + kfree(sbi); + sb->s_fs_info = NULL; +} + +static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *s = dentry->d_sb; + struct omfs_sb_info *sbi = OMFS_SB(s); + buf->f_type = OMFS_MAGIC; + buf->f_bsize = sbi->s_blocksize; + buf->f_blocks = sbi->s_num_blocks; + buf->f_files = sbi->s_num_blocks; + buf->f_namelen = OMFS_NAMELEN; + + buf->f_bfree = buf->f_bavail = buf->f_ffree = + omfs_count_free(s); + return 0; +} + +static struct super_operations omfs_sops = { + .write_inode = omfs_write_inode, + .delete_inode = omfs_delete_inode, + .put_super = omfs_put_super, + .statfs = omfs_statfs, + .show_options = generic_show_options, +}; + +/* + * For Rio Karma, there is an on-disk free bitmap whose location is + * stored in the root block. For ReplayTV, there is no such free bitmap + * so we have to walk the tree. Both inodes and file data are allocated + * from the same map. This array can be big (300k) so we allocate + * in units of the blocksize. + */ +static int omfs_get_imap(struct super_block *sb) +{ + int bitmap_size; + int array_size; + int count; + struct omfs_sb_info *sbi = OMFS_SB(sb); + struct buffer_head *bh; + unsigned long **ptr; + sector_t block; + + bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); + array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); + + if (sbi->s_bitmap_ino == ~0ULL) + goto out; + + sbi->s_imap_size = array_size; + sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); + if (!sbi->s_imap) + goto nomem; + + block = clus_to_blk(sbi, sbi->s_bitmap_ino); + ptr = sbi->s_imap; + for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { + bh = sb_bread(sb, block++); + if (!bh) + goto nomem_free; + *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); + if (!*ptr) { + brelse(bh); + goto nomem_free; + } + memcpy(*ptr, bh->b_data, sb->s_blocksize); + if (count < sb->s_blocksize) + memset((void *)*ptr + count, 0xff, + sb->s_blocksize - count); + brelse(bh); + ptr++; + } +out: + return 0; + +nomem_free: + for (count = 0; count < array_size; count++) + kfree(sbi->s_imap[count]); + + kfree(sbi->s_imap); +nomem: + sbi->s_imap = NULL; + sbi->s_imap_size = 0; + return -ENOMEM; +} + +enum { + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask +}; + +static match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%o"}, + {Opt_dmask, "dmask=%o"}, + {Opt_fmask, "fmask=%o"}, +}; + +static int parse_options(char *options, struct omfs_sb_info *sbi) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_uid = option; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_gid = option; + break; + case Opt_umask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = sbi->s_dmask = option; + break; + case Opt_dmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_dmask = option; + break; + case Opt_fmask: + if (match_octal(&args[0], &option)) + return 0; + sbi->s_fmask = option; + break; + default: + return 0; + } + } + return 1; +} + +static int omfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct buffer_head *bh, *bh2; + struct omfs_super_block *omfs_sb; + struct omfs_root_block *omfs_rb; + struct omfs_sb_info *sbi; + struct inode *root; + 
sector_t start; + int ret = -EINVAL; + + save_mount_options(sb, (char *) data); + + sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + sbi->s_uid = current->uid; + sbi->s_gid = current->gid; + sbi->s_dmask = sbi->s_fmask = current->fs->umask; + + if (!parse_options((char *) data, sbi)) + goto end; + + sb->s_maxbytes = 0xffffffff; + + sb_set_blocksize(sb, 0x200); + + bh = sb_bread(sb, 0); + if (!bh) + goto end; + + omfs_sb = (struct omfs_super_block *)bh->b_data; + + if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { + if (!silent) + printk(KERN_ERR "omfs: Invalid superblock (%x)\n", + omfs_sb->s_magic); + goto out_brelse_bh; + } + sb->s_magic = OMFS_MAGIC; + + sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); + sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); + sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); + sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); + sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); + mutex_init(&sbi->s_bitmap_lock); + + if (sbi->s_sys_blocksize > PAGE_SIZE) { + printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", + sbi->s_sys_blocksize); + goto out_brelse_bh; + } + + if (sbi->s_blocksize < sbi->s_sys_blocksize || + sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "omfs: block size (%d) is out of range\n", + sbi->s_blocksize); + goto out_brelse_bh; + } + + /* + * Use sys_blocksize as the fs block since it is smaller than a + * page while the fs blocksize can be larger. + */ + sb_set_blocksize(sb, sbi->s_sys_blocksize); + + /* + * ...and the difference goes into a shift. sys_blocksize is always + * a power of two factor of blocksize. + */ + sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - + get_bitmask_order(sbi->s_sys_blocksize); + + start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block)); + bh2 = sb_bread(sb, start); + if (!bh2) + goto out_brelse_bh; + + omfs_rb = (struct omfs_root_block *)bh2->b_data; + + sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); + sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); + + if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { + printk(KERN_ERR "omfs: block count discrepancy between " + "super and root blocks (%llx, %llx)\n", + sbi->s_num_blocks, be64_to_cpu(omfs_rb->r_num_blocks)); + goto out_brelse_bh2; + } + + ret = omfs_get_imap(sb); + if (ret) + goto out_brelse_bh2; + + sb->s_op = &omfs_sops; + + root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_brelse_bh2; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + goto out_brelse_bh2; + } + printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); + + ret = 0; +out_brelse_bh2: + brelse(bh2); +out_brelse_bh: + brelse(bh); +end: + return ret; +} + +static int omfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *m) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); +} + +static struct file_system_type omfs_fs_type = { + .owner = THIS_MODULE, + .name = "omfs", + .get_sb = omfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_omfs_fs(void) +{ + return register_filesystem(&omfs_fs_type); +} + +static void __exit exit_omfs_fs(void) +{ + unregister_filesystem(&omfs_fs_type); +} + +module_init(init_omfs_fs); +module_exit(exit_omfs_fs); -- cgit v1.2.3-70-g09d2 From a3ab7155ea21aadc8a4d5687e91b3d876973185e Mon Sep 
17 00:00:00 2001 From: Bob Copeland Date: Fri, 25 Jul 2008 19:45:16 -0700 Subject: omfs: add directory routines Add lookup and directory management routines for OMFS. The filesystem uses hashing based on the filename and stores collisions, unordered, in siblings of files' inode structures. To support telldir, the current position in the hash table is encoded in fpos. Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/dir.c | 504 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 504 insertions(+) create mode 100644 fs/omfs/dir.c (limited to 'fs') diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c new file mode 100644 index 00000000000..05a5bc31e4b --- /dev/null +++ b/fs/omfs/dir.c @@ -0,0 +1,504 @@ +/* + * OMFS (as used by RIO Karma) directory operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. + */ + +#include +#include +#include +#include "omfs.h" + +static int omfs_hash(const char *name, int namelen, int mod) +{ + int i, hash = 0; + for (i = 0; i < namelen; i++) + hash ^= tolower(name[i]) << (i % 24); + return hash % mod; +} + +/* + * Finds the bucket for a given name and reads the containing block; + * *ofs is set to the offset of the first list entry. + */ +static struct buffer_head *omfs_get_bucket(struct inode *dir, + const char *name, int namelen, int *ofs) +{ + int nbuckets = (dir->i_size - OMFS_DIR_START)/8; + int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino); + int bucket = omfs_hash(name, namelen, nbuckets); + + *ofs = OMFS_DIR_START + bucket * 8; + return sb_bread(dir->i_sb, block); +} + +static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block, + const char *name, int namelen, + u64 *prev_block) +{ + struct buffer_head *bh; + struct omfs_inode *oi; + int err = -ENOENT; + *prev_block = ~0; + + while (block != ~0) { + bh = sb_bread(dir->i_sb, + clus_to_blk(OMFS_SB(dir->i_sb), block)); + if (!bh) { + err = -EIO; + goto err; + } + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) { + brelse(bh); + goto err; + } + + if (strncmp(oi->i_name, name, namelen) == 0) + return bh; + + *prev_block = block; + block = be64_to_cpu(oi->i_sibling); + brelse(bh); + } +err: + return ERR_PTR(err); +} + +static struct buffer_head *omfs_find_entry(struct inode *dir, + const char *name, int namelen) +{ + struct buffer_head *bh; + int ofs; + u64 block, dummy; + + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + return ERR_PTR(-EIO); + + block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs])); + brelse(bh); + + return omfs_scan_list(dir, block, name, namelen, &dummy); +} + +int omfs_make_empty(struct inode *inode, struct super_block *sb) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int block = clus_to_blk(sbi, inode->i_ino); + struct buffer_head *bh; + struct omfs_inode *oi; + + bh = sb_bread(sb, block); + if (!bh) + return -ENOMEM; + + memset(bh->b_data, 0, sizeof(struct omfs_inode)); + + if (inode->i_mode & S_IFDIR) { + memset(&bh->b_data[OMFS_DIR_START], 0xff, + sbi->s_sys_blocksize - OMFS_DIR_START); + } else + omfs_make_empty_table(bh, OMFS_EXTENT_START); + + oi = (struct omfs_inode *) bh->b_data; + oi->i_head.h_self = cpu_to_be64(inode->i_ino); + oi->i_sibling = ~0ULL; + + mark_buffer_dirty(bh); + brelse(bh); + return 0; +} + +static int omfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct 
omfs_inode *oi; + struct buffer_head *bh; + u64 block; + __be64 *entry; + int ofs; + + /* just prepend to head of queue in proper bucket */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + *entry = cpu_to_be64(inode->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + /* now set the sibling and parent pointers on the new inode */ + bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino)); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + memcpy(oi->i_name, name, namelen); + memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen); + oi->i_sibling = cpu_to_be64(block); + oi->i_parent = cpu_to_be64(dir->i_ino); + mark_buffer_dirty(bh); + brelse(bh); + + dir->i_ctime = CURRENT_TIME_SEC; + + /* mark affected inodes dirty to rebuild checksums */ + mark_inode_dirty(dir); + mark_inode_dirty(inode); + return 0; +out: + return -ENOMEM; +} + +static int omfs_delete_entry(struct dentry *dentry) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct inode *dirty; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct omfs_inode *oi; + struct buffer_head *bh, *bh2; + __be64 *entry, next; + u64 block, prev; + int ofs; + int err = -ENOMEM; + + /* delete the proper node in the bucket's linked list */ + bh = omfs_get_bucket(dir, name, namelen, &ofs); + if (!bh) + goto out; + + entry = (__be64 *) &bh->b_data[ofs]; + block = be64_to_cpu(*entry); + + bh2 = omfs_scan_list(dir, block, name, namelen, &prev); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); + goto out_free_bh; + } + + oi = (struct omfs_inode *) bh2->b_data; + next = oi->i_sibling; + brelse(bh2); + + if (prev != ~0) { + /* found in middle of list, get list ptr */ + brelse(bh); + bh = sb_bread(dir->i_sb, + clus_to_blk(OMFS_SB(dir->i_sb), prev)); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + entry = &oi->i_sibling; + } + + *entry = next; + mark_buffer_dirty(bh); + + if (prev != ~0) { + dirty = omfs_iget(dir->i_sb, prev); + if (!IS_ERR(dirty)) { + mark_inode_dirty(dirty); + iput(dirty); + } + } + + err = 0; +out_free_bh: + brelse(bh); +out: + return err; +} + +static int omfs_dir_is_empty(struct inode *inode) +{ + int nbuckets = (inode->i_size - OMFS_DIR_START) / 8; + struct buffer_head *bh; + u64 *ptr; + int i; + + bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb), + inode->i_ino)); + + if (!bh) + return 0; + + ptr = (u64 *) &bh->b_data[OMFS_DIR_START]; + + for (i = 0; i < nbuckets; i++, ptr++) + if (*ptr != ~0) + break; + + brelse(bh); + return *ptr != ~0; +} + +static int omfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int ret; + struct inode *inode = dentry->d_inode; + + ret = omfs_delete_entry(dentry); + if (ret) + goto end_unlink; + + inode_dec_link_count(inode); + mark_inode_dirty(dir); + +end_unlink: + return ret; +} + +static int omfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = -ENOTEMPTY; + struct inode *inode = dentry->d_inode; + + if (omfs_dir_is_empty(inode)) { + err = omfs_unlink(dir, dentry); + if (!err) + inode_dec_link_count(inode); + } + return err; +} + +static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) +{ + int err; + struct inode *inode = omfs_new_inode(dir, mode); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + err = omfs_make_empty(inode, dir->i_sb); + if (err) + goto out_free_inode; + + err = omfs_add_link(dentry, inode); + if (err) + goto out_free_inode; + + 
d_instantiate(dentry, inode); + return 0; + +out_free_inode: + iput(inode); + return err; +} + +static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return omfs_add_node(dir, dentry, mode | S_IFDIR); +} + +static int omfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + return omfs_add_node(dir, dentry, mode | S_IFREG); +} + +static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct buffer_head *bh; + struct inode *inode = NULL; + + if (dentry->d_name.len > OMFS_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); + if (!IS_ERR(bh)) { + struct omfs_inode *oi = (struct omfs_inode *)bh->b_data; + ino_t ino = be64_to_cpu(oi->i_head.h_self); + brelse(bh); + inode = omfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + d_add(dentry, inode); + return NULL; +} + +/* sanity check block's self pointer */ +int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, + u64 fsblock) +{ + int is_bad; + u64 ino = be64_to_cpu(header->h_self); + is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) || + (ino > sbi->s_num_blocks)); + + if (is_bad) + printk(KERN_WARNING "omfs: bad hash chain detected\n"); + + return is_bad; +} + +static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, + u64 fsblock, int hindex) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + struct omfs_inode *oi; + u64 self; + int res = 0; + unsigned char d_type; + + /* follow chain in this bucket */ + while (fsblock != ~0) { + bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), + fsblock)); + if (!bh) + goto out; + + oi = (struct omfs_inode *) bh->b_data; + if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { + brelse(bh); + goto out; + } + + self = fsblock; + fsblock = be64_to_cpu(oi->i_sibling); + + /* skip visited nodes */ + if (hindex) { + hindex--; + brelse(bh); + continue; + } + + d_type = (oi->i_type == OMFS_DIR) ? 
DT_DIR : DT_REG; + + res = filldir(dirent, oi->i_name, strnlen(oi->i_name, + OMFS_NAMELEN), filp->f_pos, self, d_type); + if (res == 0) + filp->f_pos++; + brelse(bh); + } +out: + return res; +} + +static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct buffer_head *bh; + int is_dir; + int err; + + is_dir = S_ISDIR(old_inode->i_mode); + + if (new_inode) { + /* overwriting existing file/dir */ + err = -ENOTEMPTY; + if (is_dir && !omfs_dir_is_empty(new_inode)) + goto out; + + err = -ENOENT; + bh = omfs_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len); + if (IS_ERR(bh)) + goto out; + brelse(bh); + + err = omfs_unlink(new_dir, new_dentry); + if (err) + goto out; + } + + /* since omfs locates files by name, we need to unlink _before_ + * adding the new link or we won't find the old one */ + inode_inc_link_count(old_inode); + err = omfs_unlink(old_dir, old_dentry); + if (err) { + inode_dec_link_count(old_inode); + goto out; + } + + err = omfs_add_link(new_dentry, old_inode); + if (err) + goto out; + + old_inode->i_ctime = CURRENT_TIME_SEC; +out: + return err; +} + +static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *dir = filp->f_dentry->d_inode; + struct buffer_head *bh; + loff_t offset, res; + unsigned int hchain, hindex; + int nbuckets; + u64 fsblock; + int ret = -EINVAL; + + if (filp->f_pos >> 32) + goto success; + + switch ((unsigned long) filp->f_pos) { + case 0: + if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) + goto success; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, 1, + parent_ino(filp->f_dentry), DT_DIR) < 0) + goto success; + filp->f_pos = 1 << 20; + /* fall through */ + } + + nbuckets = (dir->i_size - OMFS_DIR_START) / 8; + + /* high 12 bits store bucket + 1 and low 20 bits store hash index */ + hchain = (filp->f_pos >> 20) - 1; + hindex = filp->f_pos & 0xfffff; + + bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino)); + if (!bh) + goto out; + + offset = OMFS_DIR_START + hchain * 8; + + for (; hchain < nbuckets; hchain++, offset += 8) { + fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); + + res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); + hindex = 0; + if (res < 0) + break; + + filp->f_pos = (hchain+2) << 20; + } + brelse(bh); +success: + ret = 0; +out: + return ret; +} + +struct inode_operations omfs_dir_inops = { + .lookup = omfs_lookup, + .mkdir = omfs_mkdir, + .rename = omfs_rename, + .create = omfs_create, + .unlink = omfs_unlink, + .rmdir = omfs_rmdir, +}; + +struct file_operations omfs_dir_operations = { + .read = generic_read_dir, + .readdir = omfs_readdir, +}; -- cgit v1.2.3-70-g09d2 From 8f09e98768c17287df076580c4cc72ac358312c6 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Fri, 25 Jul 2008 19:45:16 -0700 Subject: omfs: add file routines Add functions for reading and manipulating the storage of file data in the extent-based OMFS. 
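Each extent entry records a starting cluster and a run length, so mapping a logical file block means walking the entries in order and accumulating run lengths until the target falls inside one. A simplified sketch of that walk, with hypothetical CPU-order types (the real entries below are big-endian and end with a terminator entry):

/* Simplified extent lookup: map a logical block to a physical block. */
#include <linux/types.h>

struct example_extent_entry {
        u64 e_cluster;  /* first physical block of this run */
        u64 e_blocks;   /* number of blocks in this run */
};

static u64 example_find_block(const struct example_extent_entry *ent,
                              int count, u64 logical)
{
        u64 seen = 0;
        int i;

        for (i = 0; i < count; i++, ent++) {
                if (logical < seen + ent->e_blocks)
                        return ent->e_cluster + (logical - seen);
                seen += ent->e_blocks;
        }
        return 0;       /* not mapped in this table */
}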
Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/file.c | 346 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 346 insertions(+) create mode 100644 fs/omfs/file.c (limited to 'fs') diff --git a/fs/omfs/file.c b/fs/omfs/file.c new file mode 100644 index 00000000000..66e01fae438 --- /dev/null +++ b/fs/omfs/file.c @@ -0,0 +1,346 @@ +/* + * OMFS (as used by RIO Karma) file operations. + * Copyright (C) 2005 Bob Copeland + * Released under GPL v2. + */ + +#include +#include +#include +#include +#include +#include "omfs.h" + +static int omfs_sync_file(struct file *file, struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return err; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return err; + err |= omfs_sync_inode(inode); + return err ? -EIO : 0; +} + +void omfs_make_empty_table(struct buffer_head *bh, int offset) +{ + struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; + + oe->e_next = ~0ULL; + oe->e_extent_count = cpu_to_be32(1), + oe->e_fill = cpu_to_be32(0x22), + oe->e_entry.e_cluster = ~0ULL; + oe->e_entry.e_blocks = ~0ULL; +} + +int omfs_shrink_inode(struct inode *inode) +{ + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct buffer_head *bh; + u64 next, last; + u32 extent_count; + int ret; + + /* traverse extent table, freeing each entry that is greater + * than inode->i_size; + */ + next = inode->i_ino; + + /* only support truncate -> 0 for now */ + ret = -EIO; + if (inode->i_size != 0) + goto out; + + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) { + brelse(bh); + goto out; + } + + extent_count = be32_to_cpu(oe->e_extent_count); + last = next; + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + /* ignore last entry as it is the terminator */ + for (; extent_count > 1; extent_count--) { + u64 start, count; + start = be64_to_cpu(entry->e_cluster); + count = be64_to_cpu(entry->e_blocks); + + omfs_clear_range(inode->i_sb, start, (int) count); + entry++; + } + omfs_make_empty_table(bh, (char *) oe - bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + + if (last != inode->i_ino) + omfs_clear_range(inode->i_sb, last, sbi->s_mirrors); + + if (next == ~0) + break; + + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + } + ret = 0; +out: + return ret; +} + +static void omfs_truncate(struct inode *inode) +{ + omfs_shrink_inode(inode); + mark_inode_dirty(inode); +} + +/* + * Add new blocks to the current extent, or create new entries/continuations + * as necessary. + */ +static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, + u64 *ret_block) +{ + struct omfs_extent_entry *terminator; + struct omfs_extent_entry *entry = &oe->e_entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + u32 extent_count = be32_to_cpu(oe->e_extent_count); + u64 new_block = 0; + u32 max_count; + int new_count; + int ret = 0; + + /* reached the end of the extent table with no blocks mapped. 
+ * there are three possibilities for adding: grow last extent, + * add a new extent to the current extent table, and add a + * continuation inode. in last two cases need an allocator for + * sbi->s_cluster_size + */ + + /* TODO: handle holes */ + + /* should always have a terminator */ + if (extent_count < 1) + return -EIO; + + /* trivially grow current extent, if next block is not taken */ + terminator = entry + extent_count - 1; + if (extent_count > 1) { + entry = terminator-1; + new_block = be64_to_cpu(entry->e_cluster) + + be64_to_cpu(entry->e_blocks); + + if (omfs_allocate_block(inode->i_sb, new_block)) { + entry->e_blocks = + cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1); + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + 1)); + goto out; + } + } + max_count = (sbi->s_sys_blocksize - OMFS_EXTENT_START - + sizeof(struct omfs_extent)) / + sizeof(struct omfs_extent_entry) + 1; + + /* TODO: add a continuation block here */ + if (be32_to_cpu(oe->e_extent_count) > max_count-1) + return -EIO; + + /* try to allocate a new cluster */ + ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize, + &new_block, &new_count); + if (ret) + goto out_fail; + + /* copy terminator down an entry */ + entry = terminator; + terminator++; + memcpy(terminator, entry, sizeof(struct omfs_extent_entry)); + + entry->e_cluster = cpu_to_be64(new_block); + entry->e_blocks = cpu_to_be64((u64) new_count); + + terminator->e_blocks = ~(cpu_to_be64( + be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); + + /* write in new entry */ + oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count)); + +out: + *ret_block = new_block; +out_fail: + return ret; +} + +/* + * Scans across the directory table for a given file block number. + * If block not found, return 0. 
+ */ +static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent, + sector_t block, int count, int *left) +{ + /* count > 1 because of terminator */ + sector_t searched = 0; + for (; count > 1; count--) { + int numblocks = clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_blocks)); + + if (block >= searched && + block < searched + numblocks) { + /* + * found it at cluster + (block - searched) + * numblocks - (block - searched) is remainder + */ + *left = numblocks - (block - searched); + return clus_to_blk(OMFS_SB(inode->i_sb), + be64_to_cpu(ent->e_cluster)) + + block - searched; + } + searched += numblocks; + ent++; + } + return 0; +} + +static int omfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) +{ + struct buffer_head *bh; + sector_t next, offset; + int ret; + u64 new_block; + int extent_count; + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + int max_blocks = bh_result->b_size >> inode->i_blkbits; + int remain; + + ret = -EIO; + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino)); + if (!bh) + goto out; + + oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + next = inode->i_ino; + + for (;;) { + + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; + + extent_count = be32_to_cpu(oe->e_extent_count); + next = be64_to_cpu(oe->e_next); + entry = &oe->e_entry; + + offset = find_block(inode, entry, block, extent_count, &remain); + if (offset > 0) { + ret = 0; + map_bh(bh_result, inode->i_sb, offset); + if (remain > max_blocks) + remain = max_blocks; + bh_result->b_size = (remain << inode->i_blkbits); + goto out_brelse; + } + if (next == ~0) + break; + + brelse(bh); + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); + if (!bh) + goto out; + oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + } + if (create) { + ret = omfs_grow_extent(inode, oe, &new_block); + if (ret == 0) { + mark_buffer_dirty(bh); + mark_inode_dirty(inode); + map_bh(bh_result, inode->i_sb, + clus_to_blk(sbi, new_block)); + } + } +out_brelse: + brelse(bh); +out: + return ret; +} + +static int omfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, omfs_get_block); +} + +static int omfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); +} + +static int omfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, omfs_get_block, wbc); +} + +static int +omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, omfs_get_block); +} + +static int omfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, + pagep, fsdata, omfs_get_block); +} + +static sector_t omfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, omfs_get_block); +} + +struct file_operations omfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .mmap = generic_file_mmap, + .fsync = omfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +struct 
inode_operations omfs_file_inops = { + .truncate = omfs_truncate +}; + +struct address_space_operations omfs_aops = { + .readpage = omfs_readpage, + .readpages = omfs_readpages, + .writepage = omfs_writepage, + .writepages = omfs_writepages, + .sync_page = block_sync_page, + .write_begin = omfs_write_begin, + .write_end = generic_write_end, + .bmap = omfs_bmap, +}; + -- cgit v1.2.3-70-g09d2 From 36cc410a6799a205bfc6ccc38abd9d52f2afba64 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Fri, 25 Jul 2008 19:45:17 -0700 Subject: omfs: add bitmap routines Add block allocation and block bitmap management routines for OMFS. Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/bitmap.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 fs/omfs/bitmap.c (limited to 'fs') diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c new file mode 100644 index 00000000000..dc75f22be3f --- /dev/null +++ b/fs/omfs/bitmap.c @@ -0,0 +1,192 @@ +#include +#include +#include +#include +#include "omfs.h" + +unsigned long omfs_count_free(struct super_block *sb) +{ + unsigned int i; + unsigned long sum = 0; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int nbits = sb->s_blocksize * 8; + + for (i = 0; i < sbi->s_imap_size; i++) + sum += nbits - bitmap_weight(sbi->s_imap[i], nbits); + + return sum; +} + +/* + * Counts the run of zero bits starting at bit up to max. + * It handles the case where a run might spill over a buffer. + * Called with bitmap lock. + */ +static int count_run(unsigned long **addr, int nbits, + int addrlen, int bit, int max) +{ + int count = 0; + int x; + + for (; addrlen > 0; addrlen--, addr++) { + x = find_next_bit(*addr, nbits, bit); + count += x - bit; + + if (x < nbits || count > max) + return min(count, max); + + bit = 0; + } + return min(count, max); +} + +/* + * Sets or clears the run of count bits starting with bit. + * Called with bitmap lock. + */ +static int set_run(struct super_block *sb, int map, + int nbits, int bit, int count, int set) +{ + int i; + int err; + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + + err = -ENOMEM; + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + for (i = 0; i < count; i++, bit++) { + if (bit >= nbits) { + bit = 0; + map++; + + mark_buffer_dirty(bh); + brelse(bh); + bh = sb_bread(sb, + clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + } + if (set) { + set_bit(bit, sbi->s_imap[map]); + set_bit(bit, (long *) bh->b_data); + } else { + clear_bit(bit, sbi->s_imap[map]); + clear_bit(bit, (long *) bh->b_data); + } + } + mark_buffer_dirty(bh); + brelse(bh); + err = 0; +out: + return err; +} + +/* + * Tries to allocate exactly one block. Returns true if sucessful. + */ +int omfs_allocate_block(struct super_block *sb, u64 block) +{ + struct buffer_head *bh; + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + int map, bit; + int ret = 0; + u64 tmp; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + mutex_lock(&sbi->s_bitmap_lock); + if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map])) + goto out; + + if (sbi->s_bitmap_ino > 0) { + bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); + if (!bh) + goto out; + + set_bit(bit, (long *) bh->b_data); + mark_buffer_dirty(bh); + brelse(bh); + } + ret = 1; +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + + +/* + * Tries to allocate a set of blocks. 
The request size depends on the + * type: for inodes, we must allocate sbi->s_mirrors blocks, and for file + * blocks, we try to allocate sbi->s_clustersize, but can always get away + * with just one block. + */ +int omfs_allocate_range(struct super_block *sb, + int min_request, + int max_request, + u64 *return_block, + int *return_size) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + int ret = 0; + int i, run, bit; + + mutex_lock(&sbi->s_bitmap_lock); + for (i = 0; i < sbi->s_imap_size; i++) { + bit = 0; + while (bit < bits_per_entry) { + bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry, + bit); + + if (bit == bits_per_entry) + break; + + run = count_run(&sbi->s_imap[i], bits_per_entry, + sbi->s_imap_size-i, bit, max_request); + + if (run >= min_request) + goto found; + bit += run; + } + } + ret = -ENOSPC; + goto out; + +found: + *return_block = i * bits_per_entry + bit; + *return_size = run; + ret = set_run(sb, i, bits_per_entry, bit, run, 1); + +out: + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} + +/* + * Clears count bits starting at a given block. + */ +int omfs_clear_range(struct super_block *sb, u64 block, int count) +{ + struct omfs_sb_info *sbi = OMFS_SB(sb); + int bits_per_entry = 8 * sb->s_blocksize; + u64 tmp; + int map, bit, ret; + + tmp = block; + bit = do_div(tmp, bits_per_entry); + map = tmp; + + if (map >= sbi->s_imap_size) + return 0; + + mutex_lock(&sbi->s_bitmap_lock); + ret = set_run(sb, map, bits_per_entry, bit, count, 0); + mutex_unlock(&sbi->s_bitmap_lock); + return ret; +} -- cgit v1.2.3-70-g09d2 From 63ca8ce2a2641f9cb5f0add33ced4591681d1cd7 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Fri, 25 Jul 2008 19:45:17 -0700 Subject: omfs: update kbuild to include OMFS Adds OMFS to the fs Kconfig and Makefile Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 13 +++++++++++++ fs/Makefile | 1 + fs/omfs/Makefile | 4 ++++ 3 files changed, 18 insertions(+) create mode 100644 fs/omfs/Makefile (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 97e3bdedb1e..d3873583360 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1383,6 +1383,19 @@ config MINIX_FS partition (the one containing the directory /) cannot be compiled as a module. +config OMFS_FS + tristate "SonicBlue Optimized MPEG File System support" + depends on BLOCK + select CRC_ITU_T + help + This is the proprietary file system used by the Rio Karma music + player and ReplayTV DVR. Despite the name, this filesystem is not + more efficient than a standard FS for MPEG files, in fact likely + the opposite is true. Say Y if you have either of these devices + and wish to mount its disk. + + To compile this file system support as a module, choose M here: the + module will be called omfs. If unsure, say N. 
config HPFS_FS tristate "OS/2 HPFS file system support" diff --git a/fs/Makefile b/fs/Makefile index 3b2178b4bb6..a1482a5eff1 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ +obj-$(CONFIG_OMFS_FS) += omfs/ obj-$(CONFIG_JFS_FS) += jfs/ obj-$(CONFIG_XFS_FS) += xfs/ obj-$(CONFIG_9P_FS) += 9p/ diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile new file mode 100644 index 00000000000..8b82b63f112 --- /dev/null +++ b/fs/omfs/Makefile @@ -0,0 +1,4 @@ + +obj-$(CONFIG_OMFS_FS) += omfs.o + +omfs-y := bitmap.o dir.o file.o inode.o -- cgit v1.2.3-70-g09d2 From f5dd33c494a427b1d1a3b574de5c9e511c888864 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:25 -0700 Subject: dio: use get_user_pages_fast Use get_user_pages_fast in the common/generic block and fs direct IO paths. Signed-off-by: Nick Piggin Cc: Dave Kleikamp Cc: Andy Whitcroft Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Cc: Dave Kleikamp Cc: Badari Pulavarty Cc: Zach Brown Cc: Jens Axboe Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bio.c | 8 ++------ fs/direct-io.c | 10 ++-------- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 88322b066ac..25f1af0d81e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -721,12 +721,8 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, uaddr, - local_nr_pages, - write_to_vm, 0, &pages[cur_page], NULL); - up_read(¤t->mm->mmap_sem); - + ret = get_user_pages_fast(uaddr, local_nr_pages, + write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { ret = -EFAULT; goto out_unmap; diff --git a/fs/direct-io.c b/fs/direct-io.c index 9e81addbd6e..9606ee848fd 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *dio) int nr_pages; nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); - down_read(¤t->mm->mmap_sem); - ret = get_user_pages( - current, /* Task for fault acounting */ - current->mm, /* whose pages? */ + ret = get_user_pages_fast( dio->curr_user_address, /* Where from? */ nr_pages, /* How many pages? */ dio->rw == READ, /* Write to memory? */ - 0, /* force (?) */ - &dio->pages[0], - NULL); /* vmas */ - up_read(¤t->mm->mmap_sem); + &dio->pages[0]); /* Put results here */ if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); -- cgit v1.2.3-70-g09d2 From bc40d73c950146725e9e768e856a416ec8949065 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:26 -0700 Subject: splice: use get_user_pages_fast Use get_user_pages_fast in splice. This reverts some mmap_sem batching there, however the biggest problem with mmap_sem tends to be hold times blocking out other threads rather than cacheline bouncing. Further: on architectures that implement get_user_pages_fast without locks, mmap_sem can be avoided completely anyway. 
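For reference, this and the preceding dio patch apply the same mechanical change at every user-page pinning site. A minimal sketch of the before/after call pattern (the helper name is hypothetical and not part of either diff):

#include <linux/mm.h>

static int pin_user_pages_example(unsigned long uaddr, int nr_pages,
                                  int write, struct page **pages)
{
        /*
         * Old call site: the caller took mmap_sem for read and passed
         * the task/mm plus the unused force and vmas arguments:
         *
         *      down_read(&current->mm->mmap_sem);
         *      ret = get_user_pages(current, current->mm, uaddr,
         *                           nr_pages, write, 0, pages, NULL);
         *      up_read(&current->mm->mmap_sem);
         *
         * New call site: one call, no mmap_sem bookkeeping here.
         */
        return get_user_pages_fast(uaddr, nr_pages, write, pages);
}
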
Signed-off-by: Nick Piggin Cc: Dave Kleikamp Cc: Andy Whitcroft Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Cc: Dave Kleikamp Cc: Badari Pulavarty Cc: Zach Brown Cc: Jens Axboe Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/splice.c | 41 +++-------------------------------------- 1 file changed, 3 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 399442179d8..47dc1a445d1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1160,36 +1160,6 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } -/* - * Do a copy-from-user while holding the mmap_semaphore for reading, in a - * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem - * for writing) and page faulting on the user memory pointed to by src. - * This assumes that we will very rarely hit the partial != 0 path, or this - * will not be a win. - */ -static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n) -{ - int partial; - - if (!access_ok(VERIFY_READ, src, n)) - return -EFAULT; - - pagefault_disable(); - partial = __copy_from_user_inatomic(dst, src, n); - pagefault_enable(); - - /* - * Didn't copy everything, drop the mmap_sem and do a faulting copy - */ - if (unlikely(partial)) { - up_read(¤t->mm->mmap_sem); - partial = copy_from_user(dst, src, n); - down_read(¤t->mm->mmap_sem); - } - - return partial; -} - /* * Map an iov into an array of pages and offset/length tupples. With the * partial_page structure, we can map several non-contiguous ranges into @@ -1203,8 +1173,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, { int buffers = 0, error = 0; - down_read(¤t->mm->mmap_sem); - while (nr_vecs) { unsigned long off, npages; struct iovec entry; @@ -1213,7 +1181,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, int i; error = -EFAULT; - if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry))) + if (copy_from_user(&entry, iov, sizeof(entry))) break; base = entry.iov_base; @@ -1247,9 +1215,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, if (npages > PIPE_BUFFERS - buffers) npages = PIPE_BUFFERS - buffers; - error = get_user_pages(current, current->mm, - (unsigned long) base, npages, 0, 0, - &pages[buffers], NULL); + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); if (unlikely(error <= 0)) break; @@ -1288,8 +1255,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, iov++; } - up_read(¤t->mm->mmap_sem); - if (buffers) return buffers; -- cgit v1.2.3-70-g09d2 From 19fd6231279be3c3bdd02ed99f9b0eb195978064 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:32 -0700 Subject: mm: spinlock tree_lock mapping->tree_lock has no read lockers. convert the lock from an rwlock to a spinlock. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. 
McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- fs/inode.c | 2 +- include/asm-arm/cacheflush.h | 4 ++-- include/asm-parisc/cacheflush.h | 4 ++-- include/linux/fs.h | 2 +- mm/filemap.c | 10 +++++----- mm/migrate.c | 11 +++++------ mm/page-writeback.c | 12 ++++++------ mm/swap_state.c | 10 +++++----- mm/swapfile.c | 4 ++-- mm/truncate.c | 6 +++--- mm/vmscan.c | 8 ++++---- 12 files changed, 38 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index d48caee12e2..109b261192d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -706,7 +706,7 @@ static int __set_page_dirty(struct page *page, if (TestSetPageDirty(page)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); @@ -719,7 +719,7 @@ static int __set_page_dirty(struct page *page, radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; diff --git a/fs/inode.c b/fs/inode.c index c36d9480335..35b6414522e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -209,7 +209,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - rwlock_init(&inode->i_data.tree_lock); + spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); diff --git a/include/asm-arm/cacheflush.h b/include/asm-arm/cacheflush.h index 70b0fe724b6..03cf1ee977b 100644 --- a/include/asm-arm/cacheflush.h +++ b/include/asm-arm/cacheflush.h @@ -424,9 +424,9 @@ static inline void flush_anon_page(struct vm_area_struct *vma, } #define flush_dcache_mmap_lock(mapping) \ - write_lock_irq(&(mapping)->tree_lock) + spin_lock_irq(&(mapping)->tree_lock) #define flush_dcache_mmap_unlock(mapping) \ - write_unlock_irq(&(mapping)->tree_lock) + spin_unlock_irq(&(mapping)->tree_lock) #define flush_icache_user_range(vma,page,addr,len) \ flush_dcache_page(page) diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h index 2f1e1b05440..b7ca6dc7fdd 100644 --- a/include/asm-parisc/cacheflush.h +++ b/include/asm-parisc/cacheflush.h @@ -45,9 +45,9 @@ void flush_cache_mm(struct mm_struct *mm); extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) \ - write_lock_irq(&(mapping)->tree_lock) + spin_lock_irq(&(mapping)->tree_lock) #define flush_dcache_mmap_unlock(mapping) \ - write_unlock_irq(&(mapping)->tree_lock) + spin_unlock_irq(&(mapping)->tree_lock) #define flush_icache_page(vma,page) do { \ flush_kernel_dcache_page(page); \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 49d8eb7a71b..53d2edb709b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -499,7 +499,7 @@ struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - rwlock_t tree_lock; /* and rwlock protecting it */ + spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings 
*/ diff --git a/mm/filemap.c b/mm/filemap.c index feb8448d861..2ed8b0389c5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -109,7 +109,7 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { @@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page) BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } static int sync_page(void *word) @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->mapping = mapping; page->index = offset; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (likely(!error)) { mapping->nrpages++; @@ -480,7 +480,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page_cache_release(page); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index 3ca6392e82c..153572fb60b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -323,7 +323,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, return 0; } - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); @@ -331,12 +331,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + !!PagePrivate(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } if (!page_freeze_refs(page, expected_count)) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -373,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping, __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - write_unlock_irq(&mapping->tree_lock); - if (!PageSwapCache(newpage)) { + spin_unlock_irq(&mapping->tree_lock); + if (!PageSwapCache(newpage)) mem_cgroup_uncharge_cache_page(page); - } return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94c6d8988ab..24de8b65fdb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (!mapping) return 1; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
*/ BUG_ON(mapping2 != mapping); @@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) { radix_tree_tag_clear(&mapping->page_tree, @@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page) __bdi_writeout_inc(bdi); } } - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); } @@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) { radix_tree_tag_set(&mapping->page_tree, @@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } diff --git a/mm/swap_state.c b/mm/swap_state.c index 3e3381d6c7e..2c217e33d49 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), + .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, @@ -80,7 +80,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) SetPageSwapCache(page); set_page_private(page, entry.val); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (likely(!error)) { @@ -88,7 +88,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); if (unlikely(error)) { @@ -182,9 +182,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); swap_free(entry); page_cache_release(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2f33edb8bee..af283933c14 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page) retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. 
*/ - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); } spin_unlock(&swap_lock); diff --git a/mm/truncate.c b/mm/truncate.c index b8961cb6341..e68443d7456 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -349,18 +349,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; failed: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 0075eac1cd0..8f71761bc4b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -399,7 +399,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); /* * The non racy check for a busy page. * @@ -436,17 +436,17 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); swap_free(swap); } else { __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } -- cgit v1.2.3-70-g09d2 From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 19:45:34 -0700 Subject: SL*B: drop kmem cache argument from constructor Kmem cache passed to constructor is only needed for constructors that are themselves multiplexeres. Nobody uses this "feature", nor does anybody uses passed kmem cache in non-trivial way, so pass only pointer to object. Non-trivial places are: arch/powerpc/mm/init_64.c arch/powerpc/mm/hugetlbpage.c This is flag day, yes. 
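As an illustration of the flag-day interface change, a minimal sketch of a converted cache user (the "foo" cache is hypothetical and not taken from any file in this diff):

#include <linux/slab.h>
#include <linux/string.h>

struct foo { int bar; };
static struct kmem_cache *foo_cachep;

/*
 * Before this patch the constructor was declared as
 *      void foo_ctor(struct kmem_cache *cachep, void *obj);
 * The cache argument is gone because no caller used it in a
 * non-trivial way.
 */
static void foo_ctor(void *obj)
{
        memset(obj, 0, sizeof(struct foo));
}

static void foo_cache_init(void)
{
        /* kmem_cache_create() now takes a void (*)(void *) constructor */
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, SLAB_PANIC, foo_ctor);
}
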
Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Jon Tollefson Cc: Nick Piggin Cc: Matt Mackall [akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c] [akpm@linux-foundation.org: fix mm/slab.c] [akpm@linux-foundation.org: fix ubifs] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/plat-s3c24xx/dma.c | 2 +- arch/powerpc/kernel/rtas_flash.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 9 ++------- arch/powerpc/mm/init_64.c | 24 +++++++++--------------- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/sh/mm/pmb.c | 2 +- arch/xtensa/mm/init.c | 2 +- drivers/usb/mon/mon_text.c | 4 ++-- fs/adfs/super.c | 2 +- fs/affs/super.c | 2 +- fs/afs/super.c | 4 ++-- fs/befs/linuxvfs.c | 2 +- fs/bfs/inode.c | 2 +- fs/block_dev.c | 2 +- fs/buffer.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/coda/inode.c | 2 +- fs/ecryptfs/main.c | 4 ++-- fs/efs/super.c | 2 +- fs/ext2/super.c | 2 +- fs/ext3/super.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/cache.c | 2 +- fs/fat/inode.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/main.c | 4 ++-- fs/hfs/super.c | 2 +- fs/hfsplus/super.c | 2 +- fs/hpfs/super.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/inode.c | 2 +- fs/isofs/inode.c | 2 +- fs/jffs2/super.c | 2 +- fs/jfs/jfs_metapage.c | 2 +- fs/jfs/super.c | 2 +- fs/locks.c | 2 +- fs/minix/inode.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/ntfs/super.c | 2 +- fs/ocfs2/dlm/dlmfs.c | 3 +-- fs/ocfs2/super.c | 2 +- fs/openpromfs/inode.c | 2 +- fs/proc/inode.c | 2 +- fs/qnx4/inode.c | 2 +- fs/reiserfs/super.c | 2 +- fs/romfs/inode.c | 2 +- fs/smbfs/inode.c | 2 +- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/udf/super.c | 2 +- fs/ufs/super.c | 2 +- fs/xfs/linux-2.6/kmem.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 1 - include/linux/slab.h | 2 +- include/linux/slub_def.h | 2 +- ipc/mqueue.c | 2 +- kernel/fork.c | 2 +- lib/idr.c | 2 +- lib/radix-tree.c | 2 +- mm/rmap.c | 2 +- mm/shmem.c | 2 +- mm/slab.c | 11 +++++------ mm/slob.c | 7 +++---- mm/slub.c | 13 ++++++------- net/socket.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 67 files changed, 90 insertions(+), 106 deletions(-) (limited to 'fs') diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c index 60f162dc4fa..8c5e656d5d8 100644 --- a/arch/arm/plat-s3c24xx/dma.c +++ b/arch/arm/plat-s3c24xx/dma.c @@ -1304,7 +1304,7 @@ struct sysdev_class dma_sysclass = { /* kmem cache implementation */ -static void s3c2410_dma_cache_ctor(struct kmem_cache *c, void *p) +static void s3c2410_dma_cache_ctor(void *p) { memset(p, 0, sizeof(struct s3c2410_dma_buf)); } diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 09ded5c424a..149cb112cd1 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -286,7 +286,7 @@ static ssize_t rtas_flash_read(struct file *file, char __user *buf, } /* constructor for flash_block_cache */ -void rtas_block_ctor(struct kmem_cache *cache, void *ptr) +void rtas_block_ctor(void *ptr) { memset(ptr, 0, RTAS_BLK_SIZE); } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index fb42c4dd321..ed0aab0208a 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -113,7 +113,7 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, unsigned long address, unsigned int psize) { - pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize), + pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize), GFP_KERNEL|__GFP_REPEAT); if (! 
new) @@ -730,11 +730,6 @@ static int __init hugepage_setup_sz(char *str) } __setup("hugepagesz=", hugepage_setup_sz); -static void zero_ctor(struct kmem_cache *cache, void *addr) -{ - memset(addr, 0, kmem_cache_size(cache)); -} - static int __init hugetlbpage_init(void) { unsigned int psize; @@ -756,7 +751,7 @@ static int __init hugetlbpage_init(void) HUGEPTE_TABLE_SIZE(psize), HUGEPTE_TABLE_SIZE(psize), 0, - zero_ctor); + NULL); if (!huge_pgtable_cache(psize)) panic("hugetlbpage_init(): could not create %s"\ "\n", HUGEPTE_CACHE_NAME(psize)); diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a41bc5aa204..4f7df85129d 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -136,9 +136,14 @@ static int __init setup_kcore(void) module_init(setup_kcore); #endif -static void zero_ctor(struct kmem_cache *cache, void *addr) +static void pgd_ctor(void *addr) { - memset(addr, 0, kmem_cache_size(cache)); + memset(addr, 0, PGD_TABLE_SIZE); +} + +static void pmd_ctor(void *addr) +{ + memset(addr, 0, PMD_TABLE_SIZE); } static const unsigned int pgtable_cache_size[2] = { @@ -163,19 +168,8 @@ struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; void pgtable_cache_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { - int size = pgtable_cache_size[i]; - const char *name = pgtable_cache_name[i]; - - pr_debug("Allocating page table cache %s (#%d) " - "for size: %08x...\n", name, i, size); - pgtable_cache[i] = kmem_cache_create(name, - size, size, - SLAB_PANIC, - zero_ctor); - } + pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); + pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); } #ifdef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 7123472801d..690ca7b0dcf 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -78,7 +78,7 @@ spufs_destroy_inode(struct inode *inode) } static void -spufs_init_once(struct kmem_cache *cachep, void *p) +spufs_init_once(void *p) { struct spufs_inode_info *ei = p; diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 0b0ec6e0475..46911bcbf17 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -293,7 +293,7 @@ void pmb_unmap(unsigned long addr) } while (pmbe); } -static void pmb_cache_ctor(struct kmem_cache *cachep, void *pmb) +static void pmb_cache_ctor(void *pmb) { struct pmb_entry *pmbe = pmb; diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 81d0560eaea..ee261005b36 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -309,7 +309,7 @@ void show_mem(void) struct kmem_cache *pgtable_cache __read_mostly; -static void pgd_ctor(struct kmem_cache *cache, void* addr) +static void pgd_ctor(void* addr) { pte_t* ptep = (pte_t*)addr; int i; diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c index 5e3e4e9b6c7..1f715436d6d 100644 --- a/drivers/usb/mon/mon_text.c +++ b/drivers/usb/mon/mon_text.c @@ -87,7 +87,7 @@ struct mon_reader_text { static struct dentry *mon_dir; /* Usually /sys/kernel/debug/usbmon */ -static void mon_text_ctor(struct kmem_cache *, void *); +static void mon_text_ctor(void *); struct mon_text_ptr { int cnt, limit; @@ -720,7 +720,7 @@ void mon_text_del(struct mon_bus *mbus) /* * Slab interface: constructor. 
*/ -static void mon_text_ctor(struct kmem_cache *slab, void *mem) +static void mon_text_ctor(void *mem) { /* * Nothing to initialize. No, really! diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 9e421eeb672..26f3b43726b 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode) kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; diff --git a/fs/affs/super.c b/fs/affs/super.c index 4e030956640..3a89094f93d 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode) kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; diff --git a/fs/afs/super.c b/fs/afs/super.c index 7e3faeef681..250d8c4d66e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -27,7 +27,7 @@ #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ -static void afs_i_init_once(struct kmem_cache *cachep, void *foo); +static void afs_i_init_once(void *foo); static int afs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt); @@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb) /* * initialise an inode cache slab element prior to any use */ -static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode) +static void afs_i_init_once(void *_vnode) { struct afs_vnode *vnode = _vnode; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index e8717de3bab..02c6e62b72f 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -289,7 +289,7 @@ befs_destroy_inode(struct inode *inode) kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct befs_inode_info *bi = (struct befs_inode_info *) foo; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 053e690ec9e..0ed57b5ee01 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -264,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode) kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct bfs_inode_info *bi = foo; diff --git a/fs/block_dev.c b/fs/block_dev.c index 10d8a0aa871..dcf37cada36 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode) kmem_cache_free(bdev_cachep, bdi); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; struct block_device *bdev = &ei->bdev; diff --git a/fs/buffer.c b/fs/buffer.c index 109b261192d..5fd497cdd6f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3272,7 +3272,7 @@ int bh_submit_read(struct buffer_head *bh) EXPORT_SYMBOL(bh_submit_read); static void -init_buffer_head(struct kmem_cache *cachep, void *data) +init_buffer_head(void *data) { struct buffer_head *bh = data; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 22857c639df..fe5f6809cba 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -766,7 +766,7 @@ const struct file_operations cifs_dir_ops = { }; static void -cifs_init_once(struct kmem_cache *cachep, void *inode) +cifs_init_once(void *inode) { struct cifsInodeInfo *cifsi = 
inode; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2f58dfc7008..830f51abb97 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode) kmem_cache_free(coda_inode_cachep, ITOC(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct coda_inode_info *ei = (struct coda_inode_info *) foo; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6f403cfba14..448dfd597b5 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -578,7 +578,7 @@ static struct file_system_type ecryptfs_fs_type = { * Initializes the ecryptfs_inode_info_cache when it is created */ static void -inode_info_init_once(struct kmem_cache *cachep, void *vptr) +inode_info_init_once(void *vptr) { struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; @@ -589,7 +589,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - void (*ctor)(struct kmem_cache *cache, void *obj); + void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { .cache = &ecryptfs_auth_tok_list_item_cache, diff --git a/fs/efs/super.c b/fs/efs/super.c index d733531b55e..567b134fa1f 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode) kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 31308a3b0b8..fd88c7b43e6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -159,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode) kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 615788c6843..8ddced38467 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -472,7 +472,7 @@ static void ext3_destroy_inode(struct inode *inode) kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1cb371dcd60..b5479b1dff1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -595,7 +595,7 @@ static void ext4_destroy_inode(struct inode *inode) kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 3a9ecac8d61..3222f51c41c 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode) static struct kmem_cache *fat_cache_cachep; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct fat_cache *cache = (struct fat_cache *)foo; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 23676f9d79c..6d266d793e2 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -498,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode) kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct msdos_inode_info *ei = 
(struct msdos_inode_info *)foo; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7d2f7d6e22e..d2249f174e2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -956,7 +956,7 @@ static inline void unregister_fuseblk(void) } #endif -static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo) +static void fuse_inode_init_once(void *foo) { struct inode * inode = foo; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index bcc668d0fad..bb2cc303ac2 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -24,7 +24,7 @@ #include "util.h" #include "glock.h" -static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_inode_once(void *foo) { struct gfs2_inode *ip = foo; @@ -33,7 +33,7 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) ip->i_alloc = NULL; } -static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index ac2ec5ef66e..4abb1047c68 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void hfs_init_once(struct kmem_cache *cachep, void *p) +static void hfs_init_once(void *p) { struct hfs_inode_info *i = p; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 3859118531c..e834e578c93 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void hfsplus_init_once(struct kmem_cache *cachep, void *p) +static void hfsplus_init_once(void *p) { struct hfsplus_inode_info *i = p; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f63a699ec65..b8ae9c90ada 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode) kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dbd01d262ca..3f58923fb39 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -705,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = { }; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; diff --git a/fs/inode.c b/fs/inode.c index 35b6414522e..b6726f64453 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -224,7 +224,7 @@ void inode_init_once(struct inode *inode) EXPORT_SYMBOL(inode_init_once); -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct inode * inode = (struct inode *) foo; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 044a254d526..26948a6033b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode) kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct iso_inode_info *ei = foo; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 7da69eae49e..efd401257ed 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode) kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } -static void 
jffs2_i_init_once(struct kmem_cache *cachep, void *foo) +static void jffs2_i_init_once(void *foo) { struct jffs2_inode_info *f = foo; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 854ff0ec574..c350057087d 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -182,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) #endif -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct metapage *mp = (struct metapage *)foo; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 359c091d896..3630718be39 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -760,7 +760,7 @@ static struct file_system_type jfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; diff --git a/fs/locks.c b/fs/locks.c index 01490300f7c..5eb259e3cd3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock); * Initialises the fields of the file lock which are invariant for * free file_locks. */ -static void init_once(struct kmem_cache *cache, void *foo) +static void init_once(void *foo) { struct file_lock *lock = (struct file_lock *) foo; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 523d7371341..d1d1eb84679 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode) kmem_cache_free(minix_inode_cachep, minix_i(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct minix_inode_info *ei = (struct minix_inode_info *) foo; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2e5ab1204de..d642f0e5b36 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode) kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index df23f987da6..52daefa2f52 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1242,7 +1242,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) #endif } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct nfs_inode *nfsi = (struct nfs_inode *) foo; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3e76f3b216b..4a46743b507 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache; struct kmem_cache *ntfs_big_inode_cache; /* Init once constructor for the inode slab cache. 
*/ -static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo) +static void ntfs_big_inode_init_once(void *foo) { ntfs_inode *ni = (ntfs_inode *)foo; diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index e48aba698b7..533a789c3ef 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp, return writelen; } -static void dlmfs_init_once(struct kmem_cache *cachep, - void *foo) +static void dlmfs_init_once(void *foo) { struct dlmfs_inode_private *ip = (struct dlmfs_inode_private *) foo; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ccecfe5094f..2560b33889a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1118,7 +1118,7 @@ bail: return status; } -static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data) +static void ocfs2_inode_init_once(void *data) { struct ocfs2_inode_info *oi = data; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d17b4fd204e..9f5b054f06b 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -430,7 +430,7 @@ static struct file_system_type openprom_fs_type = { .kill_sb = kill_anon_super, }; -static void op_inode_init_once(struct kmem_cache * cachep, void *data) +static void op_inode_init_once(void *data) { struct op_inode_info *oi = (struct op_inode_info *) data; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 02eca2ed9dd..b37f25dc45a 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -94,7 +94,7 @@ static void proc_destroy_inode(struct inode *inode) kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index b31ab78052b..2aad1044b84 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode) kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2ec748ba0bd..879e54d35c2 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -521,7 +521,7 @@ static void reiserfs_destroy_inode(struct inode *inode) kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 3f13d491c7c..8e51a2aaa97 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -577,7 +577,7 @@ static void romfs_destroy_inode(struct inode *inode) kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct romfs_inode_info *ei = foo; diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 376ef3ee6ed..3528f40ffb0 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode) kmem_cache_free(smb_inode_cachep, SMB_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct smb_inode_info *ei = (struct smb_inode_info *) foo; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c5d60de0658..df0d435baa4 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ 
-326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode) kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *p) +static void init_once(void *p) { struct sysv_inode_info *si = (struct sysv_inode_info *)p; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 00eb9c68ad0..ca1e2d4e03c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1841,7 +1841,7 @@ static struct file_system_type ubifs_fs_type = { /* * Inode slab cache constructor. */ -static void inode_slab_ctor(struct kmem_cache *cachep, void *obj) +static void inode_slab_ctor(void *obj) { struct ubifs_inode *ui = obj; inode_init_once(&ui->vfs_inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index 44cc702f96c..5698bbf83bb 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode) kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 227c9d70004..3e30e40aa24 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1302,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode) kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 5e956490297..a20683cf74d 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name) static inline kmem_zone_t * kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, - void (*construct)(kmem_zone_t *, void *)) + void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 742b2c7852c..943381284e2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -843,7 +843,6 @@ xfs_fs_destroy_inode( STATIC void xfs_fs_inode_init_once( - kmem_zone_t *zonep, void *vnode) { inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); diff --git a/include/linux/slab.h b/include/linux/slab.h index 41103910f8a..9ff8e849940 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -58,7 +58,7 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, - void (*)(struct kmem_cache *, void *)); + void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d117ea2825a..5bad61a93f6 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -85,7 +85,7 @@ struct kmem_cache { struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ const char *name; /* Name (only for display!) 
*/ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 1fdc2eb2f6d..474984f9e03 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -207,7 +207,7 @@ static int mqueue_get_sb(struct file_system_type *fs_type, return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo; diff --git a/kernel/fork.c b/kernel/fork.c index b99d73e971a..80e83e459b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(struct kmem_cache *cachep, void *data) +static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; diff --git a/lib/idr.c b/lib/idr.c index 3476f8203e9..e728c7fccc4 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -607,7 +607,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id) } EXPORT_SYMBOL(idr_replace); -static void idr_cache_ctor(struct kmem_cache *idr_layer_cache, void *idr_layer) +static void idr_cache_ctor(void *idr_layer) { memset(idr_layer, 0, sizeof(struct idr_layer)); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9c4f1ffa286..be86b32bc87 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1183,7 +1183,7 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) EXPORT_SYMBOL(radix_tree_tagged); static void -radix_tree_node_ctor(struct kmem_cache *cachep, void *node) +radix_tree_node_ctor(void *node) { memset(node, 0, sizeof(struct radix_tree_node)); } diff --git a/mm/rmap.c b/mm/rmap.c index abbd29f7c43..39ae5a9bf38 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -138,7 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(struct kmem_cache *cachep, void *data) +static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; diff --git a/mm/shmem.c b/mm/shmem.c index 1089092aeca..952d361774b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2352,7 +2352,7 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; diff --git a/mm/slab.c b/mm/slab.c index 052e7d64537..918f04f7fef 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -406,7 +406,7 @@ struct kmem_cache { unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *obj); /* 5) cache creation/removal */ const char *name; @@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep, * They must also be threaded. 
*/ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(cachep, objp + obj_offset(cachep)); + cachep->ctor(objp + obj_offset(cachep)); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #endif slab_bufctl(slabp)[i] = i + 1; } @@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", diff --git a/mm/slob.c b/mm/slob.c index de268eb7ac7..d8fbd4d1bfa 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -525,12 +525,11 @@ struct kmem_cache { unsigned int size, align; unsigned long flags; const char *name; - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *); }; struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *c; @@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) - c->ctor(c, b); + c->ctor(b); return b; } diff --git a/mm/slub.c b/mm/slub.c index 77c21cf53ff..b7e2cd5d82d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. 
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } @@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { memset(s, 0, kmem_size); s->name = name; @@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { struct kmem_cache *s; @@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size, } struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; diff --git a/net/socket.c b/net/socket.c index 1310a82cbba..8ef8ba81b9e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -265,7 +265,7 @@ static void sock_destroy_inode(struct inode *inode) container_of(inode, struct socket_alloc, vfs_inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct socket_alloc *ei = (struct socket_alloc *)foo; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 5a9b0e7828c..23a2b8f6dc4 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -897,7 +897,7 @@ static struct file_system_type rpc_pipe_fs_type = { }; static void -init_once(struct kmem_cache * cachep, void *foo) +init_once(void *foo) { struct rpc_inode *rpci = (struct rpc_inode *) foo; -- cgit v1.2.3-70-g09d2 From 5c752ad9f35910ff1912b3f3ae82878178ddc432 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:40 -0700 Subject: Use WARN() in fs/ Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. 
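The transformation is purely local; a minimal sketch of the pattern (the condition and message are hypothetical, not taken from the files below):

#include <linux/kernel.h>

static void report_leak_example(int leaked, void *obj)
{
        /*
         * Before:
         *      printk(KERN_WARNING "leak detected on %p\n", obj);
         *      WARN_ON(1);
         *
         * After: one call, and the text is printed from inside the
         * warning section, so automated collectors see the message
         * and the backtrace together.
         */
        WARN(leaked, KERN_WARNING "leak detected on %p\n", obj);
}
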
Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 3 +-- fs/namespace.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 5fd497cdd6f..f9580501963 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1214,8 +1214,7 @@ void __brelse(struct buffer_head * buf) put_bh(buf); return; } - printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); - WARN_ON(1); + WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); } /* diff --git a/fs/namespace.c b/fs/namespace.c index 4f6f7635b59..f30b11e2240 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -309,10 +309,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt) */ if ((atomic_read(&mnt->__mnt_writers) < 0) && !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { - printk(KERN_DEBUG "leak detected on mount(%p) writers " + WARN(1, KERN_DEBUG "leak detected on mount(%p) writers " "count: %d\n", mnt, atomic_read(&mnt->__mnt_writers)); - WARN_ON(1); /* use the flag to keep the dmesg spam down */ mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; } -- cgit v1.2.3-70-g09d2 From 99fcd77d15357e8ba51005c25cc750b9c28b2688 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:41 -0700 Subject: Use WARN() in fs/sysfs Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. Also, with this, one fo the if() sections collapses entirely into the WARN(). Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysfs/dir.c | 5 +---- fs/sysfs/file.c | 3 +-- fs/sysfs/group.c | 3 +-- 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c1a7efb310b..aedaeba82ae 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -459,11 +459,8 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) int ret; ret = __sysfs_add_one(acxt, sd); - if (ret == -EEXIST) { - printk(KERN_WARNING "sysfs: duplicate filename '%s' " + WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' " "can not be created\n", sd->s_name); - WARN_ON(1); - } return ret; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 3f07893ff89..c9e4e5091da 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -337,9 +337,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file) if (kobj->ktype && kobj->ktype->sysfs_ops) ops = kobj->ktype->sysfs_ops; else { - printk(KERN_ERR "missing sysfs attribute operations for " + WARN(1, KERN_ERR "missing sysfs attribute operations for " "kobject: %s\n", kobject_name(kobj)); - WARN_ON(1); goto err_out; } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index eeba38417b1..fe611949a7f 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -134,9 +134,8 @@ void sysfs_remove_group(struct kobject * kobj, if (grp->name) { sd = sysfs_get_dirent(dir_sd, grp->name); if (!sd) { - printk(KERN_WARNING "sysfs group %p not found for " + WARN(!sd, KERN_WARNING "sysfs group %p not found for " "kobject '%s'\n", grp, kobject_name(kobj)); - WARN_ON(!sd); return; } } else -- cgit v1.2.3-70-g09d2 From 267e2a9c71b8e088ac307f9549f71468e86e26c1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jul 2008 19:45:41 -0700 Subject: Use WARN() in fs/proc/ Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. 
This way, the entire if() {} section can collapse into the WARN() as well. Signed-off-by: Arjan van de Ven Acked-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bc0a0dd2d84..cb4096cc3fb 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -806,12 +806,9 @@ continue_removing: if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; - if (de->subdir) { - printk(KERN_WARNING "%s: removing non-empty directory " + WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " "'%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, de->subdir->name); - WARN_ON(1); - } if (atomic_dec_and_test(&de->count)) free_proc_entry(de); } -- cgit v1.2.3-70-g09d2 From 6341c393fcc37d58727865f1ee2f65e632e9d4f0 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:44 -0700 Subject: tracehook: exec This moves all the ptrace hooks related to exec into tracehook.h inlines. This also lifts the calls for tracing out of the binfmt load_binary hooks into search_binary_handler() after it calls into the binfmt module. This change has no effect, since all the binfmt modules' load_binary functions did the call at the end on success, and now search_binary_handler() does it immediately after return if successful. We consolidate the repeated code, and binfmt modules no longer need to import ptrace_notify(). Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 6 ------ fs/binfmt_aout.c | 6 ------ fs/binfmt_elf.c | 6 ------ fs/binfmt_elf_fdpic.c | 7 ------- fs/binfmt_flat.c | 3 --- fs/binfmt_som.c | 2 -- fs/exec.c | 12 ++++-------- include/linux/tracehook.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 50 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 58cccb6483b..a0e1dbe67dc 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -441,12 +441,6 @@ beyond_if: regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; set_fs(USER_DS); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } return 0; } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ba4cddb92f1..204cfd1d767 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -444,12 +444,6 @@ beyond_if: regs->gp = ex.a_gpvalue; #endif start_thread(regs, ex.a_entry, current->mm->start_stack); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3b6ff854d98..655ed8d30a8 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1003,12 +1003,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) #endif start_thread(regs, elf_entry, bprm->p); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } retval = 0; out: kfree(loc); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1b59b1edf26..fdeadab2f18 
100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -433,13 +433,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; start_thread(regs, entryaddr, current->mm->start_stack); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } - retval = 0; error: diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 2cb1acda3a8..56372ecf169 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -920,9 +920,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) start_thread(regs, start_addr, current->mm->start_stack); - if (current->ptrace & PT_PTRACED) - send_sig(SIGTRAP, current, 0); - return 0; } diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index fdc36bfd6a7..68be580ba28 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -274,8 +274,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) map_hpux_gateway_page(current,current->mm); start_thread_som(regs, som_entry, bprm->p); - if (current->ptrace & PT_PTRACED) - send_sig(SIGTRAP, current, 0); return 0; /* error cleanup */ diff --git a/fs/exec.c b/fs/exec.c index 5e559013e30..b8792a13153 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -42,13 +42,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -1071,13 +1071,8 @@ EXPORT_SYMBOL(prepare_binprm); static int unsafe_exec(struct task_struct *p) { - int unsafe = 0; - if (p->ptrace & PT_PTRACED) { - if (p->ptrace & PT_PTRACE_CAP) - unsafe |= LSM_UNSAFE_PTRACE_CAP; - else - unsafe |= LSM_UNSAFE_PTRACE; - } + int unsafe = tracehook_unsafe_exec(p); + if (atomic_read(&p->fs->count) > 1 || atomic_read(&p->files->count) > 1 || atomic_read(&p->sighand->count) > 1) @@ -1214,6 +1209,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) read_unlock(&binfmt_lock); retval = fn(bprm, regs); if (retval >= 0) { + tracehook_report_exec(fmt, bprm, regs); put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index bea0f3eeff5..6276353709c 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -48,5 +48,51 @@ #include #include +#include +struct linux_binprm; + +/** + * tracehook_unsafe_exec - check for exec declared unsafe due to tracing + * @task: current task doing exec + * + * Return %LSM_UNSAFE_* bits applied to an exec because of tracing. + * + * Called with task_lock() held on @task. + */ +static inline int tracehook_unsafe_exec(struct task_struct *task) +{ + int unsafe = 0; + int ptrace = task_ptrace(task); + if (ptrace & PT_PTRACED) { + if (ptrace & PT_PTRACE_CAP) + unsafe |= LSM_UNSAFE_PTRACE_CAP; + else + unsafe |= LSM_UNSAFE_PTRACE; + } + return unsafe; +} + +/** + * tracehook_report_exec - a successful exec was completed + * @fmt: &struct linux_binfmt that performed the exec + * @bprm: &struct linux_binprm containing exec details + * @regs: user-mode register state + * + * An exec just completed, we are shortly going to return to user mode. + * The freshly initialized register state can be seen and changed in @regs. + * The name, file and other pointers in @bprm are still on hand to be + * inspected, but will be freed as soon as this returns. + * + * Called with no locks, but with some kernel resources held live + * and a reference on @fmt->module. 
+ */ +static inline void tracehook_report_exec(struct linux_binfmt *fmt, + struct linux_binprm *bprm, + struct pt_regs *regs) +{ + if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) && + unlikely(task_ptrace(current) & PT_PTRACED)) + send_sig(SIGTRAP, current, 0); +} #endif /* */ -- cgit v1.2.3-70-g09d2 From 0d094efeb1e98010c6b99923f1eb7e17bf1e3a74 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:49 -0700 Subject: tracehook: tracehook_tracer_task This adds the tracehook_tracer_task() hook to consolidate all forms of "Who is using ptrace on me?" logic. This is used for "TracerPid:" in /proc and for permission checks. We also clean up the selinux code the called an identical accessor. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 9 +++++++-- fs/proc/base.c | 13 +++++++++---- include/linux/tracehook.h | 18 ++++++++++++++++++ security/selinux/hooks.c | 22 +++------------------- 4 files changed, 37 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 797d775e035..0d6eb33597c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -168,8 +169,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, rcu_read_lock(); ppid = pid_alive(p) ? task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tpid = pid_alive(p) && p->ptrace ? - task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0; + tpid = 0; + if (pid_alive(p)) { + struct task_struct *tracer = tracehook_tracer_task(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + } seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" diff --git a/fs/proc/base.c b/fs/proc/base.c index a891fe4cb43..4b74dba69a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -231,10 +232,14 @@ static int check_mem_permission(struct task_struct *task) * If current is actively ptrace'ing, and would also be * permitted to freshly attach with ptrace now, permit it. */ - if (task->parent == current && (task->ptrace & PT_PTRACED) && - task_is_stopped_or_traced(task) && - ptrace_may_access(task, PTRACE_MODE_ATTACH)) - return 0; + if (task_is_stopped_or_traced(task)) { + int match; + rcu_read_lock(); + match = (tracehook_tracer_task(task) == current); + rcu_read_unlock(); + if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) + return 0; + } /* * Noone else is allowed. diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 9a5b3be2503..6468ca0fe69 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -72,6 +72,24 @@ static inline int tracehook_unsafe_exec(struct task_struct *task) return unsafe; } +/** + * tracehook_tracer_task - return the task that is tracing the given task + * @tsk: task to consider + * + * Returns NULL if noone is tracing @task, or the &struct task_struct + * pointer to its tracer. + * + * Must called under rcu_read_lock(). The pointer returned might be kept + * live only by RCU. During exec, this may be called with task_lock() + * held on @task, still held from when tracehook_unsafe_exec() was called. 
+ */ +static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk) +{ + if (task_ptrace(tsk) & PT_PTRACED) + return rcu_dereference(tsk->parent); + return NULL; +} + /** * tracehook_report_exec - a successful exec was completed * @fmt: &struct linux_binfmt that performed the exec diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 63f131fc42e..3481cde5bf1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include @@ -1971,22 +1971,6 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages) return __vm_enough_memory(mm, pages, cap_sys_admin); } -/** - * task_tracer_task - return the task that is tracing the given task - * @task: task to consider - * - * Returns NULL if noone is tracing @task, or the &struct task_struct - * pointer to its tracer. - * - * Must be called under rcu_read_lock(). - */ -static struct task_struct *task_tracer_task(struct task_struct *task) -{ - if (task->ptrace & PT_PTRACED) - return rcu_dereference(task->parent); - return NULL; -} - /* binprm security operations */ static int selinux_bprm_alloc_security(struct linux_binprm *bprm) @@ -2238,7 +2222,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) u32 ptsid = 0; rcu_read_lock(); - tracer = task_tracer_task(current); + tracer = tracehook_tracer_task(current); if (likely(tracer != NULL)) { sec = tracer->security; ptsid = sec->sid; @@ -5247,7 +5231,7 @@ static int selinux_setprocattr(struct task_struct *p, Otherwise, leave SID unchanged and fail. */ task_lock(p); rcu_read_lock(); - tracer = task_tracer_task(p); + tracer = tracehook_tracer_task(p); if (tracer != NULL) { struct task_security_struct *ptsec = tracer->security; u32 ptsid = ptsec->sid; -- cgit v1.2.3-70-g09d2 From ebcb67341fee34061430f3367f2e507e52ee051b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:46:00 -0700 Subject: /proc/PID/syscall This adds /proc/PID/syscall and /proc/PID/task/TID/syscall magic files. These use task_current_syscall() to show the task's current system call number and argument registers, stack pointer and PC. For a task blocked but not in a syscall, the file shows "-1" in place of the syscall number, followed by only the SP and PC. For a task that's not blocked, it shows "running". 
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 4b74dba69a6..81bce6791bf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -509,6 +509,26 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) return count; } +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK +static int proc_pid_syscall(struct task_struct *task, char *buffer) +{ + long nr; + unsigned long args[6], sp, pc; + + if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + return sprintf(buffer, "running\n"); + + if (nr < 0) + return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + + return sprintf(buffer, + "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + nr, + args[0], args[1], args[2], args[3], args[4], args[5], + sp, pc); +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ + /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ @@ -2477,6 +2497,9 @@ static const struct pid_entry tgid_base_stuff[] = { INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUSR, pid_syscall), #endif INF("cmdline", S_IRUGO, pid_cmdline), ONE("stat", S_IRUGO, tgid_stat), @@ -2809,6 +2832,9 @@ static const struct pid_entry tid_base_stuff[] = { INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), +#endif +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUSR, pid_syscall), #endif INF("cmdline", S_IRUGO, pid_cmdline), ONE("stat", S_IRUGO, tid_stat), -- cgit v1.2.3-70-g09d2 From d70b67c8bc72ee23b55381bd6a884f4796692f77 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 2 Jul 2008 21:30:15 +0200 Subject: [patch] vfs: fix lookup on deleted directory Lookup can install a child dentry for a deleted directory. This keeps the directory dentry alive, and the inode pinned in the cache and on disk, even after all external references have gone away. This isn't a big problem normally, since memory pressure or umount will clear out the directory dentry and its children, releasing the inode. But for UBIFS this causes problems because its orphan area can overflow. Fix this by returning ENOENT for all lookups on a S_DEAD directory before creating a child dentry. Thanks to Zoltan Sogor for noticing this while testing UBIFS, and Artem for the excellent analysis of the problem and testing. Reported-by: Artem Bityutskiy Tested-by: Artem Bityutskiy Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/namei.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 01e67dddcc3..3b26a240ade 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -519,7 +519,14 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s */ result = d_lookup(parent, name); if (!result) { - struct dentry * dentry = d_alloc(parent, name); + struct dentry *dentry; + + /* Don't create child dentry for a dead directory. 
*/ + result = ERR_PTR(-ENOENT); + if (IS_DEADDIR(dir)) + goto out_unlock; + + dentry = d_alloc(parent, name); result = ERR_PTR(-ENOMEM); if (dentry) { result = dir->i_op->lookup(dir, dentry, nd); @@ -528,6 +535,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s else result = dentry; } +out_unlock: mutex_unlock(&dir->i_mutex); return result; } @@ -1317,7 +1325,14 @@ static struct dentry *__lookup_hash(struct qstr *name, dentry = cached_lookup(base, name, nd); if (!dentry) { - struct dentry *new = d_alloc(base, name); + struct dentry *new; + + /* Don't create child dentry for a dead directory. */ + dentry = ERR_PTR(-ENOENT); + if (IS_DEADDIR(inode)) + goto out; + + new = d_alloc(base, name); dentry = ERR_PTR(-ENOMEM); if (!new) goto out; -- cgit v1.2.3-70-g09d2 From d2d9648ec6858e19d16a0b16da62534e85888653 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Jul 2008 14:16:09 +0200 Subject: [PATCH] reuse xxx_fifo_fops for xxx_pipe_fops Merge fifo and pipe file_operations. Signed-off-by: Denys Vlasenko Signed-off-by: Al Viro --- fs/fifo.c | 8 ++++---- fs/pipe.c | 51 ++++++++------------------------------------------- include/linux/fs.h | 6 +++--- 3 files changed, 15 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 9785e36f81e..987bf941149 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -57,7 +57,7 @@ static int fifo_open(struct inode *inode, struct file *filp) * POSIX.1 says that O_NONBLOCK means return with the FIFO * opened, even when there is no process writing the FIFO. */ - filp->f_op = &read_fifo_fops; + filp->f_op = &read_pipefifo_fops; pipe->r_counter++; if (pipe->readers++ == 0) wake_up_partner(inode); @@ -86,7 +86,7 @@ static int fifo_open(struct inode *inode, struct file *filp) if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; - filp->f_op = &write_fifo_fops; + filp->f_op = &write_pipefifo_fops; pipe->w_counter++; if (!pipe->writers++) wake_up_partner(inode); @@ -105,7 +105,7 @@ static int fifo_open(struct inode *inode, struct file *filp) * This implementation will NEVER block on a O_RDWR open, since * the process can at least talk to itself. */ - filp->f_op = &rdwr_fifo_fops; + filp->f_op = &rdwr_pipefifo_fops; pipe->readers++; pipe->writers++; @@ -151,5 +151,5 @@ err_nocleanup: * depending on the access mode of the file... */ const struct file_operations def_fifo_fops = { - .open = fifo_open, /* will set read or write pipe_fops */ + .open = fifo_open, /* will set read_ or write_pipefifo_fops */ }; diff --git a/fs/pipe.c b/fs/pipe.c index 10c4e9aa5c4..fcba6542b8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -777,45 +777,10 @@ pipe_rdwr_open(struct inode *inode, struct file *filp) /* * The file_operations structs are not static because they * are also used in linux/fs/fifo.c to do operations on FIFOs. + * + * Pipes reuse fifos' file_operations structs. 
*/ -const struct file_operations read_fifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = bad_pipe_w, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_read_open, - .release = pipe_read_release, - .fasync = pipe_read_fasync, -}; - -const struct file_operations write_fifo_fops = { - .llseek = no_llseek, - .read = bad_pipe_r, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_write_open, - .release = pipe_write_release, - .fasync = pipe_write_fasync, -}; - -const struct file_operations rdwr_fifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_rdwr_open, - .release = pipe_rdwr_release, - .fasync = pipe_rdwr_fasync, -}; - -static const struct file_operations read_pipe_fops = { +const struct file_operations read_pipefifo_fops = { .llseek = no_llseek, .read = do_sync_read, .aio_read = pipe_read, @@ -827,7 +792,7 @@ static const struct file_operations read_pipe_fops = { .fasync = pipe_read_fasync, }; -static const struct file_operations write_pipe_fops = { +const struct file_operations write_pipefifo_fops = { .llseek = no_llseek, .read = bad_pipe_r, .write = do_sync_write, @@ -839,7 +804,7 @@ static const struct file_operations write_pipe_fops = { .fasync = pipe_write_fasync, }; -static const struct file_operations rdwr_pipe_fops = { +const struct file_operations rdwr_pipefifo_fops = { .llseek = no_llseek, .read = do_sync_read, .aio_read = pipe_read, @@ -927,7 +892,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->readers = pipe->writers = 1; - inode->i_fop = &rdwr_pipe_fops; + inode->i_fop = &rdwr_pipefifo_fops; /* * Mark the inode dirty from the very beginning, @@ -978,7 +943,7 @@ struct file *create_write_pipe(int flags) d_instantiate(dentry, inode); err = -ENFILE; - f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipe_fops); + f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; f->f_mapping = inode->i_mapping; @@ -1020,7 +985,7 @@ struct file *create_read_pipe(struct file *wrf, int flags) f->f_pos = 0; f->f_flags = O_RDONLY | (flags & O_NONBLOCK); - f->f_op = &read_pipe_fops; + f->f_op = &read_pipefifo_fops; f->f_mode = FMODE_READ; f->f_version = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 53d2edb709b..7721a2ac9c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1696,9 +1696,9 @@ extern void init_special_inode(struct inode *, umode_t, dev_t); extern void make_bad_inode(struct inode *); extern int is_bad_inode(struct inode *); -extern const struct file_operations read_fifo_fops; -extern const struct file_operations write_fifo_fops; -extern const struct file_operations rdwr_fifo_fops; +extern const struct file_operations read_pipefifo_fops; +extern const struct file_operations write_pipefifo_fops; +extern const struct file_operations rdwr_pipefifo_fops; extern int fs_may_remount_ro(struct super_block *); -- cgit v1.2.3-70-g09d2 From 7ac6cd653d7c31ad6b7bb5b88c549c4ebf628c34 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Jul 2008 23:07:54 +0200 Subject: [patch] hppfs: remove hppfs_permission hppfs_permission() is equivalent to the '.permission == NULL' case. 
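For context, the equivalence comes from the VFS fallback in fs/namei.c, shown here in simplified, paraphrased form (see also the permission() hunks later in this series):

        /* roughly what permission() does when a filesystem supplies
         * no ->permission method of its own: */
        if (inode->i_op && inode->i_op->permission)
                retval = inode->i_op->permission(inode, mask, nd);
        else
                retval = generic_permission(inode, mask, NULL);

        /* so an hppfs_permission() that only calls
         * generic_permission(inode, mask, NULL) adds nothing */
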
Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/hppfs/hppfs.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 65077aa90f0..2b3d1828db9 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -655,20 +655,13 @@ static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); } -int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return generic_permission(inode, mask, NULL); -} - static const struct inode_operations hppfs_dir_iops = { .lookup = hppfs_lookup, - .permission = hppfs_permission, }; static const struct inode_operations hppfs_link_iops = { .readlink = hppfs_readlink, .follow_link = hppfs_follow_link, - .permission = hppfs_permission, }; static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) -- cgit v1.2.3-70-g09d2 From 9043476f726802f4b00c96d0c4f418dde48d1304 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 08:54:06 -0400 Subject: [PATCH] sanitize proc_sysctl * keep references to ctl_table_head and ctl_table in /proc/sys inodes * grab the former during operations, use the latter for access to entry if that succeeds * have ->d_compare() check if table should be seen for one who does lookup; that allows us to avoid flipping inodes - if we have the same name resolve to different things, we'll just keep several dentries and ->d_compare() will reject the wrong ones. * have ->lookup() and ->readdir() scan the table of our inode first, then walk all ctl_table_header and scan ->attached_by for those that are attached to our directory. * implement ->getattr(). * get rid of insane amounts of tree-walking * get rid of the need to know dentry in ->permission() and of the contortions induced by that. 
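The per-operation pattern that replaces the old tree-walking can be summarised roughly as follows (condensed from the hunks below, error paths trimmed):

        struct ctl_table_header *head = grab_header(inode);    /* pin the header */
        struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* entry cached in the inode */

        if (IS_ERR(head))
                return PTR_ERR(head);
        /* ... do the lookup / read / write / getattr using head and table ... */
        sysctl_head_finish(head);                               /* drop the pin */
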
Signed-off-by: Al Viro --- fs/proc/inode.c | 5 + fs/proc/proc_sysctl.c | 427 ++++++++++++++++++++++-------------------------- include/linux/proc_fs.h | 5 + include/linux/sysctl.h | 1 + kernel/sysctl.c | 15 ++ 5 files changed, 218 insertions(+), 235 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b37f25dc45a..8bb03f056c2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,8 @@ static void proc_delete_inode(struct inode *inode) module_put(de->owner); de_put(de); } + if (PROC_I(inode)->sysctl) + sysctl_head_put(PROC_I(inode)->sysctl); clear_inode(inode); } @@ -84,6 +87,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; + ei->sysctl = NULL; + ei->sysctl_entry = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5acc001d49f..fa1ec2433e4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -10,149 +10,110 @@ static struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; +static const struct file_operations proc_sys_dir_file_operations; +static const struct inode_operations proc_sys_dir_operations; -static void proc_sys_refresh_inode(struct inode *inode, struct ctl_table *table) -{ - /* Refresh the cached information bits in the inode */ - if (table) { - inode->i_uid = 0; - inode->i_gid = 0; - inode->i_mode = table->mode; - if (table->proc_handler) { - inode->i_mode |= S_IFREG; - inode->i_nlink = 1; - } else { - inode->i_mode |= S_IFDIR; - inode->i_nlink = 0; /* It is too hard to figure out */ - } - } -} - -static struct inode *proc_sys_make_inode(struct inode *dir, struct ctl_table *table) +static struct inode *proc_sys_make_inode(struct super_block *sb, + struct ctl_table_header *head, struct ctl_table *table) { struct inode *inode; - struct proc_inode *dir_ei, *ei; - int depth; + struct proc_inode *ei; - inode = new_inode(dir->i_sb); + inode = new_inode(sb); if (!inode) goto out; - /* A directory is always one deeper than it's parent */ - dir_ei = PROC_I(dir); - depth = dir_ei->fd + 1; - + sysctl_head_get(head); ei = PROC_I(inode); - ei->fd = depth; + ei->sysctl = head; + ei->sysctl_entry = table; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &proc_sys_inode_operations; - inode->i_fop = &proc_sys_file_operations; inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ - proc_sys_refresh_inode(inode, table); + inode->i_mode = table->mode; + if (!table->child) { + inode->i_mode |= S_IFREG; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + } else { + inode->i_mode |= S_IFDIR; + inode->i_nlink = 0; + inode->i_op = &proc_sys_dir_operations; + inode->i_fop = &proc_sys_dir_file_operations; + } out: return inode; } -static struct dentry *proc_sys_ancestor(struct dentry *dentry, int depth) -{ - for (;;) { - struct proc_inode *ei; - - ei = PROC_I(dentry->d_inode); - if (ei->fd == depth) - break; /* found */ - - dentry = dentry->d_parent; - } - return dentry; -} - -static struct ctl_table *proc_sys_lookup_table_one(struct ctl_table *table, - struct qstr *name) +static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) { int len; - for ( ; 
table->ctl_name || table->procname; table++) { + for ( ; p->ctl_name || p->procname; p++) { - if (!table->procname) + if (!p->procname) continue; - len = strlen(table->procname); + len = strlen(p->procname); if (len != name->len) continue; - if (memcmp(table->procname, name->name, len) != 0) + if (memcmp(p->procname, name->name, len) != 0) continue; /* I have a match */ - return table; + return p; } return NULL; } -static struct ctl_table *proc_sys_lookup_table(struct dentry *dentry, - struct ctl_table *table) +struct ctl_table_header *grab_header(struct inode *inode) { - struct dentry *ancestor; - struct proc_inode *ei; - int depth, i; + if (PROC_I(inode)->sysctl) + return sysctl_head_grab(PROC_I(inode)->sysctl); + else + return sysctl_head_next(NULL); +} - ei = PROC_I(dentry->d_inode); - depth = ei->fd; +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ctl_table_header *head = grab_header(dir); + struct ctl_table *table = PROC_I(dir)->sysctl_entry; + struct ctl_table_header *h = NULL; + struct qstr *name = &dentry->d_name; + struct ctl_table *p; + struct inode *inode; + struct dentry *err = ERR_PTR(-ENOENT); - if (depth == 0) - return table; + if (IS_ERR(head)) + return ERR_CAST(head); - for (i = 1; table && (i <= depth); i++) { - ancestor = proc_sys_ancestor(dentry, i); - table = proc_sys_lookup_table_one(table, &ancestor->d_name); - if (table) - table = table->child; + if (table && !table->child) { + WARN_ON(1); + goto out; } - return table; - -} -static struct ctl_table *proc_sys_lookup_entry(struct dentry *dparent, - struct qstr *name, - struct ctl_table *table) -{ - table = proc_sys_lookup_table(dparent, table); - if (table) - table = proc_sys_lookup_table_one(table, name); - return table; -} -static struct ctl_table *do_proc_sys_lookup(struct dentry *parent, - struct qstr *name, - struct ctl_table_header **ptr) -{ - struct ctl_table_header *head; - struct ctl_table *table = NULL; + table = table ? table->child : head->ctl_table; - for (head = sysctl_head_next(NULL); head; - head = sysctl_head_next(head)) { - table = proc_sys_lookup_entry(parent, name, head->ctl_table); - if (table) - break; + p = find_in_table(table, name); + if (!p) { + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) + continue; + p = find_in_table(h->attached_by, name); + if (p) + break; + } } - *ptr = head; - return table; -} - -static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) -{ - struct ctl_table_header *head; - struct inode *inode; - struct dentry *err; - struct ctl_table *table; - err = ERR_PTR(-ENOENT); - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - if (!table) + if (!p) goto out; err = ERR_PTR(-ENOMEM); - inode = proc_sys_make_inode(dir, table); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (h) + sysctl_head_finish(h); + if (!inode) goto out; @@ -168,22 +129,14 @@ out: static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, size_t count, loff_t *ppos, int write) { - struct dentry *dentry = filp->f_dentry; - struct ctl_table_header *head; - struct ctl_table *table; + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; ssize_t error; size_t res; - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - /* Has the sysctl entry disappeared on us? 
*/ - error = -ENOENT; - if (!table) - goto out; - - /* Has the sysctl entry been replaced by a directory? */ - error = -EISDIR; - if (!table->proc_handler) - goto out; + if (IS_ERR(head)) + return PTR_ERR(head); /* * At this point we know that the sysctl was not unregistered @@ -193,6 +146,11 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) goto out; + /* if that can happen at all, it should be -EINVAL, not -EISDIR */ + error = -EINVAL; + if (!table->proc_handler) + goto out; + /* careful: calling conventions are nasty here */ res = count; error = table->proc_handler(table, write, filp, buf, &res, ppos); @@ -218,82 +176,86 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, static int proc_sys_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct ctl_table *table) + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) { - struct ctl_table_header *head; - struct ctl_table *child_table = NULL; struct dentry *child, *dir = filp->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; unsigned type = DT_UNKNOWN; - int ret; qname.name = table->procname; qname.len = strlen(table->procname); qname.hash = full_name_hash(qname.name, qname.len); - /* Suppress duplicates. - * Only fill a directory entry if it is the value that - * an ordinary lookup of that name returns. Hide all - * others. - * - * If we ever cache this translation in the dcache - * I should do a dcache lookup first. But for now - * it is just simpler not to. - */ - ret = 0; - child_table = do_proc_sys_lookup(dir, &qname, &head); - sysctl_head_finish(head); - if (child_table != table) - return 0; - child = d_lookup(dir, &qname); if (!child) { - struct dentry *new; - new = d_alloc(dir, &qname); - if (new) { - inode = proc_sys_make_inode(dir->d_inode, table); - if (!inode) - child = ERR_PTR(-ENOMEM); - else { - new->d_op = &proc_sys_dentry_operations; - d_add(new, inode); + child = d_alloc(dir, &qname); + if (child) { + inode = proc_sys_make_inode(dir->d_sb, head, table); + if (!inode) { + dput(child); + return -ENOMEM; + } else { + child->d_op = &proc_sys_dentry_operations; + d_add(child, inode); } - if (child) - dput(new); - else - child = new; + } else { + return -ENOMEM; } } - if (!child || IS_ERR(child) || !child->d_inode) - goto end_instantiate; inode = child->d_inode; - if (inode) { - ino = inode->i_ino; - type = inode->i_mode >> 12; - } + ino = inode->i_ino; + type = inode->i_mode >> 12; dput(child); -end_instantiate: - if (!ino) - ino= find_inode_number(dir, &qname); - if (!ino) - ino = 1; - return filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); + return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); +} + +static int scan(struct ctl_table_header *head, ctl_table *table, + unsigned long *pos, struct file *file, + void *dirent, filldir_t filldir) +{ + + for (; table->ctl_name || table->procname; table++, (*pos)++) { + int res; + + /* Can't do anything without a proc name */ + if (!table->procname) + continue; + + if (*pos < file->f_pos) + continue; + + res = proc_sys_fill_cache(file, dirent, filldir, head, table); + if (res) + return res; + + file->f_pos = *pos + 1; + } + return 0; } static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct dentry *dentry = filp->f_dentry; + struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; - struct ctl_table_header *head 
= NULL; - struct ctl_table *table; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + struct ctl_table_header *h = NULL; unsigned long pos; - int ret; + int ret = -EINVAL; + + if (IS_ERR(head)) + return PTR_ERR(head); - ret = -ENOTDIR; - if (!S_ISDIR(inode->i_mode)) + if (table && !table->child) { + WARN_ON(1); goto out; + } + + table = table ? table->child : head->ctl_table; ret = 0; /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ @@ -311,30 +273,17 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) } pos = 2; - /* - Find each instance of the directory - * - Read all entries in each instance - * - Before returning an entry to user space lookup the entry - * by name and if I find a different entry don't return - * this one because it means it is a buried dup. - * For sysctl this should only happen for directory entries. - */ - for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) { - table = proc_sys_lookup_table(dentry, head->ctl_table); + ret = scan(head, table, &pos, filp, dirent, filldir); + if (ret) + goto out; - if (!table) + for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { + if (h->attached_to != table) continue; - - for (; table->ctl_name || table->procname; table++, pos++) { - /* Can't do anything without a proc name */ - if (!table->procname) - continue; - - if (pos < filp->f_pos) - continue; - - if (proc_sys_fill_cache(filp, dirent, filldir, table) < 0) - goto out; - filp->f_pos = pos + 1; + ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); + if (ret) { + sysctl_head_finish(h); + break; } } ret = 1; @@ -349,47 +298,18 @@ static int proc_sys_permission(struct inode *inode, int mask, struct nameidata * * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ - struct ctl_table_header *head; - struct ctl_table *table; - struct dentry *dentry; - int mode; - int depth; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; int error; - head = NULL; - depth = PROC_I(inode)->fd; - - /* First check the cached permissions, in case we don't have - * enough information to lookup the sysctl table entry. - */ - error = -EACCES; - mode = inode->i_mode; - - if (current->euid == 0) - mode >>= 6; - else if (in_group_p(0)) - mode >>= 3; - - if ((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask) - error = 0; - - /* If we can't get a sysctl table entry the permission - * checks on the cached mode will have to be enough. - */ - if (!nd || !depth) - goto out; + if (IS_ERR(head)) + return PTR_ERR(head); - dentry = nd->path.dentry; - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); + if (!table) /* global root - r-xr-xr-x */ + error = mask & MAY_WRITE ? -EACCES : 0; + else /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(head->root, table, mask); - /* If the entry does not exist deny permission */ - error = -EACCES; - if (!table) - goto out; - - /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(head->root, table, mask); -out: sysctl_head_finish(head); return error; } @@ -409,33 +329,70 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) return error; } -/* I'm lazy and don't distinguish between files and directories, - * until access time. 
- */ +static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (IS_ERR(head)) + return PTR_ERR(head); + + generic_fillattr(inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; + + sysctl_head_finish(head); + return 0; +} + static const struct file_operations proc_sys_file_operations = { .read = proc_sys_read, .write = proc_sys_write, +}; + +static const struct file_operations proc_sys_dir_file_operations = { .readdir = proc_sys_readdir, }; static const struct inode_operations proc_sys_inode_operations = { + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static const struct inode_operations proc_sys_dir_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, }; static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct ctl_table_header *head; - struct ctl_table *table; - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - proc_sys_refresh_inode(dentry->d_inode, table); - sysctl_head_finish(head); - return !!table; + return !PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_delete(struct dentry *dentry) +{ + return !!PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, + struct qstr *name) +{ + struct dentry *dentry = container_of(qstr, struct dentry, d_name); + if (qstr->len != name->len) + return 1; + if (memcmp(qstr->name, name->name, name->len)) + return 1; + return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); } static struct dentry_operations proc_sys_dentry_operations = { .d_revalidate = proc_sys_revalidate, + .d_delete = proc_sys_delete, + .d_compare = proc_sys_compare, }; static struct proc_dir_entry *proc_sys_root; @@ -443,8 +400,8 @@ static struct proc_dir_entry *proc_sys_root; int proc_sys_init(void) { proc_sys_root = proc_mkdir("sys", NULL); - proc_sys_root->proc_iops = &proc_sys_inode_operations; - proc_sys_root->proc_fops = &proc_sys_file_operations; + proc_sys_root->proc_iops = &proc_sys_dir_operations; + proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return 0; } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index f560d1705af..fb61850d1cf 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -282,11 +282,16 @@ union proc_op { struct task_struct *task); }; +struct ctl_table_header; +struct ctl_table; + struct proc_inode { struct pid *pid; int fd; union proc_op op; struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; struct inode vfs_inode; }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3f6599aeb0d..d0437f36921 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -961,6 +961,7 @@ struct ctl_table_header; extern void sysctl_head_get(struct ctl_table_header *); extern void sysctl_head_put(struct ctl_table_header *); +extern int sysctl_is_seen(struct ctl_table_header *); extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *); extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, diff --git a/kernel/sysctl.c 
b/kernel/sysctl.c index c9a0af88703..ff5abcca5dd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1932,6 +1932,21 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_unlock(&sysctl_lock); } +int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_set *parent, int (*is_seen)(struct ctl_table_set *)) -- cgit v1.2.3-70-g09d2 From 1bd5191d9f5d1928c4efdf604c4164b04bb88dbe Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 21 May 2008 19:15:03 +0200 Subject: [patch 05/14] hpfs: dont call permission() hpfs_unlink() calls permission() prior to truncating the file. HPFS doesn't define a .permission method, so replace with explicit call to generic_permission(). This is equivalent, except that devcgroup_inode_permission() and security_inode_permission() are not called. The truncation is just an implementation detail of the unlink, so these security checks are unnecessary. I suspect that even calling generic_permission() is unnecessary, since we shouldn't mind if the file isn't writable. But I leave that to the maintainer to decide. Signed-off-by: Miklos Szeredi CC: Mikulas Patocka --- fs/hpfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d256559b410..d9c59a77544 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -415,7 +415,7 @@ again: d_drop(dentry); spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) > 1 || - permission(inode, MAY_WRITE, NULL) || + generic_permission(inode, MAY_WRITE, NULL) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { spin_unlock(&dentry->d_lock); -- cgit v1.2.3-70-g09d2 From e6305c43eda10ebfd2ad9e35d6e172ccc7bb3695 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 21:03:57 -0400 Subject: [PATCH] sanitize ->permission() prototype * kill nameidata * argument; map the 3 bits in ->flags anybody cares about to new MAY_... ones and pass with the mask. * kill redundant gfs2_iop_permission() * sanitize ecryptfs_permission() * fix remaining places where ->permission() instances might barf on new MAY_... found in mask. 
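Condensed, the method goes from int (*permission)(struct inode *, int, struct nameidata *) to int (*permission)(struct inode *, int), with the caller in fs/namei.c folding the relevant nameidata flags into the mask roughly like this (see the fs/namei.c hunk below):

        int extra = 0;
        if (nd) {
                if (nd->flags & LOOKUP_ACCESS)
                        extra |= MAY_ACCESS;
                if (nd->flags & LOOKUP_CHDIR)
                        extra |= MAY_CHDIR;
                if (nd->flags & LOOKUP_OPEN)
                        extra |= MAY_OPEN;
        }
        retval = inode->i_op->permission(inode, mask | extra);
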
The obvious next target in that direction is permission(9) folded fix for nfs_permission() breakage from Miklos Szeredi Signed-off-by: Al Viro --- fs/afs/internal.h | 4 +--- fs/afs/security.c | 2 +- fs/bad_inode.c | 3 +-- fs/cifs/cifsfs.c | 2 +- fs/coda/dir.c | 4 +++- fs/coda/pioctl.c | 6 ++---- fs/ecryptfs/inode.c | 17 ++--------------- fs/ext2/acl.c | 2 +- fs/ext2/acl.h | 2 +- fs/ext3/acl.c | 2 +- fs/ext3/acl.h | 2 +- fs/ext4/acl.c | 2 +- fs/ext4/acl.h | 2 +- fs/fuse/dir.c | 6 +++--- fs/gfs2/ops_inode.c | 12 +++--------- fs/hfs/inode.c | 3 +-- fs/hfsplus/inode.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jffs2/acl.h | 2 +- fs/jfs/acl.c | 2 +- fs/jfs/jfs_acl.h | 2 +- fs/namei.c | 23 +++++++++++++++++------ fs/nfs/dir.c | 11 +++++------ fs/ocfs2/file.c | 2 +- fs/ocfs2/file.h | 3 +-- fs/proc/base.c | 3 +-- fs/proc/proc_sysctl.c | 2 +- fs/reiserfs/xattr.c | 2 +- fs/smbfs/file.c | 4 ++-- fs/xfs/linux-2.6/xfs_iops.c | 3 +-- include/linux/coda_linux.h | 2 +- include/linux/fs.h | 5 ++++- include/linux/nfs_fs.h | 2 +- include/linux/reiserfs_xattr.h | 2 +- include/linux/shmem_fs.h | 2 +- kernel/sysctl.c | 10 +++++----- mm/shmem_acl.c | 2 +- 38 files changed, 74 insertions(+), 87 deletions(-) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7102824ba84..3cb6920ff30 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -469,8 +469,6 @@ extern bool afs_cm_incoming_call(struct afs_call *); extern const struct inode_operations afs_dir_inode_operations; extern const struct file_operations afs_dir_file_operations; -extern int afs_permission(struct inode *, int, struct nameidata *); - /* * file.c */ @@ -605,7 +603,7 @@ extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, long); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); -extern int afs_permission(struct inode *, int, struct nameidata *); +extern int afs_permission(struct inode *, int); /* * server.c diff --git a/fs/afs/security.c b/fs/afs/security.c index 3bcbeceba1b..3ef50437003 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -284,7 +284,7 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask, struct nameidata *nd) +int afs_permission(struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t uninitialized_var(access); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index f1c2ea8342f..5f1538c03b1 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -243,8 +243,7 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int bad_inode_permission(struct inode *inode, int mask) { return -EIO; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index fe5f6809cba..1ec7076f7b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -267,7 +267,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int cifs_permission(struct inode *inode, int mask, struct nameidata *nd) +static int cifs_permission(struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 3d2580e00a3..c5916228243 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -137,9 +137,11 @@ exit: } -int coda_permission(struct 
inode *inode, int mask, struct nameidata *nd) +int coda_permission(struct inode *inode, int mask) { int error = 0; + + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (!mask) return 0; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index c21a1f552a6..c38a98974fb 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,8 +24,7 @@ #include /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask, - struct nameidata *nd); +static int coda_ioctl_permission(struct inode *inode, int mask); static int coda_pioctl(struct inode * inode, struct file * filp, unsigned int cmd, unsigned long user_data); @@ -42,8 +41,7 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int coda_ioctl_permission(struct inode *inode, int mask) { return 0; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d755455e3bf..32f4228efcd 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -830,22 +830,9 @@ out: } static int -ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd) +ecryptfs_permission(struct inode *inode, int mask) { - int rc; - - if (nd) { - struct vfsmount *vfsmnt_save = nd->path.mnt; - struct dentry *dentry_save = nd->path.dentry; - - nd->path.mnt = ecryptfs_dentry_to_lower_mnt(nd->path.dentry); - nd->path.dentry = ecryptfs_dentry_to_lower(nd->path.dentry); - rc = permission(ecryptfs_inode_to_lower(inode), mask, nd); - nd->path.mnt = vfsmnt_save; - nd->path.dentry = dentry_save; - } else - rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL); - return rc; + return permission(ecryptfs_inode_to_lower(inode), mask, NULL); } /** diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index e58669e1b87..ae8c4f850b2 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -294,7 +294,7 @@ ext2_check_acl(struct inode *inode, int mask) } int -ext2_permission(struct inode *inode, int mask, struct nameidata *nd) +ext2_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, ext2_check_acl); } diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 0bde85bafe3..b42cf578554 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -58,7 +58,7 @@ static inline int ext2_acl_count(size_t size) #define EXT2_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext2_permission (struct inode *, int, struct nameidata *); +extern int ext2_permission (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index a754d184817..b60bb241880 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -299,7 +299,7 @@ ext3_check_acl(struct inode *inode, int mask) } int -ext3_permission(struct inode *inode, int mask, struct nameidata *nd) +ext3_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, ext3_check_acl); } diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 0d1e6279cbf..42da16b8cac 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -58,7 +58,7 @@ static inline int ext3_acl_count(size_t size) #define EXT3_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext3_permission (struct inode *, int, struct nameidata *); +extern int ext3_permission (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 3c8dab880d9..c7d04e16544 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -299,7 
+299,7 @@ ext4_check_acl(struct inode *inode, int mask) } int -ext4_permission(struct inode *inode, int mask, struct nameidata *nd) +ext4_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, ext4_check_acl); } diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 26a5c1abf14..cd2b855a07d 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -58,7 +58,7 @@ static inline int ext4_acl_count(size_t size) #define EXT4_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext4_permission (struct inode *, int, struct nameidata *); +extern int ext4_permission (struct inode *, int); extern int ext4_acl_chmod (struct inode *); extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 51d0035ff07..48a7934cb95 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -898,7 +898,7 @@ static int fuse_access(struct inode *inode, int mask) return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); - inarg.mask = mask; + inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); req->in.h.opcode = FUSE_ACCESS; req->in.h.nodeid = get_node_id(inode); req->in.numargs = 1; @@ -927,7 +927,7 @@ static int fuse_access(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) +static int fuse_permission(struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; @@ -962,7 +962,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ - } else if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR))) { + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1e252dfc529..4e982532f08 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -915,12 +915,6 @@ int gfs2_permission(struct inode *inode, int mask) return error; } -static int gfs2_iop_permission(struct inode *inode, int mask, - struct nameidata *nd) -{ - return gfs2_permission(inode, mask); -} - static int setattr_size(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1150,7 +1144,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) } const struct inode_operations gfs2_file_iops = { - .permission = gfs2_iop_permission, + .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, .setxattr = gfs2_setxattr, @@ -1169,7 +1163,7 @@ const struct inode_operations gfs2_dir_iops = { .rmdir = gfs2_rmdir, .mknod = gfs2_mknod, .rename = gfs2_rename, - .permission = gfs2_iop_permission, + .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, .setxattr = gfs2_setxattr, @@ -1181,7 +1175,7 @@ const struct inode_operations gfs2_dir_iops = { const struct inode_operations gfs2_symlink_iops = { .readlink = gfs2_readlink, .follow_link = gfs2_follow_link, - .permission = gfs2_iop_permission, + .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, .setxattr = gfs2_setxattr, diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index dc4ec640e87..aa73f3fd5dd 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -511,8 +511,7 @@ void hfs_clear_inode(struct inode *inode) } } -static int 
hfs_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int hfs_permission(struct inode *inode, int mask) { if (S_ISREG(inode->i_mode) && mask & MAY_EXEC) return 0; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index cc3b5e24339..d4014e3044d 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -238,7 +238,7 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); } -static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd) +static int hfsplus_permission(struct inode *inode, int mask) { /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup, * open_exec has the same test, so it's still not executable, if a x bit diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5222345ddcc..d6ecabf4d23 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -822,7 +822,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd) +int hostfs_permission(struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 4c80404a9ab..d98713777a1 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -314,7 +314,7 @@ static int jffs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int jffs2_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, jffs2_check_acl); } diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 0bb7f003fd8..8ca058aed38 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #define JFFS2_ACL_NOT_CACHED ((void *)-1) -extern int jffs2_permission(struct inode *, int, struct nameidata *); +extern int jffs2_permission(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 4d84bdc8829..d3e5c33665d 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -140,7 +140,7 @@ static int jfs_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jfs_permission(struct inode *inode, int mask, struct nameidata *nd) +int jfs_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, jfs_check_acl); } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 455fa429204..88475f10a38 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_permission(struct inode *, int, struct nameidata *); +int jfs_permission(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/namei.c b/fs/namei.c index 3b26a240ade..46af98ed136 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -185,6 +185,8 @@ int generic_permission(struct inode *inode, int mask, { umode_t mode = inode->i_mode; + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + if (current->fsuid == inode->i_uid) mode >>= 6; else { @@ -203,7 +205,7 @@ int generic_permission(struct inode *inode, int mask, /* * If the DACs are ok we don't need any capability check. 
*/ - if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) + if ((mask & ~mode) == 0) return 0; check_capabilities: @@ -228,7 +230,7 @@ int generic_permission(struct inode *inode, int mask, int permission(struct inode *inode, int mask, struct nameidata *nd) { - int retval, submask; + int retval; struct vfsmount *mnt = NULL; if (nd) @@ -261,9 +263,17 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) } /* Ordinary permission routines do not understand MAY_APPEND. */ - submask = mask & ~MAY_APPEND; if (inode->i_op && inode->i_op->permission) { - retval = inode->i_op->permission(inode, submask, nd); + int extra = 0; + if (nd) { + if (nd->flags & LOOKUP_ACCESS) + extra |= MAY_ACCESS; + if (nd->flags & LOOKUP_CHDIR) + extra |= MAY_CHDIR; + if (nd->flags & LOOKUP_OPEN) + extra |= MAY_OPEN; + } + retval = inode->i_op->permission(inode, mask | extra); if (!retval) { /* * Exec permission on a regular file is denied if none @@ -277,7 +287,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) return -EACCES; } } else { - retval = generic_permission(inode, submask, NULL); + retval = generic_permission(inode, mask, NULL); } if (retval) return retval; @@ -286,7 +296,8 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) if (retval) return retval; - return security_inode_permission(inode, mask, nd); + return security_inode_permission(inode, + mask & (MAY_READ|MAY_WRITE|MAY_EXEC), nd); } /** diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 28a238dab23..74f92b717f7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1884,7 +1884,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) return status; nfs_access_add_cache(inode, &cache); out: - if ((cache.mask & mask) == mask) + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) return 0; return -EACCES; } @@ -1907,17 +1907,17 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } -int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) +int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; int res = 0; nfs_inc_stats(inode, NFSIOS_VFSACCESS); - if (mask == 0) + if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) goto out; /* Is this sys_access() ? */ - if (nd != NULL && (nd->flags & LOOKUP_ACCESS)) + if (mask & MAY_ACCESS) goto force_lookup; switch (inode->i_mode & S_IFMT) { @@ -1926,8 +1926,7 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) case S_IFREG: /* NFSv4 has atomic_open... 
*/ if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) - && nd != NULL - && (nd->flags & LOOKUP_OPEN)) + && (mask & MAY_OPEN)) goto out; break; case S_IFDIR: diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e8514e8b6ce..be2dd95d3a1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1176,7 +1176,7 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +int ocfs2_permission(struct inode *inode, int mask) { int ret; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 048ddcaf5c8..1e27b4d017e 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -62,8 +62,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int ocfs2_permission(struct inode *inode, int mask, - struct nameidata *nd); +int ocfs2_permission(struct inode *inode, int mask); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); diff --git a/fs/proc/base.c b/fs/proc/base.c index 81bce6791bf..d82d800389f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1859,8 +1859,7 @@ static const struct file_operations proc_fd_operations = { * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -static int proc_fd_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int proc_fd_permission(struct inode *inode, int mask) { int rv; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fa1ec2433e4..f9a8b892718 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -292,7 +292,7 @@ out: return ret; } -static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *nd) +static int proc_sys_permission(struct inode *inode, int mask) { /* * sysctl entries that are not writeable, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index d7c4935c103..bb3cb5b7cdb 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1250,7 +1250,7 @@ static int reiserfs_check_acl(struct inode *inode, int mask) return error; } -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) +int reiserfs_permission(struct inode *inode, int mask) { /* * We don't do permission checks on the internal objects. diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 2294783320c..e4f8d51a555 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -408,7 +408,7 @@ smb_file_release(struct inode *inode, struct file * file) * privileges, so we need our own check for this. 
*/ static int -smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) +smb_file_permission(struct inode *inode, int mask) { int mode = inode->i_mode; int error = 0; @@ -417,7 +417,7 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) /* Look at user permissions */ mode >>= 6; - if ((mode & 7 & mask) != mask) + if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) error = -EACCES; return error; } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 2bf287ef548..5fc61c824bb 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -589,8 +589,7 @@ xfs_check_acl( STATIC int xfs_vn_permission( struct inode *inode, - int mask, - struct nameidata *nd) + int mask) { return generic_permission(inode, mask, xfs_check_acl); } diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index 31b75311e2c..dcc228aa335 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -37,7 +37,7 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask, struct nameidata *nd); +int coda_permission(struct inode *inode, int mask); int coda_revalidate_inode(struct dentry *); int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); int coda_setattr(struct dentry *, struct iattr *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7721a2ac9c0..6c923c9b79b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -60,6 +60,9 @@ extern int dir_notify_enable; #define MAY_WRITE 2 #define MAY_READ 4 #define MAY_APPEND 8 +#define MAY_ACCESS 16 +#define MAY_CHDIR 32 +#define MAY_OPEN 64 #define FMODE_READ 1 #define FMODE_WRITE 2 @@ -1272,7 +1275,7 @@ struct inode_operations { void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, struct nameidata *); + int (*permission) (struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 29d26191873..f08f9ca602a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -332,7 +332,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int nfs_permission(struct inode *, int, struct nameidata *); +extern int nfs_permission(struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 66a96814d61..af135ae895d 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -55,7 +55,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); int 
reiserfs_xattr_init(struct super_block *sb, int mount_flags); -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd); +int reiserfs_permission(struct inode *inode, int mask); int reiserfs_xattr_del(struct inode *, const char *); int reiserfs_xattr_get(const struct inode *, const char *, void *, size_t); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f2d12d5a21b..fd83f2584b1 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -43,7 +43,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) } #ifdef CONFIG_TMPFS_POSIX_ACL -int shmem_permission(struct inode *, int, struct nameidata *); +int shmem_permission(struct inode *, int); int shmem_acl_init(struct inode *, struct inode *); void shmem_acl_destroy_inode(struct inode *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff5abcca5dd..911d846f050 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1516,9 +1516,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root, int op = 0, rc; if (oldval) - op |= 004; + op |= MAY_READ; if (newval) - op |= 002; + op |= MAY_WRITE; if (sysctl_perm(root, table, op)) return -EPERM; @@ -1560,7 +1560,7 @@ repeat: if (n == table->ctl_name) { int error; if (table->child) { - if (sysctl_perm(root, table, 001)) + if (sysctl_perm(root, table, MAY_EXEC)) return -EPERM; name++; nlen--; @@ -1635,7 +1635,7 @@ static int test_perm(int mode, int op) mode >>= 6; else if (in_egroup_p(0)) mode >>= 3; - if ((mode & op & 0007) == op) + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } @@ -1645,7 +1645,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) int error; int mode; - error = security_sysctl(table, op); + error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); if (error) return error; diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index f5664c5b9eb..8e5aadd7dcd 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask) * shmem_permission - permission() inode operation */ int -shmem_permission(struct inode *inode, int mask, struct nameidata *nd) +shmem_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, shmem_check_acl); } -- cgit v1.2.3-70-g09d2 From c82e42da8a6b2f3a85dc4d4278cb8238702f8f64 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:12 +0200 Subject: [patch 1/5] vfs: truncate: dont check immutable twice vfs_permission(MAY_WRITE) already checked for the inode being immutable, so no need to repeat it. Signed-off-by: Miklos Szeredi Acked-by: Christoph Hellwig --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index bb98d2fe809..b2e4c93aed0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -256,7 +256,7 @@ static long do_sys_truncate(const char __user * path, loff_t length) goto mnt_drop_write_and_out; error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); -- cgit v1.2.3-70-g09d2 From 2f1936b87783a3a56c9441b27b9ba7a747f11e8e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:14 +0200 Subject: [patch 3/5] vfs: change remove_suid() to file_remove_suid() All calls to remove_suid() are made with a file pointer, because (similarly to file_update_time) it is called when the file is written. Clean up callers by passing in a file instead of a dentry. 
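For illustration, a minimal sketch of the call-site pattern after this change; the helper name example_prepare_write() is hypothetical and not part of the patch below, while file_remove_suid() and file_update_time() are the real interfaces it demonstrates:

#include <linux/fs.h>

/* Illustrative only: prepare a file for writing with the new helper. */
static int example_prepare_write(struct file *file)
{
        int err;

        /* was: err = remove_suid(file->f_path.dentry); */
        err = file_remove_suid(file);   /* strip setuid/setgid bits if needed */
        if (err)
                return err;

        file_update_time(file);         /* same "takes a struct file" pattern */
        return 0;
}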
Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- fs/ntfs/file.c | 2 +- fs/splice.c | 4 ++-- fs/xfs/linux-2.6/xfs_lrw.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 7 ++++--- mm/filemap_xip.c | 2 +- 7 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 67ff2c6a8f6..2bada6bbc31 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -893,7 +893,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3c5550cd11d..d020866d423 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2118,7 +2118,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, goto out; if (!count) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; file_update_time(file); diff --git a/fs/splice.c b/fs/splice.c index 47dc1a445d1..b30311ba8af 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -772,7 +772,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; int err; - err = remove_suid(out->f_path.dentry); + err = file_remove_suid(out); if (unlikely(err)) return err; @@ -830,7 +830,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; inode_double_lock(inode, pipe->inode); - ret = remove_suid(out->f_path.dentry); + ret = file_remove_suid(out); if (likely(!ret)) ret = __splice_from_pipe(pipe, &sd, pipe_to_file); inode_double_unlock(inode, pipe->inode); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 5e3b57516ec..82333b3e118 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -711,7 +711,7 @@ start: !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (likely(!error)) - error = -remove_suid(file->f_path.dentry); + error = -file_remove_suid(file); if (unlikely(error)) { goto out_unlock_internal; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c923c9b79b..1a3546e69f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1834,7 +1834,7 @@ extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern int should_remove_suid(struct dentry *); -extern int remove_suid(struct dentry *); +extern int file_remove_suid(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); extern void remove_inode_hash(struct inode *); diff --git a/mm/filemap.c b/mm/filemap.c index 2ed8b0389c5..5de7633e1db 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1758,8 +1758,9 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs); } -int remove_suid(struct dentry *dentry) +int file_remove_suid(struct file *file) { + struct dentry *dentry = file->f_path.dentry; int killsuid = should_remove_suid(dentry); int killpriv = security_inode_need_killpriv(dentry); int error = 0; @@ -1773,7 +1774,7 @@ int remove_suid(struct dentry *dentry) return error; } -EXPORT_SYMBOL(remove_suid); +EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -2529,7 +2530,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) 
goto out; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 3e744abcce9..98a3f31ccd6 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -380,7 +380,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (count == 0) goto out_backing; - ret = remove_suid(filp->f_path.dentry); + ret = file_remove_suid(filp); if (ret) goto out_backing; -- cgit v1.2.3-70-g09d2 From 7e79eedb3b22200cc8b774baea3a7bf28d766101 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 24 Jun 2008 16:50:15 +0200 Subject: [patch 4/5] vfs: reuse local variable in vfs_link() Why not reuse "inode" which is assigned as struct inode *inode = old_dentry->d_inode; in the beginning of vfs_link() ? Signed-off-by: Tetsuo Handa Signed-off-by: Miklos Szeredi --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 46af98ed136..3b67be7631d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2524,19 +2524,19 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return -EPERM; if (!dir->i_op || !dir->i_op->link) return -EPERM; - if (S_ISDIR(old_dentry->d_inode->i_mode)) + if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; - mutex_lock(&old_dentry->d_inode->i_mutex); + mutex_lock(&inode->i_mutex); DQUOT_INIT(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); - mutex_unlock(&old_dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); if (!error) - fsnotify_link(dir, old_dentry->d_inode, new_dentry); + fsnotify_link(dir, inode, new_dentry); return error; } -- cgit v1.2.3-70-g09d2 From db2e747b14991a4c6a5c98b0e5f552a193237c03 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:16 +0200 Subject: [patch 5/5] vfs: remove mode parameter from vfs_symlink() Remove the unused mode parameter from vfs_symlink and callers. Thanks to Tetsuo Handa for noticing. 
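As a hedged sketch of the resulting interface, a hypothetical stacking-filesystem caller now passes only three arguments; example_lower_symlink() is an invented name, vfs_symlink() is the real helper whose new signature appears in the diff below:

#include <linux/fs.h>
#include <linux/dcache.h>

/* Illustrative only: create a symlink on a lower filesystem. */
static int example_lower_symlink(struct inode *lower_dir,
                                 struct dentry *lower_dentry,
                                 const char *target)
{
        /* was: vfs_symlink(lower_dir, lower_dentry, target, S_IALLUGO); */
        return vfs_symlink(lower_dir, lower_dentry, target);
}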
CC: Tetsuo Handa Signed-off-by: Miklos Szeredi --- fs/ecryptfs/inode.c | 4 +--- fs/namei.c | 4 ++-- fs/nfsd/vfs.c | 10 ++-------- include/linux/fs.h | 2 +- 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 32f4228efcd..f25caf2b088 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -465,7 +465,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, int rc; struct dentry *lower_dentry; struct dentry *lower_dir_dentry; - umode_t mode; char *encoded_symname; int encoded_symlen; struct ecryptfs_crypt_stat *crypt_stat = NULL; @@ -473,7 +472,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - mode = S_IALLUGO; encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, strlen(symname), &encoded_symname); @@ -482,7 +480,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, goto out_lock; } rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, - encoded_symname, mode); + encoded_symname); kfree(encoded_symname); if (rc || !lower_dentry->d_inode) goto out_lock; diff --git a/fs/namei.c b/fs/namei.c index 3b67be7631d..ae0e56fdb74 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2434,7 +2434,7 @@ asmlinkage long sys_unlink(const char __user *pathname) return do_unlinkat(AT_FDCWD, pathname); } -int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { int error = may_create(dir, dentry, NULL); @@ -2483,7 +2483,7 @@ asmlinkage long sys_symlinkat(const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; - error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO); + error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0f4481e0502..ad1ad59e374 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1516,7 +1516,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dnew; __be32 err, cerr; int host_err; - umode_t mode; err = nfserr_noent; if (!flen || !plen) @@ -1535,11 +1534,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - mode = S_IALLUGO; - /* Only the MODE ATTRibute is even vaguely meaningful */ - if (iap && (iap->ia_valid & ATTR_MODE)) - mode = iap->ia_mode & S_IALLUGO; - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); if (host_err) goto out_nfserr; @@ -1551,11 +1545,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, else { strncpy(path_alloced, path, plen); path_alloced[plen] = 0; - host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); + host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced); kfree(path_alloced); } } else - host_err = vfs_symlink(dentry->d_inode, dnew, path, mode); + host_err = vfs_symlink(dentry->d_inode, dnew, path); if (!host_err) { if (EX_ISSYNC(fhp->fh_export)) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a3546e69f9..25998e803fc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1139,7 +1139,7 @@ extern int vfs_permission(struct nameidata *, int); extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); extern int vfs_mkdir(struct inode *, struct dentry *, int); extern int vfs_mknod(struct 
inode *, struct dentry *, int, dev_t); -extern int vfs_symlink(struct inode *, struct dentry *, const char *, int); +extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *); -- cgit v1.2.3-70-g09d2 From 8bb79224b87aab92071e94d46e70bd160d89bf34 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Jul 2008 09:51:03 -0400 Subject: [PATCH] permission checks for chdir need special treatment only on the last step ... so we ought to pass MAY_CHDIR to vfs_permission() instead of having it triggered on every step of preceding pathname resolution. LOOKUP_CHDIR is killed by that. Signed-off-by: Al Viro --- fs/namei.c | 2 -- fs/open.c | 5 ++--- include/linux/namei.h | 1 - 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index ae0e56fdb74..6c76e1ee9c4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -268,8 +268,6 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) if (nd) { if (nd->flags & LOOKUP_ACCESS) extra |= MAY_ACCESS; - if (nd->flags & LOOKUP_CHDIR) - extra |= MAY_CHDIR; if (nd->flags & LOOKUP_OPEN) extra |= MAY_OPEN; } diff --git a/fs/open.c b/fs/open.c index b2e4c93aed0..8e02d42bfe4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -501,12 +501,11 @@ asmlinkage long sys_chdir(const char __user * filename) struct nameidata nd; int error; - error = __user_walk(filename, - LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd); + error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); if (error) goto out; - error = vfs_permission(&nd, MAY_EXEC); + error = vfs_permission(&nd, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; diff --git a/include/linux/namei.h b/include/linux/namei.h index 24d88e98a62..3cf62d26d49 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -55,7 +55,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_OPEN (0x0100) #define LOOKUP_CREATE (0x0200) #define LOOKUP_ACCESS (0x0400) -#define LOOKUP_CHDIR (0x0800) extern int __user_walk(const char __user *, unsigned, struct nameidata *); extern int __user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *); -- cgit v1.2.3-70-g09d2 From 7f2da1e7d0330395e5e9e350b879b98a1ea495df Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 May 2008 20:44:54 -0400 Subject: [PATCH] kill altroot long overdue... 
Signed-off-by: Al Viro --- fs/namei.c | 89 +------------------------------------------ fs/namespace.c | 8 +--- fs/open.c | 3 +- include/asm-alpha/namei.h | 17 --------- include/asm-arm/namei.h | 25 ------------ include/asm-avr32/namei.h | 7 ---- include/asm-blackfin/namei.h | 19 --------- include/asm-cris/namei.h | 17 --------- include/asm-frv/namei.h | 18 --------- include/asm-h8300/namei.h | 17 --------- include/asm-ia64/namei.h | 25 ------------ include/asm-m32r/namei.h | 17 --------- include/asm-m68k/namei.h | 17 --------- include/asm-m68knommu/namei.h | 1 - include/asm-mips/namei.h | 11 ------ include/asm-mn10300/namei.h | 22 ----------- include/asm-parisc/namei.h | 17 --------- include/asm-powerpc/namei.h | 20 ---------- include/asm-s390/namei.h | 21 ---------- include/asm-sh/namei.h | 17 --------- include/asm-sparc/namei.h | 8 ---- include/asm-sparc64/namei.h | 1 - include/asm-um/namei.h | 6 --- include/asm-v850/namei.h | 17 --------- include/asm-x86/namei.h | 11 ------ include/asm-xtensa/namei.h | 26 ------------- include/linux/fs_struct.h | 3 +- include/linux/namei.h | 1 - kernel/exec_domain.c | 1 - kernel/exit.c | 2 - kernel/fork.c | 7 ---- 31 files changed, 5 insertions(+), 466 deletions(-) delete mode 100644 include/asm-alpha/namei.h delete mode 100644 include/asm-arm/namei.h delete mode 100644 include/asm-avr32/namei.h delete mode 100644 include/asm-blackfin/namei.h delete mode 100644 include/asm-cris/namei.h delete mode 100644 include/asm-frv/namei.h delete mode 100644 include/asm-h8300/namei.h delete mode 100644 include/asm-ia64/namei.h delete mode 100644 include/asm-m32r/namei.h delete mode 100644 include/asm-m68k/namei.h delete mode 100644 include/asm-m68knommu/namei.h delete mode 100644 include/asm-mips/namei.h delete mode 100644 include/asm-mn10300/namei.h delete mode 100644 include/asm-parisc/namei.h delete mode 100644 include/asm-powerpc/namei.h delete mode 100644 include/asm-s390/namei.h delete mode 100644 include/asm-sh/namei.h delete mode 100644 include/asm-sparc/namei.h delete mode 100644 include/asm-sparc64/namei.h delete mode 100644 include/asm-um/namei.h delete mode 100644 include/asm-v850/namei.h delete mode 100644 include/asm-x86/namei.h delete mode 100644 include/asm-xtensa/namei.h (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 6c76e1ee9c4..095818089ac 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) @@ -562,27 +561,16 @@ out_unlock: return result; } -static int __emul_lookup_dentry(const char *, struct nameidata *); - /* SMP-safe */ -static __always_inline int +static __always_inline void walk_init_root(const char *name, struct nameidata *nd) { struct fs_struct *fs = current->fs; read_lock(&fs->lock); - if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { - nd->path = fs->altroot; - path_get(&fs->altroot); - read_unlock(&fs->lock); - if (__emul_lookup_dentry(name,nd)) - return 0; - read_lock(&fs->lock); - } nd->path = fs->root; path_get(&fs->root); read_unlock(&fs->lock); - return 1; } /* @@ -623,12 +611,9 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l if (*link == '/') { path_put(&nd->path); - if (!walk_init_root(link, nd)) - /* weird __emul_prefix() stuff did it */ - goto out; + walk_init_root(link, nd); } res = link_path_walk(link, nd); -out: if (nd->depth || res || nd->last_type!=LAST_NORM) return res; /* @@ -1077,67 +1062,6 @@ static int path_walk(const char *name, struct nameidata *nd) 
return link_path_walk(name, nd); } -/* - * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if - * everything is done. Returns 0 and drops input nd, if lookup failed; - */ -static int __emul_lookup_dentry(const char *name, struct nameidata *nd) -{ - if (path_walk(name, nd)) - return 0; /* something went wrong... */ - - if (!nd->path.dentry->d_inode || - S_ISDIR(nd->path.dentry->d_inode->i_mode)) { - struct path old_path = nd->path; - struct qstr last = nd->last; - int last_type = nd->last_type; - struct fs_struct *fs = current->fs; - - /* - * NAME was not found in alternate root or it's a directory. - * Try to find it in the normal root: - */ - nd->last_type = LAST_ROOT; - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); - if (path_walk(name, nd) == 0) { - if (nd->path.dentry->d_inode) { - path_put(&old_path); - return 1; - } - path_put(&nd->path); - } - nd->path = old_path; - nd->last = last; - nd->last_type = last_type; - } - return 1; -} - -void set_fs_altroot(void) -{ - char *emul = __emul_prefix(); - struct nameidata nd; - struct path path = {}, old_path; - int err; - struct fs_struct *fs = current->fs; - - if (!emul) - goto set_it; - err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd); - if (!err) - path = nd.path; -set_it: - write_lock(&fs->lock); - old_path = fs->altroot; - fs->altroot = path; - write_unlock(&fs->lock); - if (old_path.dentry) - path_put(&old_path); -} - /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) @@ -1153,14 +1077,6 @@ static int do_path_lookup(int dfd, const char *name, if (*name=='/') { read_lock(&fs->lock); - if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { - nd->path = fs->altroot; - path_get(&fs->altroot); - read_unlock(&fs->lock); - if (__emul_lookup_dentry(name,nd)) - goto out; /* found in altroot */ - read_lock(&fs->lock); - } nd->path = fs->root; path_get(&fs->root); read_unlock(&fs->lock); @@ -1194,7 +1110,6 @@ static int do_path_lookup(int dfd, const char *name, } retval = path_walk(name, nd); -out: if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); diff --git a/fs/namespace.c b/fs/namespace.c index f30b11e2240..c4fcf48acef 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1972,7 +1972,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, struct fs_struct *fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct vfsmount *p, *q; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); @@ -2015,10 +2015,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, pwdmnt = p; fs->pwd.mnt = mntget(q); } - if (p == fs->altroot.mnt) { - altrootmnt = p; - fs->altroot.mnt = mntget(q); - } } p = next_mnt(p, mnt_ns->root); q = next_mnt(q, new_ns->root); @@ -2029,8 +2025,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); - if (altrootmnt) - mntput(altrootmnt); return new_ns; } diff --git a/fs/open.c b/fs/open.c index 8e02d42bfe4..d3a2a00f52d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -548,7 +548,7 @@ asmlinkage long sys_chroot(const char __user * filename) struct nameidata nd; int error; - error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | 
LOOKUP_NOALT, &nd); + error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd); if (error) goto out; @@ -561,7 +561,6 @@ asmlinkage long sys_chroot(const char __user * filename) goto dput_and_out; set_fs_root(current->fs, &nd.path); - set_fs_altroot(); error = 0; dput_and_out: path_put(&nd.path); diff --git a/include/asm-alpha/namei.h b/include/asm-alpha/namei.h deleted file mode 100644 index 5cc9bb39499..00000000000 --- a/include/asm-alpha/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.1 1996/12/13 14:48:21 jj Exp $ - * linux/include/asm-alpha/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __ALPHA_NAMEI_H -#define __ALPHA_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __ALPHA_NAMEI_H */ diff --git a/include/asm-arm/namei.h b/include/asm-arm/namei.h deleted file mode 100644 index a402d3b9d0f..00000000000 --- a/include/asm-arm/namei.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * linux/include/asm-arm/namei.h - * - * Routines to handle famous /usr/gnemul - * Derived from the Sparc version of this file - * - * Included from linux/fs/namei.c - */ - -#ifndef __ASMARM_NAMEI_H -#define __ASMARM_NAMEI_H - -#define ARM_BSD_EMUL "usr/gnemul/bsd/" - -static inline char *__emul_prefix(void) -{ - switch (current->personality) { - case PER_BSD: - return ARM_BSD_EMUL; - default: - return NULL; - } -} - -#endif /* __ASMARM_NAMEI_H */ diff --git a/include/asm-avr32/namei.h b/include/asm-avr32/namei.h deleted file mode 100644 index f0a26de06ca..00000000000 --- a/include/asm-avr32/namei.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_AVR32_NAMEI_H -#define __ASM_AVR32_NAMEI_H - -/* This dummy routine may be changed to something useful */ -#define __emul_prefix() NULL - -#endif /* __ASM_AVR32_NAMEI_H */ diff --git a/include/asm-blackfin/namei.h b/include/asm-blackfin/namei.h deleted file mode 100644 index 8b89a2d65cb..00000000000 --- a/include/asm-blackfin/namei.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * linux/include/asm/namei.h - * - * Included from linux/fs/namei.c - * - * Changes made by Lineo Inc. May 2001 - */ - -#ifndef __BFIN_NAMEI_H -#define __BFIN_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif diff --git a/include/asm-cris/namei.h b/include/asm-cris/namei.h deleted file mode 100644 index 8a3be7a6d9f..00000000000 --- a/include/asm-cris/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.1 2000/07/10 16:32:31 bjornw Exp $ - * linux/include/asm-cris/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __CRIS_NAMEI_H -#define __CRIS_NAMEI_H - -/* used to find file-system prefixes for doing emulations - * see for example asm-sparc/namei.h - * we don't use it... - */ - -#define __emul_prefix() NULL - -#endif /* __CRIS_NAMEI_H */ diff --git a/include/asm-frv/namei.h b/include/asm-frv/namei.h deleted file mode 100644 index 4ea57171d95..00000000000 --- a/include/asm-frv/namei.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * include/asm-frv/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __ASM_NAMEI_H -#define __ASM_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. 
- */ - -#define __emul_prefix() NULL - -#endif - diff --git a/include/asm-h8300/namei.h b/include/asm-h8300/namei.h deleted file mode 100644 index ab6f196db6e..00000000000 --- a/include/asm-h8300/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * linux/include/asm-h8300/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __H8300_NAMEI_H -#define __H8300_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif diff --git a/include/asm-ia64/namei.h b/include/asm-ia64/namei.h deleted file mode 100644 index 78e76807908..00000000000 --- a/include/asm-ia64/namei.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _ASM_IA64_NAMEI_H -#define _ASM_IA64_NAMEI_H - -/* - * Modified 1998, 1999, 2001 - * David Mosberger-Tang , Hewlett-Packard Co - */ - -#include -#include - -#define EMUL_PREFIX_LINUX_IA32 "/emul/ia32-linux/" - -static inline char * -__emul_prefix (void) -{ - switch (current->personality) { - case PER_LINUX32: - return EMUL_PREFIX_LINUX_IA32; - default: - return NULL; - } -} - -#endif /* _ASM_IA64_NAMEI_H */ diff --git a/include/asm-m32r/namei.h b/include/asm-m32r/namei.h deleted file mode 100644 index 210f8056b80..00000000000 --- a/include/asm-m32r/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _ASM_M32R_NAMEI_H -#define _ASM_M32R_NAMEI_H - -/* - * linux/include/asm-m32r/namei.h - * - * Included from linux/fs/namei.c - */ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_M32R_NAMEI_H */ diff --git a/include/asm-m68k/namei.h b/include/asm-m68k/namei.h deleted file mode 100644 index f33f243b644..00000000000 --- a/include/asm-m68k/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * linux/include/asm-m68k/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __M68K_NAMEI_H -#define __M68K_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif diff --git a/include/asm-m68knommu/namei.h b/include/asm-m68knommu/namei.h deleted file mode 100644 index 31a85d27b93..00000000000 --- a/include/asm-m68knommu/namei.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-mips/namei.h b/include/asm-mips/namei.h deleted file mode 100644 index a6605a75246..00000000000 --- a/include/asm-mips/namei.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _ASM_NAMEI_H -#define _ASM_NAMEI_H - -/* - * This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_NAMEI_H */ diff --git a/include/asm-mn10300/namei.h b/include/asm-mn10300/namei.h deleted file mode 100644 index bd9ce94aeb6..00000000000 --- a/include/asm-mn10300/namei.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Emulation stuff - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _ASM_NAMEI_H -#define _ASM_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. 
- * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_NAMEI_H */ diff --git a/include/asm-parisc/namei.h b/include/asm-parisc/namei.h deleted file mode 100644 index 8d29b3d9fb3..00000000000 --- a/include/asm-parisc/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.1 1996/12/13 14:48:21 jj Exp $ - * linux/include/asm-parisc/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __PARISC_NAMEI_H -#define __PARISC_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __PARISC_NAMEI_H */ diff --git a/include/asm-powerpc/namei.h b/include/asm-powerpc/namei.h deleted file mode 100644 index 657443474a6..00000000000 --- a/include/asm-powerpc/namei.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _ASM_POWERPC_NAMEI_H -#define _ASM_POWERPC_NAMEI_H - -#ifdef __KERNEL__ - -/* - * Adapted from include/asm-alpha/namei.h - * - * Included from fs/namei.c - */ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_NAMEI_H */ diff --git a/include/asm-s390/namei.h b/include/asm-s390/namei.h deleted file mode 100644 index 3e286bdde4b..00000000000 --- a/include/asm-s390/namei.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * include/asm-s390/namei.h - * - * S390 version - * - * Derived from "include/asm-i386/namei.h" - * - * Included from linux/fs/namei.c - */ - -#ifndef __S390_NAMEI_H -#define __S390_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __S390_NAMEI_H */ diff --git a/include/asm-sh/namei.h b/include/asm-sh/namei.h deleted file mode 100644 index 338a5d94714..00000000000 --- a/include/asm-sh/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.3 2000/07/04 06:24:49 gniibe Exp $ - * linux/include/asm-sh/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __ASM_SH_NAMEI_H -#define __ASM_SH_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. 
- */ - -#define __emul_prefix() NULL - -#endif /* __ASM_SH_NAMEI_H */ diff --git a/include/asm-sparc/namei.h b/include/asm-sparc/namei.h deleted file mode 100644 index eff944b8e32..00000000000 --- a/include/asm-sparc/namei.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef ___ASM_SPARC_NAMEI_H -#define ___ASM_SPARC_NAMEI_H -#if defined(__sparc__) && defined(__arch64__) -#include -#else -#include -#endif -#endif diff --git a/include/asm-sparc64/namei.h b/include/asm-sparc64/namei.h deleted file mode 100644 index 1344a910ba2..00000000000 --- a/include/asm-sparc64/namei.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-um/namei.h b/include/asm-um/namei.h deleted file mode 100644 index 002984d5bc8..00000000000 --- a/include/asm-um/namei.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __UM_NAMEI_H -#define __UM_NAMEI_H - -#include "asm/arch/namei.h" - -#endif diff --git a/include/asm-v850/namei.h b/include/asm-v850/namei.h deleted file mode 100644 index ee8339b2384..00000000000 --- a/include/asm-v850/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * linux/include/asm-v850/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __V850_NAMEI_H__ -#define __V850_NAMEI_H__ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __V850_NAMEI_H__ */ diff --git a/include/asm-x86/namei.h b/include/asm-x86/namei.h deleted file mode 100644 index 415ef5d9550..00000000000 --- a/include/asm-x86/namei.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _ASM_X86_NAMEI_H -#define _ASM_X86_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_X86_NAMEI_H */ diff --git a/include/asm-xtensa/namei.h b/include/asm-xtensa/namei.h deleted file mode 100644 index 3fdff039d27..00000000000 --- a/include/asm-xtensa/namei.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * include/asm-xtensa/namei.h - * - * Included from linux/fs/namei.c - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_NAMEI_H -#define _XTENSA_NAMEI_H - -#ifdef __KERNEL__ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. 
- */ - -#define __emul_prefix() NULL - -#endif /* __KERNEL__ */ -#endif /* _XTENSA_NAMEI_H */ diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 282f5421912..9e5a06e78d0 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -7,7 +7,7 @@ struct fs_struct { atomic_t count; rwlock_t lock; int umask; - struct path root, pwd, altroot; + struct path root, pwd; }; #define INIT_FS { \ @@ -19,7 +19,6 @@ struct fs_struct { extern struct kmem_cache *fs_cachep; extern void exit_fs(struct task_struct *); -extern void set_fs_altroot(void); extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); diff --git a/include/linux/namei.h b/include/linux/namei.h index 3cf62d26d49..768773d5785 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -47,7 +47,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_DIRECTORY 2 #define LOOKUP_CONTINUE 4 #define LOOKUP_PARENT 16 -#define LOOKUP_NOALT 32 #define LOOKUP_REVAL 64 /* * Intent data diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c1ef192aa65..0d407e88673 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -168,7 +168,6 @@ __set_personality(u_long personality) current->personality = personality; oep = current_thread_info()->exec_domain; current_thread_info()->exec_domain = ep; - set_fs_altroot(); module_put(oep->module); return 0; diff --git a/kernel/exit.c b/kernel/exit.c index 6cdf60712bd..0caf590548a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -565,8 +565,6 @@ void put_fs_struct(struct fs_struct *fs) if (atomic_dec_and_test(&fs->count)) { path_put(&fs->root); path_put(&fs->pwd); - if (fs->altroot.dentry) - path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } diff --git a/kernel/fork.c b/kernel/fork.c index abb3ed6298f..5e050c1317c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,13 +657,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) path_get(&old->root); fs->pwd = old->pwd; path_get(&old->pwd); - if (old->altroot.dentry) { - fs->altroot = old->altroot; - path_get(&old->altroot); - } else { - fs->altroot.mnt = NULL; - fs->altroot.dentry = NULL; - } read_unlock(&old->lock); } return fs; -- cgit v1.2.3-70-g09d2 From a110343f0d6d41f68b7cf8c00b57a3172c67f816 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Jul 2008 09:19:08 -0400 Subject: [PATCH] fix MAY_CHDIR/MAY_ACCESS/LOOKUP_ACCESS mess * MAY_CHDIR is redundant - it's an equivalent of MAY_ACCESS * MAY_ACCESS on fuse should affect only the last step of pathname resolution * fchdir() and chroot() should pass MAY_ACCESS, for the same reason why chdir() needs that. * now that we pass MAY_ACCESS explicitly in all cases, LOOKUP_ACCESS can be removed; it has no business being in nameidata. Signed-off-by: Al Viro --- fs/fuse/dir.c | 2 +- fs/namei.c | 2 -- fs/open.c | 10 +++++----- include/linux/fs.h | 3 +-- include/linux/namei.h | 1 - 5 files changed, 7 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 48a7934cb95..fd03330cade 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -962,7 +962,7 @@ static int fuse_permission(struct inode *inode, int mask) exist. 
So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ - } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { + } else if (mask & MAY_ACCESS) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { diff --git a/fs/namei.c b/fs/namei.c index 095818089ac..33dcaf025c4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -265,8 +265,6 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) if (inode->i_op && inode->i_op->permission) { int extra = 0; if (nd) { - if (nd->flags & LOOKUP_ACCESS) - extra |= MAY_ACCESS; if (nd->flags & LOOKUP_OPEN) extra |= MAY_OPEN; } diff --git a/fs/open.c b/fs/open.c index d3a2a00f52d..3317e1909b2 100644 --- a/fs/open.c +++ b/fs/open.c @@ -457,11 +457,11 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) old_cap = cap_set_effective(current->cap_permitted); } - res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); if (res) goto out; - res = vfs_permission(&nd, mode); + res = vfs_permission(&nd, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if(res || !(mode & S_IWOTH) || special_file(nd.path.dentry->d_inode->i_mode)) @@ -505,7 +505,7 @@ asmlinkage long sys_chdir(const char __user * filename) if (error) goto out; - error = vfs_permission(&nd, MAY_EXEC | MAY_CHDIR); + error = vfs_permission(&nd, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; @@ -534,7 +534,7 @@ asmlinkage long sys_fchdir(unsigned int fd) if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = file_permission(file, MAY_EXEC); + error = file_permission(file, MAY_EXEC | MAY_ACCESS); if (!error) set_fs_pwd(current->fs, &file->f_path); out_putf: @@ -552,7 +552,7 @@ asmlinkage long sys_chroot(const char __user * filename) if (error) goto out; - error = vfs_permission(&nd, MAY_EXEC); + error = vfs_permission(&nd, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 25998e803fc..d8721e818b4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -61,8 +61,7 @@ extern int dir_notify_enable; #define MAY_READ 4 #define MAY_APPEND 8 #define MAY_ACCESS 16 -#define MAY_CHDIR 32 -#define MAY_OPEN 64 +#define MAY_OPEN 32 #define FMODE_READ 1 #define FMODE_WRITE 2 diff --git a/include/linux/namei.h b/include/linux/namei.h index 768773d5785..60e35a02f6c 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -53,7 +53,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; */ #define LOOKUP_OPEN (0x0100) #define LOOKUP_CREATE (0x0200) -#define LOOKUP_ACCESS (0x0400) extern int __user_walk(const char __user *, unsigned, struct nameidata *); extern int __user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *); -- cgit v1.2.3-70-g09d2 From b77b0646ef4efe31a7449bb3d9360fd00f95433d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Jul 2008 09:37:02 -0400 Subject: [PATCH] pass MAY_OPEN to vfs_permission() explicitly ... and get rid of the last "let's deduce mask from nameidata->flags" bit. 
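A minimal sketch of what this buys a filesystem (not taken from the patch): with MAY_OPEN carried in the mask, a ->permission method can special-case open-time checks without ever seeing a nameidata. The name examplefs_permission() is hypothetical; generic_permission() already masks off bits beyond MAY_READ, MAY_WRITE and MAY_EXEC, so passing the extra bit through is safe.

#include <linux/fs.h>

/* Illustrative only: distinguish open-time checks by mask bit alone. */
static int examplefs_permission(struct inode *inode, int mask)
{
        if (mask & MAY_OPEN) {
                /* open(2)/exec path: a filesystem could refresh cached
                 * credentials or attributes here before the DAC check */
        }
        return generic_permission(inode, mask, NULL);
}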
Signed-off-by: Al Viro --- fs/exec.c | 4 ++-- fs/namei.c | 13 ++++--------- include/linux/security.h | 7 +++---- security/capability.c | 3 +-- security/security.c | 4 ++-- security/selinux/hooks.c | 5 ++--- security/smack/smack_lsm.c | 3 +-- 7 files changed, 15 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index b8792a13153..0ba5d355c5a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -118,7 +118,7 @@ asmlinkage long sys_uselib(const char __user * library) if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) goto exit; - error = vfs_permission(&nd, MAY_READ | MAY_EXEC); + error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -666,7 +666,7 @@ struct file *open_exec(const char *name) struct inode *inode = nd.path.dentry->d_inode; file = ERR_PTR(-EACCES); if (S_ISREG(inode->i_mode)) { - int err = vfs_permission(&nd, MAY_EXEC); + int err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); file = ERR_PTR(err); if (!err) { file = nameidata_to_filp(&nd, diff --git a/fs/namei.c b/fs/namei.c index 33dcaf025c4..6b0e8e5e079 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -263,12 +263,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) /* Ordinary permission routines do not understand MAY_APPEND. */ if (inode->i_op && inode->i_op->permission) { - int extra = 0; - if (nd) { - if (nd->flags & LOOKUP_OPEN) - extra |= MAY_OPEN; - } - retval = inode->i_op->permission(inode, mask | extra); + retval = inode->i_op->permission(inode, mask); if (!retval) { /* * Exec permission on a regular file is denied if none @@ -292,7 +287,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) return retval; return security_inode_permission(inode, - mask & (MAY_READ|MAY_WRITE|MAY_EXEC), nd); + mask & (MAY_READ|MAY_WRITE|MAY_EXEC)); } /** @@ -492,7 +487,7 @@ static int exec_permission_lite(struct inode *inode, return -EACCES; ok: - return security_inode_permission(inode, MAY_EXEC, nd); + return security_inode_permission(inode, MAY_EXEC); } /* @@ -1692,7 +1687,7 @@ struct file *do_filp_open(int dfd, const char *pathname, int will_write; int flag = open_to_namei_flags(open_flag); - acc_mode = ACC_MODE(flag); + acc_mode = MAY_OPEN | ACC_MODE(flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) diff --git a/include/linux/security.h b/include/linux/security.h index f0e9adb22ac..fd96e7f8a6f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1362,7 +1362,7 @@ struct security_operations { struct inode *new_dir, struct dentry *new_dentry); int (*inode_readlink) (struct dentry *dentry); int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd); - int (*inode_permission) (struct inode *inode, int mask, struct nameidata *nd); + int (*inode_permission) (struct inode *inode, int mask); int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); int (*inode_getattr) (struct vfsmount *mnt, struct dentry *dentry); void (*inode_delete) (struct inode *inode); @@ -1628,7 +1628,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); -int security_inode_permission(struct inode *inode, int mask, struct nameidata *nd); +int security_inode_permission(struct inode *inode, int mask); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(struct 
vfsmount *mnt, struct dentry *dentry); void security_inode_delete(struct inode *inode); @@ -2021,8 +2021,7 @@ static inline int security_inode_follow_link(struct dentry *dentry, return 0; } -static inline int security_inode_permission(struct inode *inode, int mask, - struct nameidata *nd) +static inline int security_inode_permission(struct inode *inode, int mask) { return 0; } diff --git a/security/capability.c b/security/capability.c index 5b01c0b0242..63d10da515a 100644 --- a/security/capability.c +++ b/security/capability.c @@ -211,8 +211,7 @@ static int cap_inode_follow_link(struct dentry *dentry, return 0; } -static int cap_inode_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int cap_inode_permission(struct inode *inode, int mask) { return 0; } diff --git a/security/security.c b/security/security.c index 59f23b5918b..78ed3ffde24 100644 --- a/security/security.c +++ b/security/security.c @@ -429,11 +429,11 @@ int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) return security_ops->inode_follow_link(dentry, nd); } -int security_inode_permission(struct inode *inode, int mask, struct nameidata *nd) +int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) return 0; - return security_ops->inode_permission(inode, mask, nd); + return security_ops->inode_permission(inode, mask); } int security_inode_setattr(struct dentry *dentry, struct iattr *attr) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3481cde5bf1..5ba13908b5b 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2624,12 +2624,11 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct nameidata *na return dentry_has_perm(current, NULL, dentry, FILE__READ); } -static int selinux_inode_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int selinux_inode_permission(struct inode *inode, int mask) { int rc; - rc = secondary_ops->inode_permission(inode, mask, nd); + rc = secondary_ops->inode_permission(inode, mask); if (rc) return rc; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index ee5a51cbc5e..1b40e558f98 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -522,8 +522,7 @@ static int smack_inode_rename(struct inode *old_inode, * * Returns 0 if access is permitted, -EACCES otherwise */ -static int smack_inode_permission(struct inode *inode, int mask, - struct nameidata *nd) +static int smack_inode_permission(struct inode *inode, int mask) { /* * No permission to check. Existence test. Yup, it's there. -- cgit v1.2.3-70-g09d2 From 672b16b2f66c149888bd876a4f92342112205fe1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Jul 2008 09:45:01 -0400 Subject: [PATCH] more nameidata removal: exec_permission_lite() doesn't need it Signed-off-by: Al Viro --- fs/namei.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 6b0e8e5e079..6d75430358a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -460,8 +460,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, * short-cut DAC fails, then call permission() to do more * complete permission check. 
*/ -static int exec_permission_lite(struct inode *inode, - struct nameidata *nd) +static int exec_permission_lite(struct inode *inode) { umode_t mode = inode->i_mode; @@ -884,7 +883,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) unsigned int c; nd->flags |= LOOKUP_CONTINUE; - err = exec_permission_lite(inode, nd); + err = exec_permission_lite(inode); if (err == -EAGAIN) err = vfs_permission(nd, MAY_EXEC); if (err) -- cgit v1.2.3-70-g09d2 From 88b387824fdaecb6ba0f471acf0aadf7d24739fd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Jul 2008 18:06:36 +0800 Subject: [PATCH] vfs: use kstrdup() and check failing allocation - use kstrdup() instead of kmalloc() + memcpy() - return NULL if allocating ->mnt_devname failed - mnt_devname should be const Signed-off-by: Li Zefan Acked-by: Cyrill Gorcunov Signed-off-by: Al Viro --- fs/namespace.c | 24 +++++++++++++----------- include/linux/mount.h | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index c4fcf48acef..26380f59953 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -112,9 +112,13 @@ struct vfsmount *alloc_vfsmnt(const char *name) int err; err = mnt_alloc_id(mnt); - if (err) { - kmem_cache_free(mnt_cache, mnt); - return NULL; + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; } atomic_set(&mnt->mnt_count, 1); @@ -127,16 +131,14 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); atomic_set(&mnt->__mnt_writers, 0); - if (name) { - int size = strlen(name) + 1; - char *newname = kmalloc(size, GFP_KERNEL); - if (newname) { - memcpy(newname, name, size); - mnt->mnt_devname = newname; - } - } } return mnt; + +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; } /* diff --git a/include/linux/mount.h b/include/linux/mount.h index 4374d1adeb4..b5efaa2132a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -47,7 +47,7 @@ struct vfsmount { struct list_head mnt_child; /* and going through their mnt_child */ int mnt_flags; /* 4 bytes hole on 64bits arches */ - char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ + const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ -- cgit v1.2.3-70-g09d2 From 9767d74957450da6365c363d69e3d02d605d7375 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Jul 2008 15:01:26 +0200 Subject: [patch 1/4] vfs: utimes: move owner check into inode_change_ok() Add a new ia_valid flag: ATTR_TIMES_SET, to handle the UTIMES_OMIT/UTIMES_NOW and UTIMES_NOW/UTIMES_OMIT cases. In these cases neither ATTR_MTIME_SET nor ATTR_ATIME_SET is in the flags, yet the POSIX draft specifies that permission checking is performed the same way as if one or both of the times was explicitly set to a timestamp. See the path "vfs: utimensat(): fix error checking for {UTIME_NOW,UTIME_OMIT} case" by Michael Kerrisk for the patch introducing this behavior. This is a cleanup, as well as allowing filesystems (NFS/fuse/...) to perform their own permission checking instead of the default. 
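As a hedged sketch of the semantics (it mirrors the inode_change_ok() hunk below rather than adding anything new): any explicit timestamp update, including the UTIME_NOW/UTIME_OMIT combinations that set only ATTR_TIMES_SET, now requires the caller to own the inode or hold the appropriate capability. The name example_check_times() is invented; the flags and is_owner_or_cap() are the real interfaces.

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/sched.h>

/* Illustrative only: the ownership check an explicit time update hits. */
static int example_check_times(struct inode *inode, const struct iattr *attr)
{
        if ((attr->ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) &&
            !is_owner_or_cap(inode))
                return -EPERM;
        return 0;
}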
CC: Ulrich Drepper CC: Michael Kerrisk Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/attr.c | 2 +- fs/utimes.c | 17 ++++------------- include/linux/fs.h | 33 +++++++++++++++++---------------- 3 files changed, 22 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 966b73e25f8..765fc75fab3 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -51,7 +51,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr) } /* Check for setting the inode time. */ - if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { if (!is_owner_or_cap(inode)) goto error; } diff --git a/fs/utimes.c b/fs/utimes.c index b6b664e7145..ecf8941ba34 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -101,7 +101,6 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags times[1].tv_nsec == UTIME_NOW) times = NULL; - /* In most cases, the checks are done in inode_change_ok() */ newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { error = -EPERM; @@ -123,21 +122,13 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; newattrs.ia_valid |= ATTR_MTIME_SET; } - /* - * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT - * cases, we need to make an extra check that is not done by - * inode_change_ok(). + * Tell inode_change_ok(), that this is an explicit time + * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET + * were used. */ - if (((times[0].tv_nsec == UTIME_NOW && - times[1].tv_nsec == UTIME_OMIT) - || - (times[0].tv_nsec == UTIME_OMIT && - times[1].tv_nsec == UTIME_NOW)) - && !is_owner_or_cap(inode)) - goto mnt_drop_write_and_out; + newattrs.ia_valid |= ATTR_TIMES_SET; } else { - /* * If times is NULL (or both times are UTIME_NOW), * then we need to check permissions, because diff --git a/include/linux/fs.h b/include/linux/fs.h index d8721e818b4..527b9e482f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -320,22 +320,23 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * Attribute flags. These should be or-ed together to figure out what * has been changed! */ -#define ATTR_MODE 1 -#define ATTR_UID 2 -#define ATTR_GID 4 -#define ATTR_SIZE 8 -#define ATTR_ATIME 16 -#define ATTR_MTIME 32 -#define ATTR_CTIME 64 -#define ATTR_ATIME_SET 128 -#define ATTR_MTIME_SET 256 -#define ATTR_FORCE 512 /* Not a change, but a change it */ -#define ATTR_ATTR_FLAG 1024 -#define ATTR_KILL_SUID 2048 -#define ATTR_KILL_SGID 4096 -#define ATTR_FILE 8192 -#define ATTR_KILL_PRIV 16384 -#define ATTR_OPEN 32768 /* Truncating from open(O_TRUNC) */ +#define ATTR_MODE (1 << 0) +#define ATTR_UID (1 << 1) +#define ATTR_GID (1 << 2) +#define ATTR_SIZE (1 << 3) +#define ATTR_ATIME (1 << 4) +#define ATTR_MTIME (1 << 5) +#define ATTR_CTIME (1 << 6) +#define ATTR_ATIME_SET (1 << 7) +#define ATTR_MTIME_SET (1 << 8) +#define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG (1 << 10) +#define ATTR_KILL_SUID (1 << 11) +#define ATTR_KILL_SGID (1 << 12) +#define ATTR_FILE (1 << 13) +#define ATTR_KILL_PRIV (1 << 14) +#define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ +#define ATTR_TIMES_SET (1 << 16) /* * This is the Inode Attributes structure, used for notify_change(). 
It -- cgit v1.2.3-70-g09d2 From e9b76fedc61235da80b6b7f81dfd67ec224dfb49 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Jul 2008 15:01:27 +0200 Subject: [patch 2/4] vfs: utimes cleanup Untange the mess that is do_utimes(). Add kerneldoc comment to do_utimes(). CC: Ulrich Drepper CC: Michael Kerrisk Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/utimes.c | 114 ++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/utimes.c b/fs/utimes.c index ecf8941ba34..8e09dbdfd7f 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -48,54 +48,15 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -/* If times==NULL, set access and modification to current time, - * must be owner or have write permission. - * Else, update from *times, must be owner or super user. - */ -long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) +static int utimes_common(struct path *path, struct timespec *times) { int error; - struct nameidata nd; - struct dentry *dentry; - struct inode *inode; struct iattr newattrs; - struct file *f = NULL; - struct vfsmount *mnt; - - error = -EINVAL; - if (times && (!nsec_valid(times[0].tv_nsec) || - !nsec_valid(times[1].tv_nsec))) { - goto out; - } - - if (flags & ~AT_SYMLINK_NOFOLLOW) - goto out; - - if (filename == NULL && dfd != AT_FDCWD) { - error = -EINVAL; - if (flags & AT_SYMLINK_NOFOLLOW) - goto out; - - error = -EBADF; - f = fget(dfd); - if (!f) - goto out; - dentry = f->f_path.dentry; - mnt = f->f_path.mnt; - } else { - error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd); - if (error) - goto out; - - dentry = nd.path.dentry; - mnt = nd.path.mnt; - } - - inode = dentry->d_inode; + struct inode *inode = path->dentry->d_inode; - error = mnt_want_write(mnt); + error = mnt_want_write(path->mnt); if (error) - goto dput_and_out; + goto out; if (times && times[0].tv_nsec == UTIME_NOW && times[1].tv_nsec == UTIME_NOW) @@ -145,15 +106,70 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags } } mutex_lock(&inode->i_mutex); - error = notify_change(dentry, &newattrs); + error = notify_change(path->dentry, &newattrs); mutex_unlock(&inode->i_mutex); + mnt_drop_write_and_out: - mnt_drop_write(mnt); -dput_and_out: - if (f) - fput(f); - else + mnt_drop_write(path->mnt); +out: + return error; +} + +/* + * do_utimes - change times on filename or file descriptor + * @dfd: open file descriptor, -1 or AT_FDCWD + * @filename: path name or NULL + * @times: new times or NULL + * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) + * + * If filename is NULL and dfd refers to an open file, then operate on + * the file. Otherwise look up filename, possibly using dfd as a + * starting point. + * + * If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. 
+ */ +long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) +{ + int error = -EINVAL; + + if (times && (!nsec_valid(times[0].tv_nsec) || + !nsec_valid(times[1].tv_nsec))) { + goto out; + } + + if (flags & ~AT_SYMLINK_NOFOLLOW) + goto out; + + if (filename == NULL && dfd != AT_FDCWD) { + struct file *file; + + if (flags & AT_SYMLINK_NOFOLLOW) + goto out; + + file = fget(dfd); + error = -EBADF; + if (!file) + goto out; + + error = utimes_common(&file->f_path, times); + fput(file); + } else { + struct nameidata nd; + int lookup_flags = 0; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + error = __user_walk_fd(dfd, filename, lookup_flags, &nd); + if (error) + goto out; + + error = utimes_common(&nd.path, times); path_put(&nd.path); + } + out: return error; } -- cgit v1.2.3-70-g09d2 From b1da47e29e467f1ec36dc78d009bfb109fd533c7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Jul 2008 15:01:28 +0200 Subject: [patch 3/4] fat: dont call notify_change The FAT_IOCTL_SET_ATTRIBUTES ioctl() calls notify_change() to change the file mode before changing the inode attributes. Replace with explicit calls to security_inode_setattr(), fat_setattr() and fsnotify_change(). This is equivalent to the original. The reason it is needed, is that later in the series we move the immutable check into notify_change(). That would break the FAT_IOCTL_SET_ATTRIBUTES ioctl, as it needs to perform the mode change regardless of the immutability of the file. [Fix error if fat is built as a module. Thanks to OGAWA Hirofumi for noticing.] Signed-off-by: Miklos Szeredi Acked-by: OGAWA Hirofumi Signed-off-by: Al Viro --- fs/fat/file.c | 15 ++++++++++++++- security/security.c | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index c672df4036e..8707a8cfa02 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include int fat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) @@ -64,6 +66,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, /* Equivalent to a chmod() */ ia.ia_valid = ATTR_MODE | ATTR_CTIME; + ia.ia_ctime = current_fs_time(inode->i_sb); if (is_dir) { ia.ia_mode = MSDOS_MKMODE(attr, S_IRWXUGO & ~sbi->options.fs_dmask) @@ -90,11 +93,21 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, } } + /* + * The security check is questionable... We single + * out the RO attribute for checking by the security + * module, just because it maps to a file mode. + */ + err = security_inode_setattr(filp->f_path.dentry, &ia); + if (err) + goto up; + /* This MUST be done before doing anything irreversible... 
*/ - err = notify_change(filp->f_path.dentry, &ia); + err = fat_setattr(filp->f_path.dentry, &ia); if (err) goto up; + fsnotify_change(filp->f_path.dentry, ia.ia_valid); if (sbi->options.sys_immutable) { if (attr & ATTR_SYS) inode->i_flags |= S_IMMUTABLE; diff --git a/security/security.c b/security/security.c index 78ed3ffde24..ff706872775 100644 --- a/security/security.c +++ b/security/security.c @@ -442,6 +442,7 @@ int security_inode_setattr(struct dentry *dentry, struct iattr *attr) return 0; return security_ops->inode_setattr(dentry, attr); } +EXPORT_SYMBOL_GPL(security_inode_setattr); int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry) { -- cgit v1.2.3-70-g09d2 From beb29e058c35ab69e96e455a12ccf7505f6de425 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Jul 2008 15:01:29 +0200 Subject: [patch 4/4] vfs: immutable inode checking cleanup Move the immutable and append-only checks from chmod, chown and utimes into notify_change(). Checks for immutable and append-only files are always performed by the VFS and not by the filesystem (see permission() and may_...() in namei.c), so these belong in notify_change(), and not in inode_change_ok(). This should be completely equivalent. CC: Ulrich Drepper CC: Michael Kerrisk Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/attr.c | 5 +++++ fs/open.c | 24 ++---------------------- fs/utimes.c | 4 ---- 3 files changed, 7 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 765fc75fab3..26c71ba1eed 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -108,6 +108,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr) struct timespec now; unsigned int ia_valid = attr->ia_valid; + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + now = current_fs_time(inode->i_sb); attr->ia_ctime = now; diff --git a/fs/open.c b/fs/open.c index 3317e1909b2..3b3c43674be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -588,9 +588,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) err = mnt_want_write(file->f_path.mnt); if (err) goto out_putf; - err = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; @@ -598,8 +595,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; err = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); - -out_drop_write: mnt_drop_write(file->f_path.mnt); out_putf: fput(file); @@ -623,11 +618,6 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename, error = mnt_want_write(nd.path.mnt); if (error) goto dput_and_out; - - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_drop_write; - mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; @@ -635,8 +625,6 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename, newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(nd.path.dentry, &newattrs); mutex_unlock(&inode->i_mutex); - -out_drop_write: mnt_drop_write(nd.path.mnt); dput_and_out: path_put(&nd.path); @@ -651,18 +639,10 @@ asmlinkage long sys_chmod(const char __user *filename, mode_t mode) static int chown_common(struct dentry * dentry, uid_t user, gid_t group) { - struct inode * inode; + struct inode *inode = dentry->d_inode; int error; struct iattr newattrs; - error = -ENOENT; - if (!(inode = dentry->d_inode)) { - printk(KERN_ERR 
"chown_common: NULL inode\n"); - goto out; - } - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { newattrs.ia_valid |= ATTR_UID; @@ -678,7 +658,7 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) mutex_lock(&inode->i_mutex); error = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); -out: + return error; } diff --git a/fs/utimes.c b/fs/utimes.c index 8e09dbdfd7f..dad679d3a15 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -64,10 +64,6 @@ static int utimes_common(struct path *path, struct timespec *times) newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { - error = -EPERM; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - goto mnt_drop_write_and_out; - if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { -- cgit v1.2.3-70-g09d2 From e56b6a5dda1a36ffaa532df6f975ea324298fa4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 May 2008 07:53:34 +0200 Subject: Re: [PATCH 3/6] vfs: open_exec cleanup On Mon, May 19, 2008 at 12:01:49AM +0200, Marcin Slusarz wrote: > open_exec is needlessly indented, calls ERR_PTR with 0 argument > (which is not valid errno) and jumps into middle of function > just to return value. > So clean it up a bit. Still looks rather messy. See below for a better version. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/exec.c | 58 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0ba5d355c5a..346e3f69c6e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -656,38 +656,40 @@ EXPORT_SYMBOL(setup_arg_pages); struct file *open_exec(const char *name) { struct nameidata nd; - int err; struct file *file; + int err; - err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); - file = ERR_PTR(err); - - if (!err) { - struct inode *inode = nd.path.dentry->d_inode; - file = ERR_PTR(-EACCES); - if (S_ISREG(inode->i_mode)) { - int err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); - file = ERR_PTR(err); - if (!err) { - file = nameidata_to_filp(&nd, - O_RDONLY|O_LARGEFILE); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { - fput(file); - file = ERR_PTR(err); - } - } -out: - return file; - } - } - release_open_intent(&nd); - path_put(&nd.path); + err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, + FMODE_READ|FMODE_EXEC); + if (err) + goto out; + + err = -EACCES; + if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) + goto out_path_put; + + err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + if (err) + goto out_path_put; + + file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); + if (IS_ERR(file)) + return file; + + err = deny_write_access(file); + if (err) { + fput(file); + goto out; } - goto out; -} + return file; + + out_path_put: + release_open_intent(&nd); + path_put(&nd.path); + out: + return ERR_PTR(err); +} EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, unsigned long offset, -- cgit v1.2.3-70-g09d2 From 30524472c2f728c20d6bf35191042a5d455c0a64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Jul 2008 00:02:33 -0400 Subject: [PATCH] take noexec checks to very few callers that care Signed-off-by: Al Viro --- fs/exec.c | 7 +++++++ fs/namei.c | 9 --------- fs/open.c | 10 ++++++++++ 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 
346e3f69c6e..eca58c29ede 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -118,6 +118,10 @@ asmlinkage long sys_uselib(const char __user * library) if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) goto exit; + error = -EACCES; + if (nd.path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -668,6 +672,9 @@ struct file *open_exec(const char *name) if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) goto out_path_put; + if (nd.path.mnt->mnt_flags & MNT_NOEXEC) + goto out_path_put; + err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; diff --git a/fs/namei.c b/fs/namei.c index 6d75430358a..396cb3e5c36 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -252,15 +252,6 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) return -EACCES; } - if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { - /* - * MAY_EXEC on regular files is denied if the fs is mounted - * with the "noexec" flag. - */ - if (mnt && (mnt->mnt_flags & MNT_NOEXEC)) - return -EACCES; - } - /* Ordinary permission routines do not understand MAY_APPEND. */ if (inode->i_op && inode->i_op->permission) { retval = inode->i_op->permission(inode, mask); diff --git a/fs/open.c b/fs/open.c index 3b3c43674be..d5e421ad0cf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -461,6 +461,16 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) if (res) goto out; + if ((mode & MAY_EXEC) && S_ISREG(nd.path.dentry->d_inode->i_mode)) { + /* + * MAY_EXEC on regular files is denied if the fs is mounted + * with the "noexec" flag. + */ + res = -EACCES; + if (nd.path.mnt->mnt_flags & MNT_NOEXEC) + goto out_path_release; + } + res = vfs_permission(&nd, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if(res || !(mode & S_IWOTH) || -- cgit v1.2.3-70-g09d2 From f419a2e3b64def707e1384ee38abb77f99af5f6d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Jul 2008 00:07:17 -0400 Subject: [PATCH] kill nameidata passing to permission(), rename to inode_permission() Incidentally, the name that gives hundreds of false positives on grep is not a good idea... 
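For callers the conversion is mechanical; a minimal sketch of the before/after shape (foo_may_write_dir is a hypothetical helper, not taken from the patch):

        static int foo_may_write_dir(struct inode *dir)
        {
                /* old: return permission(dir, MAY_WRITE | MAY_EXEC, NULL); */
                return inode_permission(dir, MAY_WRITE | MAY_EXEC);
        }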
Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 2 +- fs/namei.c | 22 +++++++++------------- fs/nfsd/nfsfh.c | 2 +- fs/nfsd/vfs.c | 4 ++-- fs/utimes.c | 2 +- fs/xattr.c | 2 +- include/linux/fs.h | 2 +- ipc/mqueue.c | 2 +- 8 files changed, 17 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f25caf2b088..89209f00f9c 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -830,7 +830,7 @@ out: static int ecryptfs_permission(struct inode *inode, int mask) { - return permission(ecryptfs_inode_to_lower(inode), mask, NULL); + return inode_permission(ecryptfs_inode_to_lower(inode), mask); } /** diff --git a/fs/namei.c b/fs/namei.c index 396cb3e5c36..5029b93ebbd 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -227,13 +227,9 @@ int generic_permission(struct inode *inode, int mask, return -EACCES; } -int permission(struct inode *inode, int mask, struct nameidata *nd) +int inode_permission(struct inode *inode, int mask) { int retval; - struct vfsmount *mnt = NULL; - - if (nd) - mnt = nd->path.mnt; if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; @@ -293,7 +289,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) */ int vfs_permission(struct nameidata *nd, int mask) { - return permission(nd->path.dentry->d_inode, mask, nd); + return inode_permission(nd->path.dentry->d_inode, mask); } /** @@ -310,7 +306,7 @@ int vfs_permission(struct nameidata *nd, int mask) */ int file_permission(struct file *file, int mask) { - return permission(file->f_path.dentry->d_inode, mask, NULL); + return inode_permission(file->f_path.dentry->d_inode, mask); } /* @@ -1262,7 +1258,7 @@ static struct dentry *lookup_hash(struct nameidata *nd) { int err; - err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd); + err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); @@ -1310,7 +1306,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = permission(base->d_inode, MAY_EXEC, NULL); + err = inode_permission(base->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&this, base, NULL); @@ -1400,7 +1396,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(victim->d_name.name, victim, dir); - error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) @@ -1437,7 +1433,7 @@ static inline int may_create(struct inode *dir, struct dentry *child, return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return permission(dir,MAY_WRITE | MAY_EXEC, nd); + return inode_permission(dir, MAY_WRITE | MAY_EXEC); } /* @@ -2543,7 +2539,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, * we'll need to flip '..'. 
*/ if (new_dir != old_dir) { - error = permission(old_dentry->d_inode, MAY_WRITE, NULL); + error = inode_permission(old_dentry->d_inode, MAY_WRITE); if (error) return error; } @@ -2897,7 +2893,7 @@ EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(permission); +EXPORT_SYMBOL(inode_permission); EXPORT_SYMBOL(vfs_permission); EXPORT_SYMBOL(file_permission); EXPORT_SYMBOL(unlock_rename); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f45451eb1e3..ea37c96f044 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -51,7 +51,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = permission(parent->d_inode, MAY_EXEC, NULL); + err = inode_permission(parent->d_inode, MAY_EXEC); if (err < 0) { dput(parent); break; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ad1ad59e374..18060bed526 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1953,12 +1953,12 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); + err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) - err = permission(inode, MAY_EXEC, NULL); + err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; } diff --git a/fs/utimes.c b/fs/utimes.c index dad679d3a15..dc28b782625 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -96,7 +96,7 @@ static int utimes_common(struct path *path, struct timespec *times) goto mnt_drop_write_and_out; if (!is_owner_or_cap(inode)) { - error = permission(inode, MAY_WRITE, NULL); + error = inode_permission(inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; } diff --git a/fs/xattr.c b/fs/xattr.c index 4706a8b1f49..b96222e05ba 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -63,7 +63,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) return -EPERM; } - return permission(inode, mask, NULL); + return inode_permission(inode, mask); } int diff --git a/include/linux/fs.h b/include/linux/fs.h index 527b9e482f9..9d2de4cadab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1770,7 +1770,7 @@ extern int do_remount_sb(struct super_block *sb, int flags, extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); -extern int permission(struct inode *, int, struct nameidata *); +extern int inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int, int (*check_acl)(struct inode *, int)); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 474984f9e03..96fb36cd987 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -638,7 +638,7 @@ static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE, return ERR_PTR(-EINVAL); } - if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) { + if (inode_permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE])) { dput(dentry); mntput(mqueue_mnt); return ERR_PTR(-EACCES); -- cgit v1.2.3-70-g09d2 From 256984a83880ff7ac78055cb87baea48137f0b77 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Jul 2008 08:09:30 -0400 Subject: [PATCH] preparation to __user_walk_fd cleanup Almost all users __user_walk_fd() and friends care only about 
struct path. Get rid of the few that do not. Signed-off-by: Al Viro --- fs/inotify_user.c | 2 +- fs/open.c | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/inotify_user.c b/fs/inotify_user.c index fe79c25d95d..9b99ebf2888 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -365,7 +365,7 @@ static int find_inode(const char __user *dirname, struct nameidata *nd, if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = vfs_permission(nd, MAY_READ); + error = inode_permission(nd->path.dentry->d_inode, MAY_READ); if (error) path_put(&nd->path); return error; diff --git a/fs/open.c b/fs/open.c index d5e421ad0cf..e94266700ed 100644 --- a/fs/open.c +++ b/fs/open.c @@ -251,7 +251,7 @@ static long do_sys_truncate(const char __user * path, loff_t length) if (error) goto dput_and_out; - error = vfs_permission(&nd, MAY_WRITE); + error = inode_permission(inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -426,6 +426,7 @@ out: asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { struct nameidata nd; + struct inode *inode; int old_fsuid, old_fsgid; kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ int res; @@ -461,7 +462,9 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) if (res) goto out; - if ((mode & MAY_EXEC) && S_ISREG(nd.path.dentry->d_inode->i_mode)) { + inode = nd.path.dentry->d_inode; + + if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. @@ -471,10 +474,9 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) goto out_path_release; } - res = vfs_permission(&nd, mode | MAY_ACCESS); + res = inode_permission(inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ - if(res || !(mode & S_IWOTH) || - special_file(nd.path.dentry->d_inode->i_mode)) + if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() @@ -515,7 +517,7 @@ asmlinkage long sys_chdir(const char __user * filename) if (error) goto out; - error = vfs_permission(&nd, MAY_EXEC | MAY_ACCESS); + error = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; @@ -544,7 +546,7 @@ asmlinkage long sys_fchdir(unsigned int fd) if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = file_permission(file, MAY_EXEC | MAY_ACCESS); + error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); if (!error) set_fs_pwd(current->fs, &file->f_path); out_putf: @@ -562,7 +564,7 @@ asmlinkage long sys_chroot(const char __user * filename) if (error) goto out; - error = vfs_permission(&nd, MAY_EXEC | MAY_ACCESS); + error = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; -- cgit v1.2.3-70-g09d2 From 2d8f30380ab8c706f4e0a8f1aaa22b5886e9ac8a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Jul 2008 09:59:21 -0400 Subject: [PATCH] sanitize __user_walk_fd() et.al. * do not pass nameidata; struct path is all the callers want. * switch to new helpers: user_path_at(dfd, pathname, flags, &path) user_path(pathname, &path) user_lpath(pathname, &path) user_path_dir(pathname, &path) (fail if not a directory) The last 3 are trivial macro wrappers for the first one. * remove nameidata in callers. 
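A minimal sketch of the resulting calling convention (foo_example_lookup is a hypothetical user, not part of the patch): the caller resolves the name straight into a struct path, uses it, and drops the reference with path_put():

        static int foo_example_lookup(int dfd, const char __user *filename)
        {
                struct path path;
                int error;

                error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
                if (error)
                        return error;

                /* use path.mnt / path.dentry here ... */
                error = inode_permission(path.dentry->d_inode, MAY_READ);

                path_put(&path);
                return error;
        }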
Signed-off-by: Al Viro --- arch/alpha/kernel/osf_sys.c | 10 ++-- arch/parisc/hpux/sys_hpux.c | 10 ++-- fs/coda/pioctl.c | 14 ++--- fs/compat.c | 20 +++---- fs/inotify_user.c | 22 ++++---- fs/namei.c | 36 ++++++------- fs/namespace.c | 74 +++++++++++++------------- fs/open.c | 124 +++++++++++++++++++++---------------------- fs/stat.c | 32 +++++------ fs/utimes.c | 8 +-- fs/xattr.c | 96 ++++++++++++++++----------------- fs/xfs/linux-2.6/xfs_ioctl.c | 14 +++-- include/linux/namei.h | 13 ++--- 13 files changed, 235 insertions(+), 238 deletions(-) (limited to 'fs') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 32ca1b92730..6e943135f0e 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -253,15 +253,15 @@ do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer, } asmlinkage int -osf_statfs(char __user *path, struct osf_statfs __user *buffer, unsigned long bufsiz) +osf_statfs(char __user *pathname, struct osf_statfs __user *buffer, unsigned long bufsiz) { - struct nameidata nd; + struct path path; int retval; - retval = user_path_walk(path, &nd); + retval = user_path(pathname, &path); if (!retval) { - retval = do_osf_statfs(nd.path.dentry, buffer, bufsiz); - path_put(&nd.path); + retval = do_osf_statfs(path.dentry, buffer, bufsiz); + path_put(&path); } return retval; } diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index be255ebb609..18072e03a01 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -210,19 +210,19 @@ static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf) } /* hpux statfs */ -asmlinkage long hpux_statfs(const char __user *path, +asmlinkage long hpux_statfs(const char __user *pathname, struct hpux_statfs __user *buf) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct hpux_statfs tmp; - error = vfs_statfs_hpux(nd.path.dentry, &tmp); + error = vfs_statfs_hpux(path.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; - path_put(&nd.path); + path_put(&path); } return error; } diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index c38a98974fb..c51365422aa 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -49,7 +49,7 @@ static int coda_ioctl_permission(struct inode *inode, int mask) static int coda_pioctl(struct inode * inode, struct file * filp, unsigned int cmd, unsigned long user_data) { - struct nameidata nd; + struct path path; int error; struct PioctlData data; struct inode *target_inode = NULL; @@ -64,21 +64,21 @@ static int coda_pioctl(struct inode * inode, struct file * filp, * Look up the pathname. 
Note that the pathname is in * user memory, and namei takes care of this */ - if ( data.follow ) { - error = user_path_walk(data.path, &nd); + if (data.follow) { + error = user_path(data.path, &path); } else { - error = user_path_walk_link(data.path, &nd); + error = user_lpath(data.path, &path); } if ( error ) { return error; } else { - target_inode = nd.path.dentry->d_inode; + target_inode = path.dentry->d_inode; } /* return if it is not a Coda inode */ if ( target_inode->i_sb != inode->i_sb ) { - path_put(&nd.path); + path_put(&path); return -EINVAL; } @@ -87,7 +87,7 @@ static int coda_pioctl(struct inode * inode, struct file * filp, error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); - path_put(&nd.path); + path_put(&path); return error; } diff --git a/fs/compat.c b/fs/compat.c index 106eba28ec5..c9d1472e65c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -234,18 +234,18 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * * The following statfs calls are copies of code from fs/open.c and * should be checked against those from time to time */ -asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs __user *buf) +asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.path.dentry, &tmp); + error = vfs_statfs(path.dentry, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); - path_put(&nd.path); + path_put(&path); } return error; } @@ -299,21 +299,21 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat return 0; } -asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, struct compat_statfs64 __user *buf) +asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct nameidata nd; + struct path path; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.path.dentry, &tmp); + error = vfs_statfs(path.dentry, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); - path_put(&nd.path); + path_put(&path); } return error; } diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 9b99ebf2888..60249429a25 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -354,20 +354,20 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev) } /* - * find_inode - resolve a user-given path to a specific inode and return a nd + * find_inode - resolve a user-given path to a specific inode */ -static int find_inode(const char __user *dirname, struct nameidata *nd, +static int find_inode(const char __user *dirname, struct path *path, unsigned flags) { int error; - error = __user_walk(dirname, flags, nd); + error = user_path_at(AT_FDCWD, dirname, flags, path); if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(nd->path.dentry->d_inode, MAY_READ); + error = inode_permission(path->dentry->d_inode, MAY_READ); if (error) - path_put(&nd->path); + path_put(path); return error; } @@ -650,11 +650,11 @@ asmlinkage long sys_inotify_init(void) return sys_inotify_init1(0); } -asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) +asmlinkage long sys_inotify_add_watch(int 
fd, const char __user *pathname, u32 mask) { struct inode *inode; struct inotify_device *dev; - struct nameidata nd; + struct path path; struct file *filp; int ret, fput_needed; unsigned flags = 0; @@ -674,12 +674,12 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = find_inode(path, &nd, flags); + ret = find_inode(pathname, &path, flags); if (unlikely(ret)) goto fput_and_out; - /* inode held in place by reference to nd; dev by fget on fd */ - inode = nd.path.dentry->d_inode; + /* inode held in place by reference to path; dev by fget on fd */ + inode = path.dentry->d_inode; dev = filp->private_data; mutex_lock(&dev->up_mutex); @@ -688,7 +688,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) ret = create_watch(dev, inode, mask); mutex_unlock(&dev->up_mutex); - path_put(&nd.path); + path_put(&path); fput_and_out: fput_light(filp, fput_needed); return ret; diff --git a/fs/namei.c b/fs/namei.c index 5029b93ebbd..edb5e973f9b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1334,24 +1334,24 @@ struct dentry *lookup_one_noperm(const char *name, struct dentry *base) return __lookup_hash(&this, base, NULL); } -int __user_walk_fd(int dfd, const char __user *name, unsigned flags, - struct nameidata *nd) +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) { + struct nameidata nd; char *tmp = getname(name); int err = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - err = do_path_lookup(dfd, tmp, flags, nd); + + BUG_ON(flags & LOOKUP_PARENT); + + err = do_path_lookup(dfd, tmp, flags, &nd); putname(tmp); + if (!err) + *path = nd.path; } return err; } -int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) -{ - return __user_walk_fd(AT_FDCWD, name, flags, nd); -} - /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. @@ -2446,7 +2446,8 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, int flags) { struct dentry *new_dentry; - struct nameidata nd, old_nd; + struct nameidata nd; + struct path old_path; int error; char * to; @@ -2457,16 +2458,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, if (IS_ERR(to)) return PTR_ERR(to); - error = __user_walk_fd(olddfd, oldname, - flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, - &old_nd); + error = user_path_at(olddfd, oldname, + flags & AT_SYMLINK_FOLLOW ? 
LOOKUP_FOLLOW : 0, + &old_path); if (error) goto exit; error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); if (error) goto out; error = -EXDEV; - if (old_nd.path.mnt != nd.path.mnt) + if (old_path.mnt != nd.path.mnt) goto out_release; new_dentry = lookup_create(&nd, 0); error = PTR_ERR(new_dentry); @@ -2475,7 +2476,7 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; - error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); mnt_drop_write(nd.path.mnt); out_dput: dput(new_dentry); @@ -2484,7 +2485,7 @@ out_unlock: out_release: path_put(&nd.path); out: - path_put(&old_nd.path); + path_put(&old_path); exit: putname(to); @@ -2877,8 +2878,7 @@ const struct inode_operations page_symlink_inode_operations = { .put_link = page_put_link, }; -EXPORT_SYMBOL(__user_walk); -EXPORT_SYMBOL(__user_walk_fd); +EXPORT_SYMBOL(user_path_at); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ diff --git a/fs/namespace.c b/fs/namespace.c index 26380f59953..411728c0c8b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1130,27 +1130,27 @@ static int do_umount(struct vfsmount *mnt, int flags) asmlinkage long sys_umount(char __user * name, int flags) { - struct nameidata nd; + struct path path; int retval; - retval = __user_walk(name, LOOKUP_FOLLOW, &nd); + retval = user_path(name, &path); if (retval) goto out; retval = -EINVAL; - if (nd.path.dentry != nd.path.mnt->mnt_root) + if (path.dentry != path.mnt->mnt_root) goto dput_and_out; - if (!check_mnt(nd.path.mnt)) + if (!check_mnt(path.mnt)) goto dput_and_out; retval = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto dput_and_out; - retval = do_umount(nd.path.mnt, flags); + retval = do_umount(path.mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ - dput(nd.path.dentry); - mntput_no_expire(nd.path.mnt); + dput(path.dentry); + mntput_no_expire(path.mnt); out: return retval; } @@ -2179,28 +2179,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root, const char __user * put_old) { struct vfsmount *tmp; - struct nameidata new_nd, old_nd; - struct path parent_path, root_parent, root; + struct path new, old, parent_path, root_parent, root; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, - &new_nd); + error = user_path_dir(new_root, &new); if (error) goto out0; error = -EINVAL; - if (!check_mnt(new_nd.path.mnt)) + if (!check_mnt(new.mnt)) goto out1; - error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd); + error = user_path_dir(put_old, &old); if (error) goto out1; - error = security_sb_pivotroot(&old_nd.path, &new_nd.path); + error = security_sb_pivotroot(&old, &new); if (error) { - path_put(&old_nd.path); + path_put(&old); goto out1; } @@ -2209,69 +2207,69 @@ asmlinkage long sys_pivot_root(const char __user * new_root, path_get(¤t->fs->root); read_unlock(¤t->fs->lock); down_write(&namespace_sem); - mutex_lock(&old_nd.path.dentry->d_inode->i_mutex); + mutex_lock(&old.dentry->d_inode->i_mutex); error = -EINVAL; - if (IS_MNT_SHARED(old_nd.path.mnt) || - IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) || + if (IS_MNT_SHARED(old.mnt) || + IS_MNT_SHARED(new.mnt->mnt_parent) || IS_MNT_SHARED(root.mnt->mnt_parent)) goto out2; if (!check_mnt(root.mnt)) goto out2; error = -ENOENT; - if 
(IS_DEADDIR(new_nd.path.dentry->d_inode)) + if (IS_DEADDIR(new.dentry->d_inode)) goto out2; - if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry)) + if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) goto out2; - if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry)) + if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) goto out2; error = -EBUSY; - if (new_nd.path.mnt == root.mnt || - old_nd.path.mnt == root.mnt) + if (new.mnt == root.mnt || + old.mnt == root.mnt) goto out2; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out2; /* not a mountpoint */ if (root.mnt->mnt_parent == root.mnt) goto out2; /* not attached */ - if (new_nd.path.mnt->mnt_root != new_nd.path.dentry) + if (new.mnt->mnt_root != new.dentry) goto out2; /* not a mountpoint */ - if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt) + if (new.mnt->mnt_parent == new.mnt) goto out2; /* not attached */ /* make sure we can reach put_old from new_root */ - tmp = old_nd.path.mnt; + tmp = old.mnt; spin_lock(&vfsmount_lock); - if (tmp != new_nd.path.mnt) { + if (tmp != new.mnt) { for (;;) { if (tmp->mnt_parent == tmp) goto out3; /* already mounted on put_old */ - if (tmp->mnt_parent == new_nd.path.mnt) + if (tmp->mnt_parent == new.mnt) break; tmp = tmp->mnt_parent; } - if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry)) + if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) goto out3; - } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) + } else if (!is_subdir(old.dentry, new.dentry)) goto out3; - detach_mnt(new_nd.path.mnt, &parent_path); + detach_mnt(new.mnt, &parent_path); detach_mnt(root.mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root.mnt, &old_nd.path); + attach_mnt(root.mnt, &old); /* mount new_root on / */ - attach_mnt(new_nd.path.mnt, &root_parent); + attach_mnt(new.mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); - chroot_fs_refs(&root, &new_nd.path); - security_sb_post_pivotroot(&root, &new_nd.path); + chroot_fs_refs(&root, &new); + security_sb_post_pivotroot(&root, &new); error = 0; path_put(&root_parent); path_put(&parent_path); out2: - mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&old.dentry->d_inode->i_mutex); up_write(&namespace_sem); path_put(&root); - path_put(&old_nd.path); + path_put(&old); out1: - path_put(&new_nd.path); + path_put(&new); out0: return error; out3: diff --git a/fs/open.c b/fs/open.c index e94266700ed..3fe1a6857c7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -122,37 +122,37 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) return 0; } -asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) +asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (!error) { struct statfs tmp; - error = vfs_statfs_native(nd.path.dentry, &tmp); + error = vfs_statfs_native(path.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; - path_put(&nd.path); + path_put(&path); } return error; } -asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 __user *buf) +asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf) { - struct nameidata nd; + struct path path; long error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path_walk(path, 
&nd); + error = user_path(pathname, &path); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(nd.path.dentry, &tmp); + error = vfs_statfs64(path.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; - path_put(&nd.path); + path_put(&path); } return error; } @@ -223,20 +223,20 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, return err; } -static long do_sys_truncate(const char __user * path, loff_t length) +static long do_sys_truncate(const char __user *pathname, loff_t length) { - struct nameidata nd; - struct inode * inode; + struct path path; + struct inode *inode; int error; error = -EINVAL; if (length < 0) /* sorry, but loff_t says... */ goto out; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) goto out; - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ error = -EISDIR; @@ -247,7 +247,7 @@ static long do_sys_truncate(const char __user * path, loff_t length) if (!S_ISREG(inode->i_mode)) goto dput_and_out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto dput_and_out; @@ -274,15 +274,15 @@ static long do_sys_truncate(const char __user * path, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); - error = do_truncate(nd.path.dentry, length, 0, NULL); + error = do_truncate(path.dentry, length, 0, NULL); } put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -425,7 +425,7 @@ out: */ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { - struct nameidata nd; + struct path path; struct inode *inode; int old_fsuid, old_fsgid; kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ @@ -449,7 +449,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) * FIXME: There is a race here against sys_capset. The * capabilities can change yet we will restore the old * value below. We should hold task_capabilities_lock, - * but we cannot because user_path_walk can sleep. + * but we cannot because user_path_at can sleep. */ #endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ if (current->uid) @@ -458,11 +458,11 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) old_cap = cap_set_effective(current->cap_permitted); } - res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); + res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); if (res) goto out; - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* @@ -470,7 +470,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) * with the "noexec" flag. */ res = -EACCES; - if (nd.path.mnt->mnt_flags & MNT_NOEXEC) + if (path.mnt->mnt_flags & MNT_NOEXEC) goto out_path_release; } @@ -488,11 +488,11 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) * inherently racy and know that the fs may change * state before we even see this result. 
*/ - if (__mnt_is_readonly(nd.path.mnt)) + if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: - path_put(&nd.path); + path_put(&path); out: current->fsuid = old_fsuid; current->fsgid = old_fsgid; @@ -510,21 +510,21 @@ asmlinkage long sys_access(const char __user *filename, int mode) asmlinkage long sys_chdir(const char __user * filename) { - struct nameidata nd; + struct path path; int error; - error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + error = user_path_dir(filename, &path); if (error) goto out; - error = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; - set_fs_pwd(current->fs, &nd.path); + set_fs_pwd(current->fs, &path); dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -557,14 +557,14 @@ out: asmlinkage long sys_chroot(const char __user * filename) { - struct nameidata nd; + struct path path; int error; - error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd); + error = user_path_dir(filename, &path); if (error) goto out; - error = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); if (error) goto dput_and_out; @@ -572,10 +572,10 @@ asmlinkage long sys_chroot(const char __user * filename) if (!capable(CAP_SYS_CHROOT)) goto dput_and_out; - set_fs_root(current->fs, &nd.path); + set_fs_root(current->fs, &path); error = 0; dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -617,17 +617,17 @@ out: asmlinkage long sys_fchmodat(int dfd, const char __user *filename, mode_t mode) { - struct nameidata nd; - struct inode * inode; + struct path path; + struct inode *inode; int error; struct iattr newattrs; - error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); + error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); if (error) goto out; - inode = nd.path.dentry->d_inode; + inode = path.dentry->d_inode; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto dput_and_out; mutex_lock(&inode->i_mutex); @@ -635,11 +635,11 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename, mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(nd.path.dentry, &newattrs); + error = notify_change(path.dentry, &newattrs); mutex_unlock(&inode->i_mutex); - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); dput_and_out: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -676,19 +676,19 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(filename, &nd); + error = user_path(filename, &path); if (error) goto out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(nd.path.dentry, user, group); - mnt_drop_write(nd.path.mnt); + error = chown_common(path.dentry, user, group); + mnt_drop_write(path.mnt); out_release: - path_put(&nd.path); + path_put(&path); out: return error; } @@ -696,7 +696,7 @@ out: asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { - struct nameidata nd; + struct path path; int error = -EINVAL; int 
follow; @@ -704,35 +704,35 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, goto out; follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = __user_walk_fd(dfd, filename, follow, &nd); + error = user_path_at(dfd, filename, follow, &path); if (error) goto out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(nd.path.dentry, user, group); - mnt_drop_write(nd.path.mnt); + error = chown_common(path.dentry, user, group); + mnt_drop_write(path.mnt); out_release: - path_put(&nd.path); + path_put(&path); out: return error; } asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk_link(filename, &nd); + error = user_lpath(filename, &path); if (error) goto out; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_release; - error = chown_common(nd.path.dentry, user, group); - mnt_drop_write(nd.path.mnt); + error = chown_common(path.dentry, user, group); + mnt_drop_write(path.mnt); out_release: - path_put(&nd.path); + path_put(&path); out: return error; } diff --git a/fs/stat.c b/fs/stat.c index 9cf41f719d5..7c46fbeb8b7 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr); int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) { - struct nameidata nd; + struct path path; int error; - error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd); + error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); if (!error) { - error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat); - path_put(&nd.path); + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); } return error; } @@ -77,13 +77,13 @@ EXPORT_SYMBOL(vfs_stat); int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) { - struct nameidata nd; + struct path path; int error; - error = __user_walk_fd(dfd, name, 0, &nd); + error = user_path_at(dfd, name, 0, &path); if (!error) { - error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat); - path_put(&nd.path); + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); } return error; } @@ -291,29 +291,29 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) return error; } -asmlinkage long sys_readlinkat(int dfd, const char __user *path, +asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { - struct nameidata nd; + struct path path; int error; if (bufsiz <= 0) return -EINVAL; - error = __user_walk_fd(dfd, path, 0, &nd); + error = user_path_at(dfd, pathname, 0, &path); if (!error) { - struct inode *inode = nd.path.dentry->d_inode; + struct inode *inode = path.dentry->d_inode; error = -EINVAL; if (inode->i_op && inode->i_op->readlink) { - error = security_inode_readlink(nd.path.dentry); + error = security_inode_readlink(path.dentry); if (!error) { - touch_atime(nd.path.mnt, nd.path.dentry); - error = inode->i_op->readlink(nd.path.dentry, + touch_atime(path.mnt, path.dentry); + error = inode->i_op->readlink(path.dentry, buf, bufsiz); } } - path_put(&nd.path); + path_put(&path); } return error; } diff --git a/fs/utimes.c b/fs/utimes.c index dc28b782625..6929e3e91d0 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -152,18 +152,18 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags error = utimes_common(&file->f_path, times); fput(file); } else { - struct nameidata nd; + struct 
path path; int lookup_flags = 0; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - error = __user_walk_fd(dfd, filename, lookup_flags, &nd); + error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; - error = utimes_common(&nd.path, times); - path_put(&nd.path); + error = utimes_common(&path, times); + path_put(&path); } out: diff --git a/fs/xattr.c b/fs/xattr.c index b96222e05ba..468377e6653 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -252,40 +252,40 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, } asmlinkage long -sys_setxattr(const char __user *path, const char __user *name, +sys_setxattr(const char __user *pathname, const char __user *name, const void __user *value, size_t size, int flags) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(nd.path.dentry, name, value, size, flags); - mnt_drop_write(nd.path.mnt); + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } asmlinkage long -sys_lsetxattr(const char __user *path, const char __user *name, +sys_lsetxattr(const char __user *pathname, const char __user *name, const void __user *value, size_t size, int flags) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(nd.path.dentry, name, value, size, flags); - mnt_drop_write(nd.path.mnt); + error = setxattr(path.dentry, name, value, size, flags); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } @@ -350,32 +350,32 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, } asmlinkage ssize_t -sys_getxattr(const char __user *path, const char __user *name, +sys_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = getxattr(nd.path.dentry, name, value, size); - path_put(&nd.path); + error = getxattr(path.dentry, name, value, size); + path_put(&path); return error; } asmlinkage ssize_t -sys_lgetxattr(const char __user *path, const char __user *name, void __user *value, +sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = getxattr(nd.path.dentry, name, value, size); - path_put(&nd.path); + error = getxattr(path.dentry, name, value, size); + path_put(&path); return error; } @@ -425,30 +425,30 @@ listxattr(struct dentry *d, char __user *list, size_t size) } asmlinkage ssize_t -sys_listxattr(const char __user *path, char __user *list, size_t size) +sys_listxattr(const char __user *pathname, char __user *list, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = listxattr(nd.path.dentry, list, size); - 
path_put(&nd.path); + error = listxattr(path.dentry, list, size); + path_put(&path); return error; } asmlinkage ssize_t -sys_llistxattr(const char __user *path, char __user *list, size_t size) +sys_llistxattr(const char __user *pathname, char __user *list, size_t size) { - struct nameidata nd; + struct path path; ssize_t error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = listxattr(nd.path.dentry, list, size); - path_put(&nd.path); + error = listxattr(path.dentry, list, size); + path_put(&path); return error; } @@ -486,38 +486,38 @@ removexattr(struct dentry *d, const char __user *name) } asmlinkage long -sys_removexattr(const char __user *path, const char __user *name) +sys_removexattr(const char __user *pathname, const char __user *name) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk(path, &nd); + error = user_path(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(nd.path.dentry, name); - mnt_drop_write(nd.path.mnt); + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } asmlinkage long -sys_lremovexattr(const char __user *path, const char __user *name) +sys_lremovexattr(const char __user *pathname, const char __user *name) { - struct nameidata nd; + struct path path; int error; - error = user_path_walk_link(path, &nd); + error = user_lpath(pathname, &path); if (error) return error; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(nd.path.dentry, name); - mnt_drop_write(nd.path.mnt); + error = removexattr(path.dentry, name); + mnt_drop_write(path.mnt); } - path_put(&nd.path); + path_put(&path); return error; } diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index a42ba9d7115..01939ba2d8d 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -84,17 +84,15 @@ xfs_find_handle( switch (cmd) { case XFS_IOC_PATH_TO_FSHANDLE: case XFS_IOC_PATH_TO_HANDLE: { - struct nameidata nd; - int error; - - error = user_path_walk_link((const char __user *)hreq.path, &nd); + struct path path; + int error = user_lpath((const char __user *)hreq.path, &path); if (error) return error; - ASSERT(nd.path.dentry); - ASSERT(nd.path.dentry->d_inode); - inode = igrab(nd.path.dentry->d_inode); - path_put(&nd.path); + ASSERT(path.dentry); + ASSERT(path.dentry->d_inode); + inode = igrab(path.dentry->d_inode); + path_put(&path); break; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 60e35a02f6c..00888ff6950 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -54,12 +54,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_OPEN (0x0100) #define LOOKUP_CREATE (0x0200) -extern int __user_walk(const char __user *, unsigned, struct nameidata *); -extern int __user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *); -#define user_path_walk(name,nd) \ - __user_walk_fd(AT_FDCWD, name, LOOKUP_FOLLOW, nd) -#define user_path_walk_link(name,nd) \ - __user_walk_fd(AT_FDCWD, name, 0, nd) +extern int user_path_at(int, const char __user *, unsigned, struct path *); + +#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path) +#define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path) +#define user_path_dir(name, path) \ + 
user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path) + extern int path_lookup(const char *, unsigned, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct nameidata *); -- cgit v1.2.3-70-g09d2 From 2ad94ae654f5eb72fd3260b706aea645cf4a7791 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Jul 2008 09:32:51 -0400 Subject: [PATCH] new (local) helper: user_path_parent() Preparation to untangling intents mess: reduce the number of do_path_lookup() callers. Signed-off-by: Al Viro --- fs/namei.c | 137 +++++++++++++++++++++++++------------------------------------ 1 file changed, 57 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index edb5e973f9b..38ceb6e06eb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1352,6 +1352,24 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, return err; } +static int user_path_parent(int dfd, const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. @@ -1989,20 +2007,18 @@ static int may_mknod(mode_t mode) asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, unsigned dev) { - int error = 0; - char * tmp; - struct dentry * dentry; + int error; + char *tmp; + struct dentry *dentry; struct nameidata nd; if (S_ISDIR(mode)) return -EPERM; - tmp = getname(filename); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); + error = user_path_parent(dfd, filename, &nd, &tmp); if (error) - goto out; + return error; + dentry = lookup_create(&nd, 0); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -2034,7 +2050,6 @@ out_dput: out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); -out: putname(tmp); return error; @@ -2074,14 +2089,10 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) struct dentry *dentry; struct nameidata nd; - tmp = getname(pathname); - error = PTR_ERR(tmp); - if (IS_ERR(tmp)) + error = user_path_parent(dfd, pathname, &nd, &tmp); + if (error) goto out_err; - error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); - if (error) - goto out; dentry = lookup_create(&nd, 1); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -2099,7 +2110,6 @@ out_dput: out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); -out: putname(tmp); out_err: return error; @@ -2177,13 +2187,9 @@ static long do_rmdir(int dfd, const char __user *pathname) struct dentry *dentry; struct nameidata nd; - name = getname(pathname); - if(IS_ERR(name)) - return PTR_ERR(name); - - error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd); + error = user_path_parent(dfd, pathname, &nd, &name); if (error) - goto exit; + return error; switch(nd.last_type) { case LAST_DOTDOT: @@ -2212,7 +2218,6 @@ exit2: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); exit1: path_put(&nd.path); -exit: putname(name); return error; } @@ -2261,19 +2266,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) */ static long do_unlinkat(int dfd, const char __user *pathname) { - int error = 0; - char * name; + int error; + char *name; struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; - name = getname(pathname); - if(IS_ERR(name)) - 
return PTR_ERR(name); - - error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd); + error = user_path_parent(dfd, pathname, &nd, &name); if (error) - goto exit; + return error; + error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; @@ -2300,7 +2302,6 @@ static long do_unlinkat(int dfd, const char __user *pathname) iput(inode); /* truncate the inode here */ exit1: path_put(&nd.path); -exit: putname(name); return error; @@ -2350,23 +2351,20 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) asmlinkage long sys_symlinkat(const char __user *oldname, int newdfd, const char __user *newname) { - int error = 0; - char * from; - char * to; + int error; + char *from; + char *to; struct dentry *dentry; struct nameidata nd; from = getname(oldname); - if(IS_ERR(from)) + if (IS_ERR(from)) return PTR_ERR(from); - to = getname(newname); - error = PTR_ERR(to); - if (IS_ERR(to)) - goto out_putname; - error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); + error = user_path_parent(newdfd, newname, &nd, &to); if (error) - goto out; + goto out_putname; + dentry = lookup_create(&nd, 0); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -2382,7 +2380,6 @@ out_dput: out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); -out: putname(to); out_putname: putname(from); @@ -2449,21 +2446,18 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, struct nameidata nd; struct path old_path; int error; - char * to; + char *to; if ((flags & ~AT_SYMLINK_FOLLOW) != 0) return -EINVAL; - to = getname(newname); - if (IS_ERR(to)) - return PTR_ERR(to); - error = user_path_at(olddfd, oldname, flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, &old_path); if (error) - goto exit; - error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); + return error; + + error = user_path_parent(newdfd, newname, &nd, &to); if (error) goto out; error = -EXDEV; @@ -2484,10 +2478,9 @@ out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); out_release: path_put(&nd.path); + putname(to); out: path_put(&old_path); -exit: - putname(to); return error; } @@ -2643,20 +2636,22 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, return error; } -static int do_rename(int olddfd, const char *oldname, - int newdfd, const char *newname) +asmlinkage long sys_renameat(int olddfd, const char __user *oldname, + int newdfd, const char __user *newname) { - int error = 0; - struct dentry * old_dir, * new_dir; - struct dentry * old_dentry, *new_dentry; - struct dentry * trap; + struct dentry *old_dir, *new_dir; + struct dentry *old_dentry, *new_dentry; + struct dentry *trap; struct nameidata oldnd, newnd; + char *from; + char *to; + int error; - error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd); + error = user_path_parent(olddfd, oldname, &oldnd, &from); if (error) goto exit; - error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd); + error = user_path_parent(newdfd, newname, &newnd, &to); if (error) goto exit1; @@ -2718,29 +2713,11 @@ exit3: unlock_rename(new_dir, old_dir); exit2: path_put(&newnd.path); + putname(to); exit1: path_put(&oldnd.path); -exit: - return error; -} - -asmlinkage long sys_renameat(int olddfd, const char __user *oldname, - int newdfd, const char __user *newname) -{ - int error; - char * from; - char * to; - - from = getname(oldname); - if(IS_ERR(from)) - return PTR_ERR(from); - to = getname(newname); - error = PTR_ERR(to); - if (!IS_ERR(to)) { - error = do_rename(olddfd, from, newdfd, to); - putname(to); - } putname(from); +exit: 
return error; } -- cgit v1.2.3-70-g09d2 From a569c711f63995ad80c23918525111e0cdb0bc73 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Jul 2008 14:42:05 -0400 Subject: [PATCH] don't pass nameidata to gfs2_lookupi() Signed-off-by: Al Viro --- fs/gfs2/inode.c | 6 +++--- fs/gfs2/inode.h | 2 +- fs/gfs2/ops_export.c | 2 +- fs/gfs2/ops_inode.c | 4 ++-- fs/gfs2/super.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6da0ab355b8..8b0806a3294 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -448,7 +448,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) struct qstr qstr; struct inode *inode; gfs2_str2qstr(&qstr, name); - inode = gfs2_lookupi(dip, &qstr, 1, NULL); + inode = gfs2_lookupi(dip, &qstr, 1); /* gfs2_lookupi has inconsistent callers: vfs * related routines expect NULL for no entry found, * gfs2_lookup_simple callers expect ENOENT @@ -477,7 +477,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) */ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root, struct nameidata *nd) + int is_root) { struct super_block *sb = dir->i_sb; struct gfs2_inode *dip = GFS2_I(dir); @@ -1173,7 +1173,7 @@ int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) break; } - tmp = gfs2_lookupi(dir, &dotdot, 1, NULL); + tmp = gfs2_lookupi(dir, &dotdot, 1); if (IS_ERR(tmp)) { error = PTR_ERR(tmp); break; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6074c2506f7..58f9607d6a8 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -83,7 +83,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip); int gfs2_dinode_dealloc(struct gfs2_inode *inode); int gfs2_change_nlink(struct gfs2_inode *ip, int diff); struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root, struct nameidata *nd); + int is_root); struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 990d9f4bc46..9cda8536530 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -134,7 +134,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child) struct dentry *dentry; gfs2_str2qstr(&dotdot, ".."); - inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL); + inode = gfs2_lookupi(child->d_inode, &dotdot, 1); if (!inode) return ERR_PTR(-ENOENT); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 4e982532f08..e2c62f73a77 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -74,7 +74,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, return PTR_ERR(inode); } - inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode) { if (!IS_ERR(inode)) { gfs2_holder_uninit(ghs); @@ -109,7 +109,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &gfs2_dops; - inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + inode = gfs2_lookupi(dir, &dentry->d_name, 0); if (inode && IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 63a8a902d9d..ca831991cbc 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -389,7 +389,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); - jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); + jd->jd_inode = 
gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) error = -ENOENT; -- cgit v1.2.3-70-g09d2 From 58ec42b061bf5dad8fa0370a19966cfd96eaf80c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Jul 2008 14:45:55 -0400 Subject: [PATCH] don't pass nameidata to __ncp_lookup_validate() Signed-off-by: Al Viro --- fs/ncpfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 011ef0b6d2d..07e9715b865 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -266,7 +266,7 @@ leave_me:; static int -__ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) +__ncp_lookup_validate(struct dentry *dentry) { struct ncp_server *server; struct dentry *parent; @@ -340,7 +340,7 @@ ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) { int res; lock_kernel(); - res = __ncp_lookup_validate(dentry, nd); + res = __ncp_lookup_validate(dentry); unlock_kernel(); return res; } -- cgit v1.2.3-70-g09d2 From 3c333937ee3be114b181c4861188cfe8f6a59697 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Fri, 25 Jul 2008 22:32:13 -0400 Subject: [PATCH] dup3 fix Al Viro notice one cornercase that the new dup3() code. The dup2() function, as a special case, handles dup-ing to the same file descriptor. In this case the current dup3() code does nothing at all. I.e., it ingnores the flags parameter. This shouldn't happen, the close-on-exec flag should be set if requested. In case the O_CLOEXEC bit in the flags parameter is not set the dup3() function should behave in this respect identical to dup2(). This means dup3(fd, fd, 0) should not actively reset the c-o-e flag. The patch below implements this minor change. [AV: credits to Artur Grabowski for bringing that up as potential subtle point in dup2() behaviour] Signed-off-by: Ulrich Drepper Signed-off-by: Al Viro --- fs/fcntl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 9679fcbdeaa..ce12a688511 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -139,8 +139,13 @@ asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) if (!(file = fcheck(oldfd))) goto out_unlock; err = newfd; - if (newfd == oldfd) + if (unlikely(newfd == oldfd)) { + if (flags & O_CLOEXEC) { + fdt = files_fdtable(files); + FD_SET(newfd, fdt->close_on_exec); + } goto out_unlock; + } err = -EBADF; if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out_unlock; -- cgit v1.2.3-70-g09d2 From 516e0cc5646f377ab80fcc2ee639892eccb99853 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 00:39:17 -0400 Subject: [PATCH] f_count may wrap around make it atomic_long_t; while we are at it, get rid of useless checks in affs, hfs and hpfs - ->open() always has it equal to 1, ->release() - to 0. 
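As a plain user-space illustration of the wraparound concern (not kernel code, and not part of this patch): atomic_t is 32 bits wide even on 64-bit machines, so a reference count held in it can be driven past UINT_MAX and back to zero, while atomic_long_t gives 64 bits of headroom on LP64. A minimal sketch of the arithmetic:

/*
 * Illustration only: the overflow that motivates widening f_count.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t count32 = UINT32_MAX;	/* a 32-bit refcount at its ceiling */
	uint64_t count64 = UINT32_MAX;	/* the same value in 64-bit storage */

	count32++;			/* one more get_file() */
	count64++;

	printf("32-bit count after one more get: %" PRIu32 "\n", count32);
	printf("64-bit count after one more get: %" PRIu64 "\n", count64);
	return 0;	/* prints 0 and 4294967296 respectively */
}

The rest of the change is mechanical: callers switch to the atomic_long_* helpers and the debug printouts move from %d to %ld, as the hunks below show.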
Signed-off-by: Al Viro --- drivers/net/ppp_generic.c | 6 +++--- fs/affs/file.c | 4 ---- fs/aio.c | 6 +++--- fs/file_table.c | 10 +++++----- fs/hfs/inode.c | 4 ---- fs/hfsplus/inode.c | 4 ---- include/linux/fs.h | 6 +++--- include/net/af_unix.h | 2 +- net/sched/sch_atm.c | 4 ++-- net/unix/af_unix.c | 2 +- net/unix/garbage.c | 18 +++++++++--------- 11 files changed, 27 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 739b3ab7bcc..ddccc074a76 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -581,12 +581,12 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (file == ppp->owner) ppp_shutdown_interface(ppp); } - if (atomic_read(&file->f_count) <= 2) { + if (atomic_long_read(&file->f_count) <= 2) { ppp_release(NULL, file); err = 0; } else - printk(KERN_DEBUG "PPPIOCDETACH file->f_count=%d\n", - atomic_read(&file->f_count)); + printk(KERN_DEBUG "PPPIOCDETACH file->f_count=%ld\n", + atomic_long_read(&file->f_count)); unlock_kernel(); return err; } diff --git a/fs/affs/file.c b/fs/affs/file.c index 6eac7bdeec9..1377b1240b6 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -46,8 +46,6 @@ const struct inode_operations affs_file_inode_operations = { static int affs_file_open(struct inode *inode, struct file *filp) { - if (atomic_read(&filp->f_count) != 1) - return 0; pr_debug("AFFS: open(%lu,%d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); atomic_inc(&AFFS_I(inode)->i_opencnt); @@ -57,8 +55,6 @@ affs_file_open(struct inode *inode, struct file *filp) static int affs_file_release(struct inode *inode, struct file *filp) { - if (atomic_read(&filp->f_count) != 0) - return 0; pr_debug("AFFS: release(%lu, %d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); diff --git a/fs/aio.c b/fs/aio.c index 0051fd94b44..f658441d566 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -512,8 +512,8 @@ static void aio_fput_routine(struct work_struct *data) */ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { - dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", - req, atomic_read(&req->ki_filp->f_count)); + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", + req, atomic_long_read(&req->ki_filp->f_count)); assert_spin_locked(&ctx->ctx_lock); @@ -528,7 +528,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) /* Must be done under the lock to serialise against cancellation. * Call this aio_fput as it duplicates fput via the fput_work. 
*/ - if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); diff --git a/fs/file_table.c b/fs/file_table.c index 83084225b4c..f45a4493f9e 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -120,7 +120,7 @@ struct file *get_empty_filp(void) tsk = current; INIT_LIST_HEAD(&f->f_u.fu_list); - atomic_set(&f->f_count, 1); + atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); f->f_uid = tsk->fsuid; f->f_gid = tsk->fsgid; @@ -219,7 +219,7 @@ EXPORT_SYMBOL(init_file); void fput(struct file *file) { - if (atomic_dec_and_test(&file->f_count)) + if (atomic_long_dec_and_test(&file->f_count)) __fput(file); } @@ -294,7 +294,7 @@ struct file *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!atomic_inc_not_zero(&file->f_count)) { + if (!atomic_long_inc_not_zero(&file->f_count)) { /* File object ref couldn't be taken */ rcu_read_unlock(); return NULL; @@ -326,7 +326,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (atomic_inc_not_zero(&file->f_count)) + if (atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -341,7 +341,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) void put_filp(struct file *file) { - if (atomic_dec_and_test(&file->f_count)) { + if (atomic_long_dec_and_test(&file->f_count)) { security_file_free(file); file_kill(file); file_free(file); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index aa73f3fd5dd..7e19835efa2 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -522,8 +522,6 @@ static int hfs_file_open(struct inode *inode, struct file *file) { if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; - if (atomic_read(&file->f_count) != 1) - return 0; atomic_inc(&HFS_I(inode)->opencnt); return 0; } @@ -534,8 +532,6 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; - if (atomic_read(&file->f_count) != 0) - return 0; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { mutex_lock(&inode->i_mutex); hfs_file_truncate(inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index d4014e3044d..b085d64a2b6 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -254,8 +254,6 @@ static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; - if (atomic_read(&file->f_count) != 1) - return 0; atomic_inc(&HFSPLUS_I(inode).opencnt); return 0; } @@ -266,8 +264,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; - if (atomic_read(&file->f_count) != 0) - return 0; if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { mutex_lock(&inode->i_mutex); hfsplus_file_truncate(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9d2de4cadab..7676fa1c20a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -795,7 +795,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - atomic_t f_count; + atomic_long_t f_count; unsigned int f_flags; mode_t f_mode; loff_t f_pos; @@ -824,8 +824,8 @@ extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); -#define 
get_file(x) atomic_inc(&(x)->f_count) -#define file_count(x) atomic_read(&(x)->f_count) +#define get_file(x) atomic_long_inc(&(x)->f_count) +#define file_count(x) atomic_long_read(&(x)->f_count) #ifdef CONFIG_DEBUG_WRITECOUNT static inline void file_take_write(struct file *f) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 2dfa96b0575..7dd29b7e461 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -51,7 +51,7 @@ struct unix_sock { struct sock *peer; struct sock *other; struct list_head link; - atomic_t inflight; + atomic_long_t inflight; spinlock_t lock; unsigned int gc_candidate : 1; wait_queue_head_t peer_wait; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 04faa835be1..6b517b9dac5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -162,7 +162,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl) qdisc_destroy(flow->q); tcf_destroy_chain(&flow->filter_list); if (flow->sock) { - pr_debug("atm_tc_put: f_count %d\n", + pr_debug("atm_tc_put: f_count %ld\n", file_count(flow->sock->file)); flow->vcc->pop = flow->old_pop; sockfd_put(flow->sock); @@ -259,7 +259,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, sock = sockfd_lookup(fd, &error); if (!sock) return error; /* f_count++ */ - pr_debug("atm_tc_change: f_count %d\n", file_count(sock->file)); + pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { error = -EPROTOTYPE; goto err_out; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 70ceb1604ad..6e7fec74bdb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -603,7 +603,7 @@ static struct sock * unix_create1(struct net *net, struct socket *sock) u->dentry = NULL; u->mnt = NULL; spin_lock_init(&u->lock); - atomic_set(&u->inflight, 0); + atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index ebdff3d877a..2a27b84f740 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -127,7 +127,7 @@ void unix_inflight(struct file *fp) if(s) { struct unix_sock *u = unix_sk(s); spin_lock(&unix_gc_lock); - if (atomic_inc_return(&u->inflight) == 1) { + if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { @@ -145,7 +145,7 @@ void unix_notinflight(struct file *fp) struct unix_sock *u = unix_sk(s); spin_lock(&unix_gc_lock); BUG_ON(list_empty(&u->link)); - if (atomic_dec_and_test(&u->inflight)) + if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); unix_tot_inflight--; spin_unlock(&unix_gc_lock); @@ -237,17 +237,17 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), static void dec_inflight(struct unix_sock *usk) { - atomic_dec(&usk->inflight); + atomic_long_dec(&usk->inflight); } static void inc_inflight(struct unix_sock *usk) { - atomic_inc(&usk->inflight); + atomic_long_inc(&usk->inflight); } static void inc_inflight_move_tail(struct unix_sock *u) { - atomic_inc(&u->inflight); + atomic_long_inc(&u->inflight); /* * If this is still a candidate, move it to the end of the * list, so that it's checked even if it was already passed @@ -288,11 +288,11 @@ void unix_gc(void) * before the detach without atomicity guarantees. 
*/ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - int total_refs; - int inflight_refs; + long total_refs; + long inflight_refs; total_refs = file_count(u->sk.sk_socket->file); - inflight_refs = atomic_read(&u->inflight); + inflight_refs = atomic_long_read(&u->inflight); BUG_ON(inflight_refs < 1); BUG_ON(total_refs < inflight_refs); @@ -324,7 +324,7 @@ void unix_gc(void) /* Move cursor to after the current position. */ list_move(&cursor, &u->link); - if (atomic_read(&u->inflight) > 0) { + if (atomic_long_read(&u->inflight) > 0) { list_move_tail(&u->link, &gc_inflight_list); u->gc_candidate = 0; scan_children(&u->sk, inc_inflight_move_tail, NULL); -- cgit v1.2.3-70-g09d2 From 964bd183624c03680796b63b4ab97ee3905a806a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 03:33:14 -0400 Subject: [PATCH] get rid of __user_path_lookup_open Signed-off-by: Al Viro --- fs/exec.c | 14 ++++++++++---- fs/namei.c | 13 ------------- include/linux/namei.h | 1 - 3 files changed, 10 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index eca58c29ede..9696bbf0f0b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -106,11 +106,17 @@ static inline void put_binfmt(struct linux_binfmt * fmt) */ asmlinkage long sys_uselib(const char __user * library) { - struct file * file; + struct file *file; struct nameidata nd; - int error; - - error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); + char *tmp = getname(library); + int error = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { + error = path_lookup_open(AT_FDCWD, tmp, + LOOKUP_FOLLOW, &nd, + FMODE_READ|FMODE_EXEC); + putname(tmp); + } if (error) goto out; diff --git a/fs/namei.c b/fs/namei.c index 38ceb6e06eb..a7b0a0b8012 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1193,19 +1193,6 @@ static int path_lookup_create(int dfd, const char *name, nd, open_flags, create_mode); } -int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags, - struct nameidata *nd, int open_flags) -{ - char *tmp = getname(name); - int err = PTR_ERR(tmp); - - if (!IS_ERR(tmp)) { - err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0); - putname(tmp); - } - return err; -} - static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { diff --git a/include/linux/namei.h b/include/linux/namei.h index 00888ff6950..68f8c3203c8 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -65,7 +65,6 @@ extern int path_lookup(const char *, unsigned, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct nameidata *); -extern int __user_path_lookup_open(const char __user *, unsigned lookup_flags, struct nameidata *nd, int open_flags); extern int path_lookup_open(int dfd, const char *name, unsigned lookup_flags, struct nameidata *, int open_flags); extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, int (*open)(struct inode *, struct file *)); -- cgit v1.2.3-70-g09d2 From 3f8206d496e9e9495afb1d4e70d29712b4d403c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 03:46:43 -0400 Subject: [PATCH] get rid of indirect users of namei.h fs.h needs path.h, not namei.h; nfs_fs.h doesn't need it at all. Several places in the tree needed direct include. 
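A sketch of the before/after include pattern this describes (illustrative fragment, not taken from any of the files touched; example_resolve() is hypothetical, and path_lookup()'s signature is the one declared in namei.h earlier in this series):

/*
 * Hypothetical user of the lookup API.  Before this patch it compiled
 * because <linux/fs.h> dragged <linux/namei.h> in indirectly; afterwards
 * fs.h only pulls in <linux/path.h>, so the include must be explicit.
 */
#include <linux/fs.h>
#include <linux/namei.h>	/* now required for nameidata, LOOKUP_*, path_lookup() */

static int example_resolve(const char *name, struct nameidata *nd)
{
	return path_lookup(name, LOOKUP_FOLLOW, nd);
}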
Signed-off-by: Al Viro --- fs/nfsd/nfsctl.c | 1 + fs/ubifs/file.c | 1 + include/linux/fs.h | 2 +- include/linux/nfs_fs.h | 1 - kernel/cgroup.c | 1 + 5 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1955a2702e6..c53e65f8f3a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 005a3b854d9..8565e586e53 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -53,6 +53,7 @@ #include "ubifs.h" #include +#include static int read_block(struct inode *inode, void *addr, unsigned int block, struct ubifs_data_node *dn) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7676fa1c20a..8252b045e62 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -279,7 +279,7 @@ extern int dir_notify_enable; #include #include #include -#include +#include #include #include #include diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f08f9ca602a..78a5922a2f1 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 89bd6fb7894..657f8f8d93a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -45,6 +45,7 @@ #include #include #include +#include #include -- cgit v1.2.3-70-g09d2 From 6c5d0512a091480c9f981162227fdb1c9d70e555 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 13:38:19 -0400 Subject: [PATCH] get rid of corner case in dup3() entirely Since Ulrich is OK with getting rid of dup3(fd, fd, flags) completely, to hell the damn thing goes. Corner case for dup2() is handled in sys_dup2() (complete with -EBADF if dup2(fd, fd) is called with fd that is not open), the rest is done in dup3(). Signed-off-by: Al Viro --- fs/fcntl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index ce12a688511..3deec988708 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -135,18 +135,12 @@ asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; + if (unlikely(oldfd == newfd)) + return -EINVAL; + spin_lock(&files->file_lock); if (!(file = fcheck(oldfd))) goto out_unlock; - err = newfd; - if (unlikely(newfd == oldfd)) { - if (flags & O_CLOEXEC) { - fdt = files_fdtable(files); - FD_SET(newfd, fdt->close_on_exec); - } - goto out_unlock; - } - err = -EBADF; if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out_unlock; get_file(file); /* We are now finished with oldfd */ @@ -194,6 +188,14 @@ out_fput: asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) { + if (unlikely(newfd == oldfd)) { /* corner case */ + struct files_struct *files = current->files; + rcu_read_lock(); + if (!fcheck_files(files, oldfd)) + oldfd = -EBADF; + rcu_read_unlock(); + return oldfd; + } return sys_dup3(oldfd, newfd, 0); } -- cgit v1.2.3-70-g09d2 From 4e1e018ecc6f7bfd10fc75b3ff9715cc8164e0a2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 16:01:20 -0400 Subject: [PATCH] fix RLIM_NOFILE handling * dup2() should return -EBADF on exceeded sysctl_nr_open * dup() should *not* return -EINVAL even if you have rlimit set to 0; it should get -EMFILE instead. Check for orig_start exceeding rlimit taken to sys_fcntl(). Failing expand_files() in dup{2,3}() now gets -EMFILE remapped to -EBADF. 
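Together with the dup3() corner-case removal above, the resulting user-visible behaviour is easy to check from user space. A minimal test program (illustrative only, not part of either patch; fd 999 is assumed to be unused):

/*
 * Same-fd semantics after the two fcntl patches: dup2(fd, fd) is a no-op
 * returning fd when fd is open and fails with EBADF when it is not, while
 * dup3(fd, fd, 0) is rejected outright with EINVAL.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int r;

	r = dup2(0, 0);			/* stdin onto itself */
	printf("dup2(0, 0)     = %d (%s)\n", r, r < 0 ? strerror(errno) : "ok");

	r = dup2(999, 999);		/* 999 assumed not open */
	printf("dup2(999, 999) = %d (%s)\n", r, r < 0 ? strerror(errno) : "ok");

	r = dup3(0, 0, 0);
	printf("dup3(0, 0, 0)  = %d (%s)\n", r, r < 0 ? strerror(errno) : "ok");
	return 0;
}

On a kernel carrying both patches this prints 0 for dup2(0, 0), EBADF for the unopened descriptor, and EINVAL for dup3(fd, fd, 0); a newfd past the limit in dup2()/dup3() likewise comes back as EBADF rather than EMFILE.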
Consequently, remaining checks for rlimit are taken to expand_files(). Signed-off-by: Al Viro --- fs/fcntl.c | 18 ++++++------------ fs/file.c | 9 +++++++++ fs/open.c | 9 --------- 3 files changed, 15 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 3deec988708..61d62513681 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -64,11 +64,6 @@ static int locate_fd(unsigned int orig_start, int cloexec) struct fdtable *fdt; spin_lock(&files->file_lock); - - error = -EINVAL; - if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out; - repeat: fdt = files_fdtable(files); /* @@ -83,10 +78,6 @@ repeat: if (start < fdt->max_fds) newfd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, start); - - error = -EMFILE; - if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out; error = expand_files(files, newfd); if (error < 0) @@ -141,13 +132,14 @@ asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) spin_lock(&files->file_lock); if (!(file = fcheck(oldfd))) goto out_unlock; - if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out_unlock; get_file(file); /* We are now finished with oldfd */ err = expand_files(files, newfd); - if (err < 0) + if (unlikely(err < 0)) { + if (err == -EMFILE) + err = -EBADF; goto out_fput; + } /* To avoid races with open() and dup(), we will mark the fd as * in-use in the open-file bitmap throughout the entire dup2() @@ -328,6 +320,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, switch (cmd) { case F_DUPFD: case F_DUPFD_CLOEXEC: + if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + break; get_file(filp); err = dupfd(filp, arg, cmd == F_DUPFD_CLOEXEC); break; diff --git a/fs/file.c b/fs/file.c index 7b3887e054d..d8773b19fe4 100644 --- a/fs/file.c +++ b/fs/file.c @@ -250,9 +250,18 @@ int expand_files(struct files_struct *files, int nr) struct fdtable *fdt; fdt = files_fdtable(files); + + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + return -EMFILE; + /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; + /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; diff --git a/fs/open.c b/fs/open.c index 3fe1a6857c7..52647be277a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -972,7 +972,6 @@ int get_unused_fd_flags(int flags) int fd, error; struct fdtable *fdt; - error = -EMFILE; spin_lock(&files->file_lock); repeat: @@ -980,13 +979,6 @@ repeat: fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, files->next_fd); - /* - * N.B. For clone tasks sharing a files structure, this test - * will limit the total number of files that can be opened. - */ - if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) - goto out; - /* Do we need to expand the fd array or fd set? */ error = expand_files(files, fd); if (error < 0) @@ -997,7 +989,6 @@ repeat: * If we needed to expand the fs array we * might have blocked - try again. */ - error = -EMFILE; goto repeat; } -- cgit v1.2.3-70-g09d2 From b2d002dba5a8a4c0c3ec96fd1ff3c9def6bd71a1 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 26 Jul 2008 15:22:27 -0700 Subject: task IO accounting: correctly account threads IO statistics Oleg Nesterov points out that we should check that the task is still alive before we iterate over the threads. This patch includes a fixup for this. Also simplify do_io_accounting() implementation. 
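The aggregate ends up in /proc/<pid>/io (the whole=1 case), while /proc/<pid>/task/<tid>/io reports a single thread, using exactly the field names in the sprintf() above. A small user-space reader (illustrative only, not part of the patch; needs CONFIG_TASK_IO_ACCOUNTING):

/*
 * Illustrative reader for the procfs files backed by do_io_accounting()
 * (user-space sketch, not kernel code).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void dump(const char *path)
{
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("--- %s ---\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	char tid_path[64];

	dump("/proc/self/io");				/* all threads summed */
	snprintf(tid_path, sizeof(tid_path),
		 "/proc/self/task/%ld/io", (long)syscall(SYS_gettid));
	dump(tid_path);					/* this thread only */
	return 0;
}

For a process that has never spawned additional threads the two dumps are essentially the same; for a multi-threaded one the first includes the live threads' counters summed under the sighand lock plus the totals already folded into signal->ioac for threads that have exited.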
Signed-off-by: Andrea Righi Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 56 ++++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 81bce6791bf..d744aa3c9f7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2406,35 +2406,18 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) u64 rchar, wchar, syscr, syscw; struct task_io_accounting ioac; - if (!whole) { - rchar = task->rchar; - wchar = task->wchar; - syscr = task->syscr; - syscw = task->syscw; - memcpy(&ioac, &task->ioac, sizeof(ioac)); - } else { - unsigned long flags; - struct task_struct *t = task; - rchar = wchar = syscr = syscw = 0; - memset(&ioac, 0, sizeof(ioac)); + rchar = task->rchar; + wchar = task->wchar; + syscr = task->syscr; + syscw = task->syscw; + memcpy(&ioac, &task->ioac, sizeof(ioac)); - rcu_read_lock(); - do { - rchar += t->rchar; - wchar += t->wchar; - syscr += t->syscr; - syscw += t->syscw; - - ioac.read_bytes += t->ioac.read_bytes; - ioac.write_bytes += t->ioac.write_bytes; - ioac.cancelled_write_bytes += - t->ioac.cancelled_write_bytes; - t = next_thread(t); - } while (t != task); - rcu_read_unlock(); + if (whole) { + unsigned long flags; if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; + struct task_struct *t = task; rchar += sig->rchar; wchar += sig->wchar; @@ -2445,11 +2428,20 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) ioac.write_bytes += sig->ioac.write_bytes; ioac.cancelled_write_bytes += sig->ioac.cancelled_write_bytes; - + while_each_thread(task, t) { + rchar += t->rchar; + wchar += t->wchar; + syscr += t->syscr; + syscw += t->syscw; + + ioac.read_bytes += t->ioac.read_bytes; + ioac.write_bytes += t->ioac.write_bytes; + ioac.cancelled_write_bytes += + t->ioac.cancelled_write_bytes; + } unlock_task_sighand(task, &flags); } } - return sprintf(buffer, "rchar: %llu\n" "wchar: %llu\n" @@ -2458,13 +2450,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", - (unsigned long long)rchar, - (unsigned long long)wchar, - (unsigned long long)syscr, - (unsigned long long)syscw, - (unsigned long long)ioac.read_bytes, - (unsigned long long)ioac.write_bytes, - (unsigned long long)ioac.cancelled_write_bytes); + rchar, wchar, syscr, syscw, + ioac.read_bytes, ioac.write_bytes, + ioac.cancelled_write_bytes); } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) -- cgit v1.2.3-70-g09d2 From 5995477ab7f3522c497c9c4a1c55373e9d655574 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 27 Jul 2008 17:29:15 +0200 Subject: task IO accounting: improve code readability Put all i/o statistics in struct proc_io_accounting and use inline functions to initialize and increment statistics, removing a lot of single variable assignments. This also reduces the kernel size as following (with CONFIG_TASK_XACCT=y and CONFIG_TASK_IO_ACCOUNTING=y). 
text data bss dec hex filename 11651 0 0 11651 2d83 kernel/exit.o.before 11619 0 0 11619 2d63 kernel/exit.o.after 10886 132 136 11154 2b92 kernel/fork.o.before 10758 132 136 11026 2b12 kernel/fork.o.after 3082029 807968 4818600 8708597 84e1f5 vmlinux.o.before 3081869 807968 4818600 8708437 84e155 vmlinux.o.after Signed-off-by: Andrea Righi Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 57 ++++++++++------------------------ include/linux/sched.h | 19 ++++-------- include/linux/task_io_accounting.h | 27 ++++++++++++++-- include/linux/task_io_accounting_ops.h | 56 +++++++++++++++++++++++++++------ kernel/exit.c | 30 ++---------------- kernel/fork.c | 15 ++------- kernel/tsacct.c | 14 ++++----- 7 files changed, 104 insertions(+), 114 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index e74308bdabd..3d94906c7aa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -2402,44 +2403,17 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { - u64 rchar, wchar, syscr, syscw; - struct task_io_accounting ioac; - - rchar = task->rchar; - wchar = task->wchar; - syscr = task->syscr; - syscw = task->syscw; - memcpy(&ioac, &task->ioac, sizeof(ioac)); - - if (whole) { - unsigned long flags; - - if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - struct task_struct *t = task; - - rchar += sig->rchar; - wchar += sig->wchar; - syscr += sig->syscr; - syscw += sig->syscw; - - ioac.read_bytes += sig->ioac.read_bytes; - ioac.write_bytes += sig->ioac.write_bytes; - ioac.cancelled_write_bytes += - sig->ioac.cancelled_write_bytes; - while_each_thread(task, t) { - rchar += t->rchar; - wchar += t->wchar; - syscr += t->syscr; - syscw += t->syscw; - - ioac.read_bytes += t->ioac.read_bytes; - ioac.write_bytes += t->ioac.write_bytes; - ioac.cancelled_write_bytes += - t->ioac.cancelled_write_bytes; - } - unlock_task_sighand(task, &flags); - } + struct proc_io_accounting acct = task->ioac; + unsigned long flags; + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); } return sprintf(buffer, "rchar: %llu\n" @@ -2449,9 +2423,10 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", - rchar, wchar, syscr, syscw, - ioac.read_bytes, ioac.write_bytes, - ioac.cancelled_write_bytes); + acct.chr.rchar, acct.chr.wchar, + acct.chr.syscr, acct.chr.syscw, + acct.blk.read_bytes, acct.blk.write_bytes, + acct.blk.cancelled_write_bytes); } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) diff --git a/include/linux/sched.h b/include/linux/sched.h index f59318a0099..034c1ca6b33 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -505,10 +505,7 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; -#ifdef CONFIG_TASK_XACCT - u64 rchar, wchar, syscr, syscw; -#endif - struct task_io_accounting ioac; + struct proc_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the @@ 
-1256,11 +1253,7 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ -#ifdef CONFIG_TASK_XACCT -/* i/o counters(bytes read/written, #syscalls */ - u64 rchar, wchar, syscr, syscw; -#endif - struct task_io_accounting ioac; + struct proc_io_accounting ioac; #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ @@ -2190,22 +2183,22 @@ extern long sched_group_rt_period(struct task_group *tg); #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { - tsk->rchar += amt; + tsk->ioac.chr.rchar += amt; } static inline void add_wchar(struct task_struct *tsk, ssize_t amt) { - tsk->wchar += amt; + tsk->ioac.chr.wchar += amt; } static inline void inc_syscr(struct task_struct *tsk) { - tsk->syscr++; + tsk->ioac.chr.syscr++; } static inline void inc_syscw(struct task_struct *tsk) { - tsk->syscw++; + tsk->ioac.chr.syscw++; } #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index 44d00e9ccee..165390f8b93 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -1,5 +1,5 @@ /* - * task_io_accounting: a structure which is used for recording a single task's + * proc_io_accounting: a structure which is used for recording a single task's * IO statistics. * * Don't include this header file directly - it is designed to be dragged in via @@ -8,6 +8,22 @@ * Blame akpm@osdl.org for all this. */ +#ifdef CONFIG_TASK_XACCT +struct task_chr_io_accounting { + /* bytes read */ + u64 rchar; + /* bytes written */ + u64 wchar; + /* # of read syscalls */ + u64 syscr; + /* # of write syscalls */ + u64 syscw; +}; +#else /* CONFIG_TASK_XACCT */ +struct task_chr_io_accounting { +}; +#endif /* CONFIG_TASK_XACCT */ + #ifdef CONFIG_TASK_IO_ACCOUNTING struct task_io_accounting { /* @@ -31,7 +47,12 @@ struct task_io_accounting { */ u64 cancelled_write_bytes; }; -#else +#else /* CONFIG_TASK_IO_ACCOUNTING */ struct task_io_accounting { }; -#endif +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +struct proc_io_accounting { + struct task_chr_io_accounting chr; + struct task_io_accounting blk; +}; diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index ff46c6fad79..e6f958ebe97 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -9,7 +9,7 @@ #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { - current->ioac.read_bytes += bytes; + current->ioac.blk.read_bytes += bytes; } /* @@ -18,12 +18,12 @@ static inline void task_io_account_read(size_t bytes) */ static inline unsigned long task_io_get_inblock(const struct task_struct *p) { - return p->ioac.read_bytes >> 9; + return p->ioac.blk.read_bytes >> 9; } static inline void task_io_account_write(size_t bytes) { - current->ioac.write_bytes += bytes; + current->ioac.blk.write_bytes += bytes; } /* @@ -32,17 +32,25 @@ static inline void task_io_account_write(size_t bytes) */ static inline unsigned long task_io_get_oublock(const struct task_struct *p) { - return p->ioac.write_bytes >> 9; + return p->ioac.blk.write_bytes >> 9; } static inline void task_io_account_cancelled_write(size_t bytes) { - current->ioac.cancelled_write_bytes += bytes; + current->ioac.blk.cancelled_write_bytes += bytes; } -static inline void task_io_accounting_init(struct task_struct *tsk) +static inline 
void task_io_accounting_init(struct proc_io_accounting *ioac) { - memset(&tsk->ioac, 0, sizeof(tsk->ioac)); + memset(ioac, 0, sizeof(*ioac)); +} + +static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + dst->blk.read_bytes += src->blk.read_bytes; + dst->blk.write_bytes += src->blk.write_bytes; + dst->blk.cancelled_write_bytes += src->blk.cancelled_write_bytes; } #else @@ -69,9 +77,37 @@ static inline void task_io_account_cancelled_write(size_t bytes) { } -static inline void task_io_accounting_init(struct task_struct *tsk) +static inline void task_io_accounting_init(struct proc_io_accounting *ioac) +{ +} + +static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) { } -#endif /* CONFIG_TASK_IO_ACCOUNTING */ -#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +#ifdef CONFIG_TASK_XACCT +static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + dst->chr.rchar += src->chr.rchar; + dst->chr.wchar += src->chr.wchar; + dst->chr.syscr += src->chr.syscr; + dst->chr.syscw += src->chr.syscw; +} +#else +static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ +} +#endif /* CONFIG_TASK_XACCT */ + +static inline void task_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + task_chr_io_accounting_add(dst, src); + task_blk_io_accounting_add(dst, src); +} +#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ diff --git a/kernel/exit.c b/kernel/exit.c index 0caf590548a..eb4d6470d1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -121,18 +121,7 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); -#ifdef CONFIG_TASK_XACCT - sig->rchar += tsk->rchar; - sig->wchar += tsk->wchar; - sig->syscr += tsk->syscr; - sig->syscw += tsk->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - sig->ioac.read_bytes += tsk->ioac.read_bytes; - sig->ioac.write_bytes += tsk->ioac.write_bytes; - sig->ioac.cancelled_write_bytes += - tsk->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. 
*/ } @@ -1363,21 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; -#ifdef CONFIG_TASK_XACCT - psig->rchar += p->rchar + sig->rchar; - psig->wchar += p->wchar + sig->wchar; - psig->syscr += p->syscr + sig->syscr; - psig->syscw += p->syscw + sig->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - psig->ioac.read_bytes += - p->ioac.read_bytes + sig->ioac.read_bytes; - psig->ioac.write_bytes += - p->ioac.write_bytes + sig->ioac.write_bytes; - psig->ioac.cancelled_write_bytes += - p->ioac.cancelled_write_bytes + - sig->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5e050c1317c..8214ba7c8bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,12 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; -#ifdef CONFIG_TASK_XACCT - sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; -#endif -#ifdef CONFIG_TASK_IO_ACCOUNTING - memset(&sig->ioac, 0, sizeof(sig->ioac)); -#endif + task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -994,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->last_switch_timestamp = 0; #endif -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ -#endif - task_io_accounting_init(p); + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 3da47ccdc5e..f9cd2561689 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->rchar; - stats->write_char = p->wchar; - stats->read_syscalls = p->syscr; - stats->write_syscalls = p->syscw; + stats->read_char = p->ioac.chr.rchar; + stats->write_char = p->ioac.chr.wchar; + stats->read_syscalls = p->ioac.chr.syscr; + stats->write_syscalls = p->ioac.chr.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; + stats->read_bytes = p->ioac.blk.read_bytes; + stats->write_bytes = p->ioac.blk.write_bytes; + stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.2.3-70-g09d2 From 31c9446993f412ecb7875e30bba4bc7f216ae016 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 17 Jul 2008 13:21:55 +0200 Subject: nfs_remount oops when rebooting + possible fix Jeff, Trond, The commit 48b605f83c920d8daa50e43fc2c7f718e04c7bfa (NFS: implement option checking when remounting NFS filesystems (resend)) generate an Oops on my platform when rebooting while its root FS on an NFS share (NFSv3, TCP) : Unmounting local filesystems...done. 
Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = c3d00000 [00000000] *pgd=a3d72031, *pte=00000000, *ppte=00000000 Internal error: Oops: 17 [#1] Modules linked in: cpufreq_powersave cpufreq_ondemand cpufreq_userspace cpufreq_conservative ext3 jbd sd_mod pata_pcmcia libata scsi_mod pcmcia loop firmware_class pxafb cfbcopyarea cfbimgblt cfbfillrect pxa2xx_cs pxa2xx_core pcmcia_core snd_pxa2xx_ac97 snd_ac97_codec ac97_bus snd_pxa2xx_pcm snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd isp116x_hcd soundcore rtc_sa1100 snd_page_alloc pxa25x_udc usbcore rtc_ds1307 rtc_core CPU: 0 Not tainted (2.6.26-03414-g33af79d-dirty #15) PC is at nfs_remount+0x40/0x264 LR is at do_remount_sb+0x158/0x194 pc : [] lr : [] psr: 60000013 sp : c2dd1e70 ip : c2dd1e98 fp : c2dd1e94 r10: 00000040 r9 : c3d17000 r8 : c3c3fc40 r7 : 00000000 r6 : 00000000 r5 : c3d2b200 r4 : 00000000 r3 : 00000003 r2 : 00000000 r1 : c2dd1e9c r0 : c3c3fc00 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user Control: 0000397f Table: a3d00000 DAC: 00000015 Process mount (pid: 1462, stack limit = 0xc2dd0270) Stack: (0xc2dd1e70 to 0xc2dd2000) 1e60: 00000000 c3c3fc00 00000000 00000000 1e80: c3c3fc40 c3d17000 c2dd1ebc c2dd1e98 c0076c40 c00bbf20 c01c61e4 00000001 1ea0: c2dd1ebc 00000001 c3c3fc00 c2dd1ef0 c2dd1ee4 c2dd1ec0 c008c6d8 c0076af4 1ec0: 00000021 00000040 c2dd1ef0 c3d77000 c3eaa000 00000000 c2dd1f6c c2dd1ee8 1ee0: c008d1bc c008c5f8 00000000 c2dd0000 c3c0c320 c3805b38 c002064c 0001f820 1f00: 0001f810 00000001 00000001 00000000 c2dd0000 00000000 c2dd1f34 c2dd1f28 1f20: c005ead8 c005e6f8 c2dd1f44 c2dd1f38 c005eaf8 c005ead0 c2dd1f6c c2dd1f48 1f40: c008ae3c 00000000 c3d77000 0001f810 c0ed0021 c0020ca8 c2dd0000 00000000 1f60: c2dd1fa4 c2dd1f70 c008d2d4 c008d0bc 00000000 0001f810 c2dd1f9c c3eaa000 1f80: c3d17000 00000000 00000000 be8b6aa8 be8b6ad0 00000015 00000000 c2dd1fa8 1fa0: c0020b00 c008d254 00000000 be8b6aa8 0001f810 0001f820 0001f830 c0ed0021 1fc0: 00000000 be8b6aa8 be8b6ad0 00000015 00000000 be8b6ad0 0001f810 be8b6aa8 1fe0: 0001f810 be8b6964 0000aab8 40125124 60000010 0001f810 00000000 00000000 Backtrace: [] (nfs_remount+0x0/0x264) from [] (do_remount_sb+0x158/0x194) r9:c3d17000 r8:c3c3fc40 r7:00000000 r6:00000000 r5:c3c3fc00 r4:00000000 [] (do_remount_sb+0x0/0x194) from [] (do_remount+0xec/0x118) r6:c2dd1ef0 r5:c3c3fc00 r4:00000001 [] (do_remount+0x0/0x118) from [] (do_mount+0x10c/0x198) [] (do_mount+0x0/0x198) from [] (sys_mount+0x8c/0xd4) [] (sys_mount+0x0/0xd4) from [] (ret_fast_syscall+0x0/0x2c) r7:00000015 r6:be8b6ad0 r5:be8b6aa8 r4:00000000 Code: 0a000086 ea000006 e3530003 8a000004 (e5923000) ---[ end trace 55e1b689cf8c8a6a ]--- ------------[ cut here ]------------ WARNING: at kernel/exit.c:966 do_exit+0x3c/0x628() Modules linked in: cpufreq_powersave cpufreq_ondemand cpufreq_userspace cpufreq_conservative ext3 jbd sd_mod pata_pcmcia libata scsi_mod pcmcia loop firmware_class pxafb cfbcopyarea cfbimgblt cfbfillrect pxa2xx_cs pxa2xx_core pcmcia_core snd_pxa2xx_ac97 snd_ac97_codec ac97_bus snd_pxa2xx_pcm snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd isp116x_hcd soundcore rtc_sa1100 snd_page_alloc pxa25x_udc usbcore rtc_ds1307 rtc_core [] (dump_stack+0x0/0x14) from [] (warn_on_slowpath+0x4c/0x68) [] (warn_on_slowpath+0x0/0x68) from [] (do_exit+0x3c/0x628) r6:0000000b r5:c3c3dc80 r4:c2dd0000 [] (do_exit+0x0/0x628) from [] (die+0x2b0/0x30c) [] (die+0x0/0x30c) from [] (__do_kernel_fault+0x6c/0x80) [] (__do_kernel_fault+0x0/0x80) from [] (do_page_fault+0x210/0x230) r7:c3fa7118 r6:c3c3dc80 
r5:c3d166a8 r4:00010000 [] (do_page_fault+0x0/0x230) from [] (do_DataAbort+0x3c/0xa0) [] (do_DataAbort+0x0/0xa0) from [] (__dabt_svc+0x4c/0x60) Exception stack(0xc2dd1e28 to 0xc2dd1e70) 1e20: c3c3fc00 c2dd1e9c 00000000 00000003 00000000 c3d2b200 1e40: 00000000 00000000 c3c3fc40 c3d17000 00000040 c2dd1e94 c2dd1e98 c2dd1e70 1e60: c0076c40 c00bbf54 60000013 ffffffff r8:c3c3fc40 r7:00000000 r6:00000000 r5:c2dd1e5c r4:ffffffff [] (nfs_remount+0x0/0x264) from [] (do_remount_sb+0x158/0x194) r9:c3d17000 r8:c3c3fc40 r7:00000000 r6:00000000 r5:c3c3fc00 r4:00000000 [] (do_remount_sb+0x0/0x194) from [] (do_remount+0xec/0x118) r6:c2dd1ef0 r5:c3c3fc00 r4:00000001 [] (do_remount+0x0/0x118) from [] (do_mount+0x10c/0x198) [] (do_mount+0x0/0x198) from [] (sys_mount+0x8c/0xd4) [] (sys_mount+0x0/0xd4) from [] (ret_fast_syscall+0x0/0x2c) r7:00000015 r6:be8b6ad0 r5:be8b6aa8 r4:00000000 ---[ end trace 55e1b689cf8c8a6a ]--- /etc/rc6.d/S60umountroot: line 17: 1462 Segmentation fault mount $MOUNT_FORCE_OPT -n -o remount,ro -t dummytype dummydev / 2> /dev/null The new super.c:nfs_remount function doesn't check the validity of the options/options4 pointers. Unfortunately, this seems to happend. The obvious patch seems to check the pointers, and not to do anything if the happend to be NULL. Tested on an XScale PXA255 system, latest git. Regards, M. Signed-off-by: Marc Zyngier Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 1b94e3650f5..9abcd2b329f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1718,9 +1718,9 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) * ones were explicitly specified. Fall back to legacy behavior and * just return success. */ - if ((nfsvers == 4 && options4->version == 1) || - (nfsvers <= 3 && options->version >= 1 && - options->version <= 6)) + if ((nfsvers == 4 && (!options4 || options4->version == 1)) || + (nfsvers <= 3 && (!options || (options->version >= 1 && + options->version <= 6)))) return 0; data = kzalloc(sizeof(*data), GFP_KERNEL); -- cgit v1.2.3-70-g09d2 From 744d18dbfae07482ea461701b0aaec3a75ec9224 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 27 Jul 2008 18:03:19 -0400 Subject: NFS: Ensure we call nfs_sb_deactive() after releasing the directory inode In order to avoid the "Busy inodes after unmount" error message, we need to ensure that nfs_async_unlink_release() releases the super block after the call to nfs_free_unlinkdata(). Signed-off-by: Trond Myklebust --- fs/nfs/unlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3adf8b26646..f089e5839d7 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,10 +95,11 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) static void nfs_async_unlink_release(void *calldata) { struct nfs_unlinkdata *data = calldata; + struct super_block *sb = data->dir->i_sb; nfs_dec_sillycount(data->dir); - nfs_sb_deactive(NFS_SERVER(data->dir)); nfs_free_unlinkdata(data); + nfs_sb_deactive(NFS_SB(sb)); } static const struct rpc_call_ops nfs_unlink_ops = { -- cgit v1.2.3-70-g09d2 From 940389b8afad6495211614c13eb91ef7001773ec Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 28 Jul 2008 00:48:12 +0200 Subject: task IO accounting: move all IO statistics in struct task_io_accounting Simplify the code of include/linux/task_io_accounting.h. 
It is also more reasonable to have all the task i/o-related statistics in a single struct (task_io_accounting). Signed-off-by: Andrea Righi Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 +++---- include/linux/sched.h | 12 ++++----- include/linux/task_io_accounting.h | 17 ++---------- include/linux/task_io_accounting_ops.h | 48 +++++++++++++++++----------------- kernel/tsacct.c | 14 +++++----- 5 files changed, 44 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3d94906c7aa..01ed610f9b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2403,7 +2403,7 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { - struct proc_io_accounting acct = task->ioac; + struct task_io_accounting acct = task->ioac; unsigned long flags; if (whole && lock_task_sighand(task, &flags)) { @@ -2423,10 +2423,10 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", - acct.chr.rchar, acct.chr.wchar, - acct.chr.syscr, acct.chr.syscw, - acct.blk.read_bytes, acct.blk.write_bytes, - acct.blk.cancelled_write_bytes); + acct.rchar, acct.wchar, + acct.syscr, acct.syscw, + acct.read_bytes, acct.write_bytes, + acct.cancelled_write_bytes); } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) diff --git a/include/linux/sched.h b/include/linux/sched.h index 034c1ca6b33..5270d449ff9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -505,7 +505,7 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; - struct proc_io_accounting ioac; + struct task_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the @@ -1253,7 +1253,7 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ - struct proc_io_accounting ioac; + struct task_io_accounting ioac; #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ @@ -2183,22 +2183,22 @@ extern long sched_group_rt_period(struct task_group *tg); #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { - tsk->ioac.chr.rchar += amt; + tsk->ioac.rchar += amt; } static inline void add_wchar(struct task_struct *tsk, ssize_t amt) { - tsk->ioac.chr.wchar += amt; + tsk->ioac.wchar += amt; } static inline void inc_syscr(struct task_struct *tsk) { - tsk->ioac.chr.syscr++; + tsk->ioac.syscr++; } static inline void inc_syscw(struct task_struct *tsk) { - tsk->ioac.chr.syscw++; + tsk->ioac.syscw++; } #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index 165390f8b93..5e88afc9a2f 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -1,5 +1,5 @@ /* - * proc_io_accounting: a structure which is used for recording a single task's + * task_io_accounting: a structure which is used for recording a single task's * IO statistics. * * Don't include this header file directly - it is designed to be dragged in via @@ -8,8 +8,8 @@ * Blame akpm@osdl.org for all this. 
*/ +struct task_io_accounting { #ifdef CONFIG_TASK_XACCT -struct task_chr_io_accounting { /* bytes read */ u64 rchar; /* bytes written */ @@ -18,14 +18,9 @@ struct task_chr_io_accounting { u64 syscr; /* # of write syscalls */ u64 syscw; -}; -#else /* CONFIG_TASK_XACCT */ -struct task_chr_io_accounting { -}; #endif /* CONFIG_TASK_XACCT */ #ifdef CONFIG_TASK_IO_ACCOUNTING -struct task_io_accounting { /* * The number of bytes which this task has caused to be read from * storage. @@ -46,13 +41,5 @@ struct task_io_accounting { * information loss in doing that. */ u64 cancelled_write_bytes; -}; -#else /* CONFIG_TASK_IO_ACCOUNTING */ -struct task_io_accounting { -}; #endif /* CONFIG_TASK_IO_ACCOUNTING */ - -struct proc_io_accounting { - struct task_chr_io_accounting chr; - struct task_io_accounting blk; }; diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index e6f958ebe97..4d090f9ee60 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -9,7 +9,7 @@ #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { - current->ioac.blk.read_bytes += bytes; + current->ioac.read_bytes += bytes; } /* @@ -18,12 +18,12 @@ static inline void task_io_account_read(size_t bytes) */ static inline unsigned long task_io_get_inblock(const struct task_struct *p) { - return p->ioac.blk.read_bytes >> 9; + return p->ioac.read_bytes >> 9; } static inline void task_io_account_write(size_t bytes) { - current->ioac.blk.write_bytes += bytes; + current->ioac.write_bytes += bytes; } /* @@ -32,25 +32,25 @@ static inline void task_io_account_write(size_t bytes) */ static inline unsigned long task_io_get_oublock(const struct task_struct *p) { - return p->ioac.blk.write_bytes >> 9; + return p->ioac.write_bytes >> 9; } static inline void task_io_account_cancelled_write(size_t bytes) { - current->ioac.blk.cancelled_write_bytes += bytes; + current->ioac.cancelled_write_bytes += bytes; } -static inline void task_io_accounting_init(struct proc_io_accounting *ioac) +static inline void task_io_accounting_init(struct task_io_accounting *ioac) { memset(ioac, 0, sizeof(*ioac)); } -static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_blk_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { - dst->blk.read_bytes += src->blk.read_bytes; - dst->blk.write_bytes += src->blk.write_bytes; - dst->blk.cancelled_write_bytes += src->blk.cancelled_write_bytes; + dst->read_bytes += src->read_bytes; + dst->write_bytes += src->write_bytes; + dst->cancelled_write_bytes += src->cancelled_write_bytes; } #else @@ -77,35 +77,35 @@ static inline void task_io_account_cancelled_write(size_t bytes) { } -static inline void task_io_accounting_init(struct proc_io_accounting *ioac) +static inline void task_io_accounting_init(struct task_io_accounting *ioac) { } -static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_blk_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_TASK_XACCT -static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { - dst->chr.rchar += src->chr.rchar; - dst->chr.wchar += 
src->chr.wchar; - dst->chr.syscr += src->chr.syscr; - dst->chr.syscw += src->chr.syscw; + dst->rchar += src->rchar; + dst->wchar += src->wchar; + dst->syscr += src->syscr; + dst->syscw += src->syscw; } #else -static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { } #endif /* CONFIG_TASK_XACCT */ -static inline void task_io_accounting_add(struct proc_io_accounting *dst, - struct proc_io_accounting *src) +static inline void task_io_accounting_add(struct task_io_accounting *dst, + struct task_io_accounting *src) { task_chr_io_accounting_add(dst, src); task_blk_io_accounting_add(dst, src); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f9cd2561689..8ebcd8532df 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->ioac.chr.rchar; - stats->write_char = p->ioac.chr.wchar; - stats->read_syscalls = p->ioac.chr.syscr; - stats->write_syscalls = p->ioac.chr.syscw; + stats->read_char = p->ioac.rchar; + stats->write_char = p->ioac.wchar; + stats->read_syscalls = p->ioac.syscr; + stats->write_syscalls = p->ioac.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.blk.read_bytes; - stats->write_bytes = p->ioac.blk.write_bytes; - stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit v1.2.3-70-g09d2 From 9b14ec35f03d89c88cba225add8b6eca15203964 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 19 May 2008 13:34:45 +0900 Subject: binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat. While implementing binfmt_elf_fdpic on SH it quickly became apparent that SH was the first platform to support both binfmt_elf_fdpic and binfmt_elf, as well as the only of the FDPIC platforms to make use of the auxvt. Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where the first argument is the entry displacement after csp has been adjusted, being reset after each adjustment. As we have no ability to sort this out through the platform's ARCH_DLINFO, this index needs to be managed entirely in create_elf_fdpic_tables(). Presently none of the platforms that set their own auxvt entries are able to do so through their respective ARCH_DLINFOs when using binfmt_elf_fdpic. In addition to this, binfmt_elf_fdpic has been looking at DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the auxvt. This is legacy cruft, and is not defined by any platforms in-tree, even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is always available, and contains the number that is of interest here, so we switch to using that unconditionally as well. 
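In condensed form, the rework in the binfmt_elf_fdpic.c hunk below drops the explicit slot argument from NEW_AUX_ENT() and lets the macro advance its own index, with nr reset to zero each time csp is moved down to start a new block of entries. A reader's sketch of the resulting pattern (not the complete create_elf_fdpic_tables()):

#define NEW_AUX_ENT(id, val)						\
do {									\
	struct { unsigned long _id, _val; } __user *ent;		\
									\
	ent = (void __user *) csp;	/* base of the current block */	\
	__put_user((id),  &ent[nr]._id);				\
	__put_user((val), &ent[nr]._val);				\
	nr++;				/* advance to the next slot */	\
} while (0)

	nr = 0;				/* new block: reset the index */
	csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
	NEW_AUX_ENT(AT_HWCAP, hwcap);
	NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
	/* ... the remaining DLINFO_ITEMS entries, then the optional
	 * ARCH_DLINFO block sized by AT_VECTOR_SIZE_ARCH ... */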
As this has direct bearing on how much stack is used, platforms that have configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case and live with some lost stack space if those entries aren't pushed (some platforms may also need to purposely sacrifice some space here for alignment considerations, as noted in the code -- although not an issue for any FDPIC-capable platform today). Signed-off-by: Paul Mundt Acked-by: David Howells --- fs/binfmt_elf_fdpic.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index fdeadab2f18..80c1f952ef7 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -470,6 +470,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, char __user *u_platform, *p; long hwcap; int loop; + int nr; /* reset for each csp adjustment */ /* we're going to shovel a whole load of stuff onto the stack */ #ifdef CONFIG_MMU @@ -542,10 +543,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, /* force 16 byte _final_ alignment here for generality */ #define DLINFO_ITEMS 13 - nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0); -#ifdef DLINFO_ARCH_ITEMS - nitems += DLINFO_ARCH_ITEMS; -#endif + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; csp = sp; sp -= nitems * 2 * sizeof(unsigned long); @@ -557,39 +555,46 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp -= sp & 15UL; /* put the ELF interpreter info on the stack */ -#define NEW_AUX_ENT(nr, id, val) \ +#define NEW_AUX_ENT(id, val) \ do { \ struct { unsigned long _id, _val; } __user *ent; \ \ ent = (void __user *) csp; \ __put_user((id), &ent[nr]._id); \ __put_user((val), &ent[nr]._val); \ + nr++; \ } while (0) + nr = 0; csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(0, AT_NULL, 0); + NEW_AUX_ENT(AT_NULL, 0); if (k_platform) { + nr = 0; csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(0, AT_PLATFORM, + NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t) (unsigned long) u_platform); } + nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT( 0, AT_HWCAP, hwcap); - NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); - NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); - NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); - NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); - NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); - NEW_AUX_ENT( 7, AT_FLAGS, 0); - NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); - NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); - NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); - NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT(AT_HWCAP, hwcap); + NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); + NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); + NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); + NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); + NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); + NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); + NEW_AUX_ENT(AT_FLAGS, 0); + NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); + NEW_AUX_ENT(AT_UID, (elf_addr_t) current->uid); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); + NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); #ifdef ARCH_DLINFO + nr = 0; + csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); + /* ARCH_DLINFO must 
come last so platform specific code can enforce * special alignment requirements on the AUXV if necessary (eg. PPC). */ -- cgit v1.2.3-70-g09d2 From 3bc24a1a5441ef621daf737ec93b0a10e8999d59 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 19 May 2008 13:40:12 +0900 Subject: sh: Initial ELF FDPIC support. This adds initial support for ELF FDPIC on MMU-less SH, as per version 0.2 of the ABI definition at: http://www.codesourcery.com/public/docs/sh-fdpic/sh-fdpic-abi.txt Signed-off-by: Paul Mundt --- arch/sh/kernel/ptrace_32.c | 23 ++++++++++++++++++++ arch/sh/kernel/signal_32.c | 25 ++++++++++++++++++++-- fs/Kconfig.binfmt | 2 +- include/asm-sh/elf.h | 53 +++++++++++++++++++++++++++++++++++++++++++++- include/asm-sh/mmu.h | 4 ++++ include/asm-sh/ptrace.h | 5 +++++ 6 files changed, 108 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index fddb547f3c2..2bc72def5cf 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -240,6 +240,29 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) } break; } +#endif +#ifdef CONFIG_BINFMT_ELF_FDPIC + case PTRACE_GETFDPIC: { + unsigned long tmp = 0; + + switch (addr) { + case PTRACE_GETFDPIC_EXEC: + tmp = child->mm->context.exec_fdpic_loadmap; + break; + case PTRACE_GETFDPIC_INTERP: + tmp = child->mm->context.interp_fdpic_loadmap; + break; + default: + break; + } + + ret = 0; + if (put_user(tmp, (unsigned long *) data)) { + ret = -EFAULT; + break; + } + break; + } #endif default: ret = ptrace_request(child, request, addr, data); diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index f311551d9a0..46170a9a722 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -33,6 +33,11 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) +struct fdpic_func_descriptor { + unsigned long text; + unsigned long GOT; +}; + /* * Atomically swap in the new signal mask, and wait for a signal. 
*/ @@ -378,7 +383,15 @@ static int setup_frame(int sig, struct k_sigaction *ka, regs->regs[4] = signal; /* Arg for signal handler */ regs->regs[5] = 0; regs->regs[6] = (unsigned long) &frame->sc; - regs->pc = (unsigned long) ka->sa.sa_handler; + + if (current->personality & FDPIC_FUNCPTRS) { + struct fdpic_func_descriptor __user *funcptr = + (struct fdpic_func_descriptor __user *)ka->sa.sa_handler; + + __get_user(regs->pc, &funcptr->text); + __get_user(regs->regs[12], &funcptr->GOT); + } else + regs->pc = (unsigned long)ka->sa.sa_handler; set_fs(USER_DS); @@ -458,7 +471,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->regs[4] = signal; /* Arg for signal handler */ regs->regs[5] = (unsigned long) &frame->info; regs->regs[6] = (unsigned long) &frame->uc; - regs->pc = (unsigned long) ka->sa.sa_handler; + + if (current->personality & FDPIC_FUNCPTRS) { + struct fdpic_func_descriptor __user *funcptr = + (struct fdpic_func_descriptor __user *)ka->sa.sa_handler; + + __get_user(regs->pc, &funcptr->text); + __get_user(regs->regs[12], &funcptr->GOT); + } else + regs->pc = (unsigned long)ka->sa.sa_handler; set_fs(USER_DS); diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 3263084eef9..4a551af6f3f 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -30,7 +30,7 @@ config COMPAT_BINFMT_ELF config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y - depends on (FRV || BLACKFIN) + depends on (FRV || BLACKFIN || (SUPERH32 && !MMU)) help ELF FDPIC binaries are based on ELF, but allow the individual load segments of a binary to be located in memory independently of each diff --git a/include/asm-sh/elf.h b/include/asm-sh/elf.h index 05092da1aa5..f01449a8d37 100644 --- a/include/asm-sh/elf.h +++ b/include/asm-sh/elf.h @@ -1,10 +1,15 @@ #ifndef __ASM_SH_ELF_H #define __ASM_SH_ELF_H +#include #include #include #include +/* ELF header e_flags defines */ +#define EF_SH_PIC 0x100 /* -fpic */ +#define EF_SH_FDPIC 0x8000 /* -mfdpic */ + /* SH (particularly SHcompact) relocation types */ #define R_SH_NONE 0 #define R_SH_DIR32 1 @@ -43,6 +48,28 @@ #define R_SH_RELATIVE 165 #define R_SH_GOTOFF 166 #define R_SH_GOTPC 167 + +/* FDPIC relocs */ +#define R_SH_GOT20 70 +#define R_SH_GOTOFF20 71 +#define R_SH_GOTFUNCDESC 72 +#define R_SH_GOTFUNCDESC20 73 +#define R_SH_GOTOFFFUNCDESC 74 +#define R_SH_GOTOFFFUNCDESC20 75 +#define R_SH_FUNCDESC 76 +#define R_SH_FUNCDESC_VALUE 77 + +#if 0 /* XXX - later .. */ +#define R_SH_GOT20 198 +#define R_SH_GOTOFF20 199 +#define R_SH_GOTFUNCDESC 200 +#define R_SH_GOTFUNCDESC20 201 +#define R_SH_GOTOFFFUNCDESC 202 +#define R_SH_GOTOFFFUNCDESC20 203 +#define R_SH_FUNCDESC 204 +#define R_SH_FUNCDESC_VALUE 205 +#endif + /* SHmedia relocs */ #define R_SH_IMM_LOW16 246 #define R_SH_IMM_LOW16_PCREL 247 @@ -77,9 +104,12 @@ typedef struct user_fpu_struct elf_fpregset_t; /* * This is used to ensure we don't load something for the wrong architecture. */ -#define elf_check_arch(x) ( (x)->e_machine == EM_SH ) +#define elf_check_arch(x) ((x)->e_machine == EM_SH) +#define elf_check_fdpic(x) ((x)->e_flags & EF_SH_FDPIC) +#define elf_check_const_displacement(x) ((x)->e_flags & EF_SH_PIC) #define USE_ELF_CORE_DUMP +#define ELF_FDPIC_CORE_EFLAGS EF_SH_FDPIC #define ELF_EXEC_PAGESIZE PAGE_SIZE /* This is the location that an ET_DYN program is loaded if exec'ed. 
Typical @@ -136,6 +166,27 @@ typedef struct user_fpu_struct elf_fpregset_t; _r->regs[8]=0; _r->regs[9]=0; _r->regs[10]=0; _r->regs[11]=0; \ _r->regs[12]=0; _r->regs[13]=0; _r->regs[14]=0; \ _r->sr = SR_FD; } while (0) + +#define ELF_FDPIC_PLAT_INIT(_r, _exec_map_addr, _interp_map_addr, \ + _dynamic_addr) \ +do { \ + _r->regs[0] = 0; \ + _r->regs[1] = 0; \ + _r->regs[2] = 0; \ + _r->regs[3] = 0; \ + _r->regs[4] = 0; \ + _r->regs[5] = 0; \ + _r->regs[6] = 0; \ + _r->regs[7] = 0; \ + _r->regs[8] = _exec_map_addr; \ + _r->regs[9] = _interp_map_addr; \ + _r->regs[10] = _dynamic_addr; \ + _r->regs[11] = 0; \ + _r->regs[12] = 0; \ + _r->regs[13] = 0; \ + _r->regs[14] = 0; \ + _r->sr = SR_FD; \ +} while (0) #endif #define SET_PERSONALITY(ex, ibcs2) set_personality(PER_LINUX_32BIT) diff --git a/include/asm-sh/mmu.h b/include/asm-sh/mmu.h index eb0358c097d..fdcb93bc6d1 100644 --- a/include/asm-sh/mmu.h +++ b/include/asm-sh/mmu.h @@ -12,6 +12,10 @@ typedef struct { struct vm_list_struct *vmlist; unsigned long end_brk; #endif +#ifdef CONFIG_BINFMT_ELF_FDPIC + unsigned long exec_fdpic_loadmap; + unsigned long interp_fdpic_loadmap; +#endif } mm_context_t; /* diff --git a/include/asm-sh/ptrace.h b/include/asm-sh/ptrace.h index 7d36dc3bee6..643ab5a7cf3 100644 --- a/include/asm-sh/ptrace.h +++ b/include/asm-sh/ptrace.h @@ -87,6 +87,11 @@ struct pt_dspregs { unsigned long mod; }; +#define PTRACE_GETFDPIC 31 /* get the ELF fdpic loadmap address */ + +#define PTRACE_GETFDPIC_EXEC 0 /* [addr] request the executable loadmap */ +#define PTRACE_GETFDPIC_INTERP 1 /* [addr] request the interpreter loadmap */ + #define PTRACE_GETDSPREGS 55 #define PTRACE_SETDSPREGS 56 #endif -- cgit v1.2.3-70-g09d2 From ca5b172bd2b2fe489e7ba11cedd46ddf772d132f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 28 Jul 2008 15:46:18 -0700 Subject: exec: include pagemap.h again to fix build Fix compilation errors on avr32 and without CONFIG_SWAP, introduced by ba92a43dbaee339cf5915ef766d3d3ffbaaf103c ("exec: remove some includes") In file included from include/asm/tlb.h:24, from fs/exec.c:55: include/asm-generic/tlb.h: In function 'tlb_flush_mmu': include/asm-generic/tlb.h:76: error: implicit declaration of function 'release_pages' include/asm-generic/tlb.h: In function 'tlb_remove_page': include/asm-generic/tlb.h:105: error: implicit declaration of function 'page_cache_release' make[1]: *** [fs/exec.o] Error 1 This straightforward part-revert is nobody's favourite patch to address the underlying tlb.h needs swap.h needs pagemap.h (but sparc won't like that) mess; but appropriate to fix the build now before any overhaul. Reported-by: Yoichi Yuasa Reported-by: Haavard Skinnemoen Signed-off-by: Hugh Dickins Tested-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 9696bbf0f0b..32993beecbe 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 8ab22b9abb5c55413802e4adc9aa6223324547c3 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 28 Jul 2008 15:46:36 -0700 Subject: vfs: pagecache usage optimization for pagesize!=blocksize When we read some part of a file through pagecache, if there is a pagecache of corresponding index but this page is not uptodate, read IO is issued and this page will be uptodate. 
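In other words, the copy to user space is currently gated on PageUptodate() for the whole page. The patch below adds an is_partially_uptodate() address_space operation so that the gate can instead be a per-buffer check; condensed from the fs/buffer.c hunk further down (a reader's sketch, with the hypothetical name range_is_uptodate() standing in for the real block_is_partially_uptodate(), which additionally derives the range from the read descriptor and refuses ranges touching both the first and last block of the page):

/* Kernel-side sketch; would live next to fs/buffer.c and need linux/buffer_head.h. */
static int range_is_uptodate(struct page *page, unsigned from, unsigned to)
{
	unsigned blocksize = 1 << page->mapping->host->i_blkbits;
	unsigned block_start = 0, block_end;
	struct buffer_head *bh, *head;

	if (!page_has_buffers(page))
		return 0;			/* no per-block state to consult */

	bh = head = page_buffers(page);
	do {
		block_end = block_start + blocksize;
		/* only buffers overlapping [from, to) matter */
		if (block_end > from && block_start < to && !buffer_uptodate(bh))
			return 0;		/* a needed block still requires IO */
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return 1;				/* every needed block is uptodate */
}

The mm/filemap.c hunk at the end of this patch only consults the hook with the page locked and not fully uptodate, so the existing fast path for uptodate pages is unchanged.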
I think this is good for pagesize == blocksize environment but there is room for improvement on pagesize != blocksize environment. Because in this case a page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate. So I suggest that when all buffers which correspond to a part of a file that we want to read are uptodate, use this pagecache and copy data from this pagecache to user buffer even if a page is not uptodate. This can reduce read IO and improve system throughput. I wrote a benchmark program and got result number with this program. This benchmark do: 1: mount and open a test file. 2: create a 512MB file. 3: close a file and umount. 4: mount and again open a test file. 5: pwrite randomly 300000 times on a test file. offset is aligned by IO size(1024bytes). 6: measure time of preading randomly 100000 times on a test file. The result was: 2.6.26 330 sec 2.6.26-patched 226 sec Arch:i386 Filesystem:ext3 Blocksize:1024 bytes Memory: 1GB On ext3/4, a file is written through buffer/block. So random read/write mixed workloads or random read after random write workloads are optimized with this patch under pagesize != blocksize environment. This test result showed this. The benchmark program is as follows: #include #include #include #include #include #include #include #include #include #define LEN 1024 #define LOOP 1024*512 /* 512MB */ main(void) { unsigned long i, offset, filesize; int fd; char buf[LEN]; time_t t1, t2; if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } memset(buf, 0, LEN); fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC); if (fd < 0) { perror("cannot open file\n"); exit(1); } for (i = 0; i < LOOP; i++) write(fd, buf, LEN); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } fd = open("/root/test1/testfile", O_RDWR); if (fd < 0) { perror("cannot open file\n"); exit(1); } filesize = LEN * LOOP; for (i = 0; i < 300000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pwrite(fd, buf, LEN, offset); } printf("start test\n"); time(&t1); for (i = 0; i < 100000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pread(fd, buf, LEN, offset); } time(&t2); printf("%ld sec\n", t2-t1); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } } Signed-off-by: Hisashi Hifumi Cc: Nick Piggin Cc: Christoph Hellwig Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 46 +++++++++++++++++++++++ fs/ext2/inode.c | 1 + fs/ext3/inode.c | 67 +++++++++++++++++---------------- fs/ext4/inode.c | 92 +++++++++++++++++++++++---------------------- include/linux/buffer_head.h | 2 + include/linux/fs.h | 44 +++++++++++----------- mm/filemap.c | 14 ++++++- 7 files changed, 167 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index f9580501963..ca12a6bb82b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2095,6 +2095,52 @@ int generic_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(generic_write_end); +/* + * block_is_partially_uptodate checks whether buffers within a page are + * uptodate or not. + * + * Returns true if all buffers which correspond to a file portion + * we want to read are uptodate. 
+ */ +int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, + unsigned long from) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, block_end, blocksize; + unsigned to; + struct buffer_head *bh, *head; + int ret = 1; + + if (!page_has_buffers(page)) + return 0; + + blocksize = 1 << inode->i_blkbits; + to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = from + to; + if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) + return 0; + + head = page_buffers(page); + bh = head; + block_start = 0; + do { + block_end = block_start + blocksize; + if (block_end > from && block_start < to) { + if (!buffer_uptodate(bh)) { + ret = 0; + break; + } + if (block_end >= to) + break; + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} +EXPORT_SYMBOL(block_is_partially_uptodate); + /* * Generic "read page" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 384fc0d1dd7..991d6dfeb51 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -791,6 +791,7 @@ const struct address_space_operations ext2_aops = { .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; const struct address_space_operations ext2_aops_xip = { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3bf07d70b91..507d8689b11 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1767,44 +1767,47 @@ static int ext3_journalled_set_page_dirty(struct page *page) } static const struct address_space_operations ext3_ordered_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_ordered_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_ordered_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_ordered_writepage, + .sync_page = block_sync_page, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext3_writeback_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = ext3_writeback_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_writeback_write_end, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, - .direct_IO = ext3_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_writeback_writepage, + .sync_page = block_sync_page, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext3_journalled_aops = { - .readpage = ext3_readpage, - .readpages = ext3_readpages, - .writepage = 
ext3_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext3_write_begin, - .write_end = ext3_journalled_write_end, - .set_page_dirty = ext3_journalled_set_page_dirty, - .bmap = ext3_bmap, - .invalidatepage = ext3_invalidatepage, - .releasepage = ext3_releasepage, + .readpage = ext3_readpage, + .readpages = ext3_readpages, + .writepage = ext3_journalled_writepage, + .sync_page = block_sync_page, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, + .set_page_dirty = ext3_journalled_set_page_dirty, + .bmap = ext3_bmap, + .invalidatepage = ext3_invalidatepage, + .releasepage = ext3_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, }; void ext3_set_aops(struct inode *inode) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8ca2763df09..9843b046c23 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2806,59 +2806,63 @@ static int ext4_journalled_set_page_dirty(struct page *page) } static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext4_journalled_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_journalled_write_end, - .set_page_dirty = ext4_journalled_set_page_dirty, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_journalled_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, }; static 
const struct address_space_operations ext4_da_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_da_writepage, - .writepages = ext4_da_writepages, - .sync_page = block_sync_page, - .write_begin = ext4_da_write_begin, - .write_end = ext4_da_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; void ext4_set_aops(struct inode *inode) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 82aa36c53ea..50cfe8ceb47 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,6 +205,8 @@ void block_invalidatepage(struct page *page, unsigned long offset); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); int block_read_full_page(struct page*, get_block_t*); +int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, + unsigned long from); int block_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t*); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8252b045e62..580b513668f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -443,6 +443,27 @@ static inline size_t iov_iter_count(struct iov_iter *i) return i->count; } +/* + * "descriptor" for what we're up to with a read. + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + union { + char __user *buf; + void *data; + } arg; + int error; +} read_descriptor_t; + +typedef int (*read_actor_t)(read_descriptor_t *, struct page *, + unsigned long, unsigned long); struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); @@ -484,6 +505,8 @@ struct address_space_operations { int (*migratepage) (struct address_space *, struct page *, struct page *); int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, read_descriptor_t *, + unsigned long); }; /* @@ -1198,27 +1221,6 @@ struct block_device_operations { struct module *owner; }; -/* - * "descriptor" for what we're up to with a read. - * This allows us to use the same read code yet - * have multiple different users of the data that - * we read from a file. - * - * The simplest case just copies the data to user - * mode. - */ -typedef struct { - size_t written; - size_t count; - union { - char __user * buf; - void *data; - } arg; - int error; -} read_descriptor_t; - -typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); - /* These macros are for out of kernel modules to test that * the kernel supports the unlocked_ioctl and compat_ioctl * fields in struct file_operations. 
*/ diff --git a/mm/filemap.c b/mm/filemap.c index 5de7633e1db..42bbc6909ba 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1023,8 +1023,17 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (TestSetPageLocked(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -1094,6 +1103,7 @@ page_not_up_to_date: if (lock_page_killable(page)) goto readpage_eio; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); -- cgit v1.2.3-70-g09d2 From e3b6e806cf7e45ac5e6ac0625cebafa4de3394aa Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Mon, 28 Jul 2008 15:46:37 -0700 Subject: bio-integrity: remove EXPORT_SYMBOL for bio_integrity_init_slab() I got section mismatch message about bio_integrity_init_slab(). WARNING: fs/built-in.o(__ksymtab+0xb60): Section mismatch in reference from the variable __ksymtab_bio_integrity_init_slab to the function .init.text:bio_integrity_init_slab() The symbol bio_integrity_init_slab is exported and annotated __init Fix this by removing the __init annotation of bio_integrity_init_slab or drop the export. It only call from init_bio(). The EXPORT_SYMBOL() can be removed. Signed-off-by: Yoichi Yuasa Cc: "Martin K. Petersen" Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/bio-integrity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 63e2ee63058..c3e174b35fe 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -705,7 +705,6 @@ void __init bio_integrity_init_slab(void) bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, SLAB_HWCACHE_ALIGN|SLAB_PANIC); } -EXPORT_SYMBOL(bio_integrity_init_slab); static int __init integrity_init(void) { -- cgit v1.2.3-70-g09d2 From 7fcba054373d5dfc43d26e243a5c9b92069972ee Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 28 Jul 2008 15:46:39 -0700 Subject: eCryptfs: use page_alloc not kmalloc to get a page of memory With SLUB debugging turned on in 2.6.26, I was getting memory corruption when testing eCryptfs. The root cause turned out to be that eCryptfs was doing kmalloc(PAGE_CACHE_SIZE); virt_to_page() and treating that as a nice page-aligned chunk of memory. But at least with SLUB debugging on, this is not always true, and the page we get from virt_to_page does not necessarily match the PAGE_CACHE_SIZE worth of memory we got from kmalloc. My simple testcase was 2 loops doing "rm -f fileX; cp /tmp/fileX ." for 2 different multi-megabyte files. With this change I no longer see the corruption. 
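Condensed, the shape of the fix in the crypto.c hunks below is: take a whole page from the page allocator instead of assuming a kmalloc(PAGE_CACHE_SIZE) buffer is page-aligned, and kmap()/kunmap() it around use (a reader's sketch of the common pattern in ecryptfs_encrypt_page()/ecryptfs_decrypt_page(), extent loop and error reporting omitted):

	struct page *enc_extent_page;
	char *enc_extent_virt;

	enc_extent_page = alloc_page(GFP_USER);		/* a real, whole page */
	if (!enc_extent_page)
		return -ENOMEM;
	enc_extent_virt = kmap(enc_extent_page);	/* mapping for the extent loop */

	/* ... encrypt or decrypt extents through enc_extent_virt ... */

	kunmap(enc_extent_page);
	__free_page(enc_extent_page);

alloc_page() guarantees page alignment by construction, so the virt_to_page() assumption that broke under SLUB debugging disappears entirely.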
Signed-off-by: Eric Sandeen Acked-by: Michael Halcrow Acked-by: Rik van Riel Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 7b99917ffad..06db79d05c1 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -475,8 +475,8 @@ int ecryptfs_encrypt_page(struct page *page) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; - char *enc_extent_virt = NULL; - struct page *enc_extent_page; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; loff_t extent_offset; int rc = 0; @@ -492,14 +492,14 @@ int ecryptfs_encrypt_page(struct page *page) page->index); goto out; } - enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER); - if (!enc_extent_virt) { + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Error allocating memory for " "encrypted extent\n"); goto out; } - enc_extent_page = virt_to_page(enc_extent_virt); + enc_extent_virt = kmap(enc_extent_page); for (extent_offset = 0; extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); extent_offset++) { @@ -527,7 +527,10 @@ int ecryptfs_encrypt_page(struct page *page) } } out: - kfree(enc_extent_virt); + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } return rc; } @@ -609,8 +612,8 @@ int ecryptfs_decrypt_page(struct page *page) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; - char *enc_extent_virt = NULL; - struct page *enc_extent_page; + char *enc_extent_virt; + struct page *enc_extent_page = NULL; unsigned long extent_offset; int rc = 0; @@ -627,14 +630,14 @@ int ecryptfs_decrypt_page(struct page *page) page->index); goto out; } - enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER); - if (!enc_extent_virt) { + enc_extent_page = alloc_page(GFP_USER); + if (!enc_extent_page) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Error allocating memory for " "encrypted extent\n"); goto out; } - enc_extent_page = virt_to_page(enc_extent_virt); + enc_extent_virt = kmap(enc_extent_page); for (extent_offset = 0; extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); extent_offset++) { @@ -662,7 +665,10 @@ int ecryptfs_decrypt_page(struct page *page) } } out: - kfree(enc_extent_virt); + if (enc_extent_page) { + kunmap(enc_extent_page); + __free_page(enc_extent_page); + } return rc; } -- cgit v1.2.3-70-g09d2
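As a closing usage note on the PTRACE_GETFDPIC request introduced by the SH FDPIC patch earlier in this series: from user space, a debugger would fetch the executable's loadmap address roughly as sketched below. This is a hedged example, not code from the series; the constants are copied from the patched include/asm-sh/ptrace.h (they are not in libc headers of this era), and get_exec_loadmap() is a hypothetical helper name.

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

/* Values from the patched include/asm-sh/ptrace.h. */
#define PTRACE_GETFDPIC		31	/* get the ELF fdpic loadmap address */
#define PTRACE_GETFDPIC_EXEC	0	/* request the executable loadmap */
#define PTRACE_GETFDPIC_INTERP	1	/* request the interpreter loadmap */

/* Fetch the executable's FDPIC loadmap address from a traced, stopped child;
 * the kernel handler put_user()s the address into 'addr'. */
static unsigned long get_exec_loadmap(pid_t child)
{
	unsigned long addr = 0;

	if (ptrace(PTRACE_GETFDPIC, child, (void *) PTRACE_GETFDPIC_EXEC, &addr) < 0)
		perror("PTRACE_GETFDPIC");
	return addr;
}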