183 files changed, 4329 insertions, 4020 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index a42f767dcdd..8ea7b04c661 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1734,6 +1734,18 @@ config SUNRPC
 config SUNRPC_GSS
 	tristate
 
+config SUNRPC_BIND34
+	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	help
+	  Provides kernel support for querying rpcbind servers via versions 3
+	  and 4 of the rpcbind protocol.  The kernel automatically falls back
+	  to version 2 if a remote rpcbind service does not support versions
+	  3 or 4.
+
+	  If unsure, say N to get traditional behavior (version 2 rpcbind
+	  requests only).
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
@@ -2020,7 +2032,6 @@ config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
 	select AF_RXRPC
-	select KEYS
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f3d3d81eb7e..74c64409ddb 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -26,7 +26,7 @@ config BINFMT_ELF
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
-	depends on FRV
+	depends on (FRV || BLACKFIN)
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
 	  segments of a binary to be located in memory independently of each
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2e5f2c8371e..30c29650849 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -232,8 +232,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c3986a1911b..beff7d21e6e 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -87,8 +87,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		init_MUTEX(&ei->i_link_lock);
 		init_MUTEX(&ei->i_ext_lock);
 		inode_init_once(&ei->vfs_inode);
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 01545eb1d87..cf83e5d6351 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -18,7 +18,7 @@ kafs-objs := \
 	security.o \
 	server.o \
 	super.o \
-	use-rtnetlink.o \
+	netdevices.o \
 	vlclient.o \
 	vlocation.o \
 	vnode.o \
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 639399f0ab6..9bdbf36a9aa 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -468,7 +468,7 @@ int __init afs_callback_update_init(void)
 /*
  * shut down the callback update process
  */
-void __exit afs_callback_update_kill(void)
+void afs_callback_update_kill(void)
 {
 	destroy_workqueue(afs_callback_update_worker);
 }
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 6685f4cbccb..d5b2ad6575b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -443,6 +443,7 @@ static void SRXAFSCB_GetCapabilities(struct work_struct *work)
 			reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
 			reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
 		}
+		kfree(ifs);
 	}
 
 	reply.cap.capcount = htonl(1);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index dac5b990c0c..0c1e902f17a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -194,10 +194,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
 
 	page = read_mapping_page(dir->i_mapping, index, &file);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			afs_dir_check_page(dir, page);
 		if (PageError(page))
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 2393d2a08d7..e54e6c2ad34 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -266,7 +266,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		call->unmarshall++;
 
 		if (call->count < PAGE_SIZE) {
-			buffer = kmap_atomic(call->reply3, KM_USER0);
+			page = call->reply3;
+			buffer = kmap_atomic(page, KM_USER0);
 			memset(buffer + PAGE_SIZE - call->count, 0,
 			       call->count);
 			kunmap_atomic(buffer, KM_USER0);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 34665f7d7a1..d90c158cd93 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -349,7 +349,6 @@ struct afs_permits {
  * record of one of a system's set of network interfaces
  */
 struct afs_interface {
-	unsigned	index;		/* interface index */
 	struct in_addr	address;	/* IPv4 address bound to interface */
 	struct in_addr	netmask;	/* netmask applied to address */
 	unsigned	mtu;		/* MTU of interface */
@@ -392,7 +391,7 @@ extern void afs_give_up_callback(struct afs_vnode *);
 extern void afs_dispatch_give_up_callbacks(struct work_struct *);
 extern void afs_flush_callback_breaks(struct afs_server *);
 extern int __init afs_callback_update_init(void);
-extern void __exit afs_callback_update_kill(void);
+extern void afs_callback_update_kill(void);
 
 /*
  * cell.c
@@ -564,7 +563,7 @@ extern void afs_fs_exit(void);
  * use-rtnetlink.c
  */
 extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
-extern int afs_get_MAC_address(u8 [6]);
+extern int afs_get_MAC_address(u8 *, size_t);
 
 /*
  * vlclient.c
@@ -591,7 +590,7 @@ extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
 						  struct key *,
 						  const char *, size_t);
 extern void afs_put_vlocation(struct afs_vlocation *);
-extern void __exit afs_vlocation_purge(void);
+extern void afs_vlocation_purge(void);
 
 /*
  * vnode.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 40c2704e755..80ec6fd19a7 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -54,7 +54,7 @@ static int __init afs_get_client_UUID(void)
 
 	/* read the MAC address of one of the external interfaces and construct
 	 * a UUID from it */
-	ret = afs_get_MAC_address(afs_uuid.node);
+	ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node));
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b905ae37f91..034fcfd4e33 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -68,13 +68,11 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 	}
 
 	ret = -EIO;
-	wait_on_page_locked(page);
-	buf = kmap(page);
-	if (!PageUptodate(page))
-		goto out_free;
 	if (PageError(page))
 		goto out_free;
 
+	buf = kmap(page);
+
 	/* examine the symlink's contents */
 	size = vnode->status.size;
 	_debug("symlink to %*.*s", (int) size, (int) size, buf);
@@ -91,8 +89,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 
 	ret = 0;
 
-out_free:
 	kunmap(page);
+out_free:
 	page_cache_release(page);
 out:
 	_leave(" = %d", ret);
@@ -171,8 +169,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	}
 
 	ret = -EIO;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page) || PageError(page))
+	if (PageError(page))
 		goto error;
 
 	buf = kmap(page);
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
new file mode 100644
index 00000000000..fc27d4b52e5
--- /dev/null
+++ b/fs/afs/netdevices.c
@@ -0,0 +1,68 @@
+/* AFS network device helpers
+ *
+ * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/string.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include "internal.h"
+
+/*
+ * get a MAC address from a random ethernet interface that has a real one
+ * - the buffer will normally be 6 bytes in size
+ */
+int afs_get_MAC_address(u8 *mac, size_t maclen)
+{
+	struct net_device *dev;
+	int ret = -ENODEV;
+
+	if (maclen != ETH_ALEN)
+		BUG();
+
+	rtnl_lock();
+	dev = __dev_getfirstbyhwtype(ARPHRD_ETHER);
+	if (dev) {
+		memcpy(mac, dev->dev_addr, maclen);
+		ret = 0;
+	}
+	rtnl_unlock();
+	return ret;
+}
+
+/*
+ * get a list of this system's interface IPv4 addresses, netmasks and MTUs
+ * - maxbufs must be at least 1
+ * - returns the number of interface records in the buffer
+ */
+int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
+			    bool wantloopback)
+{
+	struct net_device *dev;
+	struct in_device *idev;
+	int n = 0;
+
+	ASSERT(maxbufs > 0);
+
+	rtnl_lock();
+	for_each_netdev(dev) {
+		if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
+			continue;
+		idev = __in_dev_get_rtnl(dev);
+		if (!idev)
+			continue;
+		for_primary_ifa(idev) {
+			bufs[n].address.s_addr = ifa->ifa_address;
+			bufs[n].netmask.s_addr = ifa->ifa_mask;
+			bufs[n].mtu = dev->mtu;
+			n++;
+			if (n >= maxbufs)
+				goto out;
+		} endfor_ifa(idev);
+	}
+out:
+	rtnl_unlock();
+	return n;
+}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index cebd03c91f5..7030d76155f 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/parser.h>
 #include "internal.h"
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
@@ -42,7 +43,7 @@ struct file_system_type afs_fs_type = {
 	.name		= "afs",
 	.get_sb		= afs_get_sb,
 	.kill_sb	= kill_anon_super,
-	.fs_flags	= FS_BINARY_MOUNTDATA,
+	.fs_flags	= 0,
 };
 
 static const struct super_operations afs_super_ops = {
@@ -58,6 +59,20 @@ static const struct super_operations afs_super_ops = {
 static struct kmem_cache *afs_inode_cachep;
 static atomic_t afs_count_active_inodes;
 
+enum {
+	afs_no_opt,
+	afs_opt_cell,
+	afs_opt_rwpath,
+	afs_opt_vol,
+};
+
+static const match_table_t afs_options_list = {
+	{ afs_opt_cell,		"cell=%s"	},
+	{ afs_opt_rwpath,	"rwpath"	},
+	{ afs_opt_vol,		"vol=%s"	},
+	{ afs_no_opt,		NULL		},
+};
+
 /*
  * initialise the filesystem
  */
@@ -115,31 +130,6 @@ void __exit afs_fs_exit(void)
 }
 
 /*
- * check that an argument has a value
- */
-static int want_arg(char **_value, const char *option)
-{
-	if (!_value || !*_value || !**_value) {
-		printk(KERN_NOTICE "kAFS: %s: argument missing\n", option);
-		return 0;
-	}
-	return 1;
-}
-
-/*
- * check that there's no subsequent value
- */
-static int want_no_value(char *const *_value, const char *option)
-{
-	if (*_value && **_value) {
-		printk(KERN_NOTICE "kAFS: %s: Invalid argument: %s\n",
-		       option, *_value);
-		return 0;
-	}
-	return 1;
-}
-
-/*
  * parse the mount options
  * - this function has been shamelessly adapted from the ext3 fs which
  *   shamelessly adapted it from the msdos fs
@@ -148,48 +138,46 @@ static int afs_parse_options(struct afs_mount_params *params,
 			     char *options, const char **devname)
 {
 	struct afs_cell *cell;
-	char *key, *value;
-	int ret;
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+	int token;
 
 	_enter("%s", options);
 
 	options[PAGE_SIZE - 1] = 0;
 
-	ret = 0;
-	while ((key = strsep(&options, ","))) {
-		value = strchr(key, '=');
-		if (value)
-			*value++ = 0;
-
-		_debug("kAFS: KEY: %s, VAL:%s", key, value ?: "-");
+	while ((p = strsep(&options, ","))) {
+		if (!*p)
+			continue;
 
-		if (strcmp(key, "rwpath") == 0) {
-			if (!want_no_value(&value, "rwpath"))
-				return -EINVAL;
-			params->rwpath = 1;
-		} else if (strcmp(key, "vol") == 0) {
-			if (!want_arg(&value, "vol"))
-				return -EINVAL;
-			*devname = value;
-		} else if (strcmp(key, "cell") == 0) {
-			if (!want_arg(&value, "cell"))
-				return -EINVAL;
-			cell = afs_cell_lookup(value, strlen(value));
+		token = match_token(p, afs_options_list, args);
+		switch (token) {
+		case afs_opt_cell:
+			cell = afs_cell_lookup(args[0].from,
+					       args[0].to - args[0].from);
 			if (IS_ERR(cell))
 				return PTR_ERR(cell);
 			afs_put_cell(params->cell);
 			params->cell = cell;
-		} else {
-			printk("kAFS: Unknown mount option: '%s'\n",  key);
-			ret = -EINVAL;
-			goto error;
+			break;
+
+		case afs_opt_rwpath:
+			params->rwpath = 1;
+			break;
+
+		case afs_opt_vol:
+			*devname = args[0].from;
+			break;
+
+		default:
+			printk(KERN_ERR "kAFS:"
+			       " Unknown or invalid mount option: '%s'\n", p);
+			return -EINVAL;
 		}
 	}
 
-	ret = 0;
-error:
-	_leave(" = %d", ret);
-	return ret;
+	_leave(" = 0");
+	return 0;
 }
 
 /*
@@ -361,7 +349,6 @@ error:
 
 /*
  * get an AFS superblock
- * - TODO: don't use get_sb_nodev(), but rather call sget() directly
  */
 static int afs_get_sb(struct file_system_type *fs_type,
 		      int flags,
@@ -386,7 +373,6 @@ static int afs_get_sb(struct file_system_type *fs_type,
 			goto error;
 	}
 
-
 	ret = afs_parse_device_name(&params, dev_name);
 	if (ret < 0)
 		goto error;
@@ -467,8 +453,7 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
 {
 	struct afs_vnode *vnode = _vnode;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		memset(vnode, 0, sizeof(*vnode));
 		inode_init_once(&vnode->vfs_inode);
 		init_waitqueue_head(&vnode->update_waitq);
diff --git a/fs/afs/use-rtnetlink.c b/fs/afs/use-rtnetlink.c
deleted file mode 100644
index f8991c700e0..00000000000
--- a/fs/afs/use-rtnetlink.c
+++ /dev/null
@@ -1,473 +0,0 @@
-/* RTNETLINK client
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/if_addr.h>
-#include <linux/if_arp.h>
-#include <linux/inetdevice.h>
-#include <net/netlink.h>
-#include "internal.h"
-
-struct afs_rtm_desc {
-	struct socket		*nlsock;
-	struct afs_interface	*bufs;
-	u8			*mac;
-	size_t			nbufs;
-	size_t			maxbufs;
-	void			*data;
-	ssize_t			datalen;
-	size_t			datamax;
-	int			msg_seq;
-	unsigned		mac_index;
-	bool			wantloopback;
-	int (*parse)(struct afs_rtm_desc *, struct nlmsghdr *);
-};
-
-/*
- * parse an RTM_GETADDR response
- */
-static int afs_rtm_getaddr_parse(struct afs_rtm_desc *desc,
-				 struct nlmsghdr *nlhdr)
-{
-	struct afs_interface *this;
-	struct ifaddrmsg *ifa;
-	struct rtattr *rtattr;
-	const char *name;
-	size_t len;
-
-	ifa = (struct ifaddrmsg *) NLMSG_DATA(nlhdr);
-
-	_enter("{ix=%d,af=%d}", ifa->ifa_index, ifa->ifa_family);
-
-	if (ifa->ifa_family != AF_INET) {
-		_leave(" = 0 [family %d]", ifa->ifa_family);
-		return 0;
-	}
-	if (desc->nbufs >= desc->maxbufs) {
-		_leave(" = 0 [max %zu/%zu]", desc->nbufs, desc->maxbufs);
-		return 0;
-	}
-
-	this = &desc->bufs[desc->nbufs];
-
-	this->index = ifa->ifa_index;
-	this->netmask.s_addr = inet_make_mask(ifa->ifa_prefixlen);
-	this->mtu = 0;
-
-	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifaddrmsg));
-	len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifaddrmsg));
-
-	name = "unknown";
-	for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
-		switch (rtattr->rta_type) {
-		case IFA_ADDRESS:
-			memcpy(&this->address, RTA_DATA(rtattr), 4);
-			break;
-		case IFA_LABEL:
-			name = RTA_DATA(rtattr);
-			break;
-		}
-	}
-
-	_debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT,
-	       name, NIPQUAD(this->address), NIPQUAD(this->netmask));
-
-	desc->nbufs++;
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * parse an RTM_GETLINK response for MTUs
- */
-static int afs_rtm_getlink_if_parse(struct afs_rtm_desc *desc,
-				    struct nlmsghdr *nlhdr)
-{
-	struct afs_interface *this;
-	struct ifinfomsg *ifi;
-	struct rtattr *rtattr;
-	const char *name;
-	size_t len, loop;
-
-	ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
-
-	_enter("{ix=%d}", ifi->ifi_index);
-
-	for (loop = 0; loop < desc->nbufs; loop++) {
-		this = &desc->bufs[loop];
-		if (this->index == ifi->ifi_index)
-			goto found;
-	}
-
-	_leave(" = 0 [no match]");
-	return 0;
-
-found:
-	if (ifi->ifi_type == ARPHRD_LOOPBACK && !desc->wantloopback) {
-		_leave(" = 0 [loopback]");
-		return 0;
-	}
-
-	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
-	len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
-
-	name = "unknown";
-	for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
-		switch (rtattr->rta_type) {
-		case IFLA_MTU:
-			memcpy(&this->mtu, RTA_DATA(rtattr), 4);
-			break;
-		case IFLA_IFNAME:
-			name = RTA_DATA(rtattr);
-			break;
-		}
-	}
-
-	_debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
-	       name, NIPQUAD(this->address), NIPQUAD(this->netmask),
-	       this->mtu);
-
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * parse an RTM_GETLINK response for the MAC address belonging to the lowest
- * non-internal interface
- */
-static int afs_rtm_getlink_mac_parse(struct afs_rtm_desc *desc,
-				     struct nlmsghdr *nlhdr)
-{
-	struct ifinfomsg *ifi;
-	struct rtattr *rtattr;
-	const char *name;
-	size_t remain, len;
-	bool set;
-
-	ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
-
-	_enter("{ix=%d}", ifi->ifi_index);
-
-	if (ifi->ifi_index >= desc->mac_index) {
-		_leave(" = 0 [high]");
-		return 0;
-	}
-	if (ifi->ifi_type == ARPHRD_LOOPBACK) {
-		_leave(" = 0 [loopback]");
-		return 0;
-	}
-
-	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
-	remain = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
-
-	name = "unknown";
-	set = false;
-	for (; RTA_OK(rtattr, remain); rtattr = RTA_NEXT(rtattr, remain)) {
-		switch (rtattr->rta_type) {
-		case IFLA_ADDRESS:
-			len = RTA_PAYLOAD(rtattr);
-			memcpy(desc->mac, RTA_DATA(rtattr),
-			       min_t(size_t, len, 6));
-			desc->mac_index = ifi->ifi_index;
-			set = true;
-			break;
-		case IFLA_IFNAME:
-			name = RTA_DATA(rtattr);
-			break;
-		}
-	}
-
-	if (set)
-		_debug("%s: %02x:%02x:%02x:%02x:%02x:%02x",
-		       name,
-		       desc->mac[0], desc->mac[1], desc->mac[2],
-		       desc->mac[3], desc->mac[4], desc->mac[5]);
-
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * read the rtnetlink response and pass to parsing routine
- */
-static int afs_read_rtm(struct afs_rtm_desc *desc)
-{
-	struct nlmsghdr *nlhdr, tmphdr;
-	struct msghdr msg;
-	struct kvec iov[1];
-	void *data;
-	bool last = false;
-	int len, ret, remain;
-
-	_enter("");
-
-	do {
-		/* first of all peek to see how big the packet is */
-		memset(&msg, 0, sizeof(msg));
-		iov[0].iov_base = &tmphdr;
-		iov[0].iov_len = sizeof(tmphdr);
-		len = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
-				     sizeof(tmphdr), MSG_PEEK | MSG_TRUNC);
-		if (len < 0) {
-			_leave(" = %d [peek]", len);
-			return len;
-		}
-		if (len == 0)
-			continue;
-		if (len < sizeof(tmphdr) || len < NLMSG_PAYLOAD(&tmphdr, 0)) {
-			_leave(" = -EMSGSIZE");
-			return -EMSGSIZE;
-		}
-
-		if (desc->datamax < len) {
-			kfree(desc->data);
-			desc->data = NULL;
-			data = kmalloc(len, GFP_KERNEL);
-			if (!data)
-				return -ENOMEM;
-			desc->data = data;
-		}
-		desc->datamax = len;
-
-		/* read all the data from this packet */
-		iov[0].iov_base = desc->data;
-		iov[0].iov_len = desc->datamax;
-		desc->datalen = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
-					       desc->datamax, 0);
-		if (desc->datalen < 0) {
-			_leave(" = %zd [recv]", desc->datalen);
-			return desc->datalen;
-		}
-
-		nlhdr = desc->data;
-
-		/* check if the header is valid */
-		if (!NLMSG_OK(nlhdr, desc->datalen) ||
-		    nlhdr->nlmsg_type == NLMSG_ERROR) {
-			_leave(" = -EIO");
-			return -EIO;
-		}
-
-		/* see if this is the last message */
-		if (nlhdr->nlmsg_type == NLMSG_DONE ||
-		    !(nlhdr->nlmsg_flags & NLM_F_MULTI))
-			last = true;
-
-		/* parse the bits we got this time */
-		nlmsg_for_each_msg(nlhdr, desc->data, desc->datalen, remain) {
-			ret = desc->parse(desc, nlhdr);
-			if (ret < 0) {
-				_leave(" = %d [parse]", ret);
-				return ret;
-			}
-		}
-
-	} while (!last);
-
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * list the interface bound addresses to get the address and netmask
- */
-static int afs_rtm_getaddr(struct afs_rtm_desc *desc)
-{
-	struct msghdr msg;
-	struct kvec iov[1];
-	int ret;
-
-	struct {
-		struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-		struct ifaddrmsg addr_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-	} request;
-
-	_enter("");
-
-	memset(&request, 0, sizeof(request));
-
-	request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
-	request.nl_msg.nlmsg_type = RTM_GETADDR;
-	request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
-	request.nl_msg.nlmsg_seq = desc->msg_seq++;
-	request.nl_msg.nlmsg_pid = 0;
-
-	memset(&msg, 0, sizeof(msg));
-	iov[0].iov_base = &request;
-	iov[0].iov_len = sizeof(request);
-
-	ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * list the interface link statuses to get the MTUs
- */
-static int afs_rtm_getlink(struct afs_rtm_desc *desc)
-{
-	struct msghdr msg;
-	struct kvec iov[1];
-	int ret;
-
-	struct {
-		struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-		struct ifinfomsg link_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-	} request;
-
-	_enter("");
-
-	memset(&request, 0, sizeof(request));
-
-	request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-	request.nl_msg.nlmsg_type = RTM_GETLINK;
-	request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
-	request.nl_msg.nlmsg_seq = desc->msg_seq++;
-	request.nl_msg.nlmsg_pid = 0;
-
-	memset(&msg, 0, sizeof(msg));
-	iov[0].iov_base = &request;
-	iov[0].iov_len = sizeof(request);
-
-	ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * cull any interface records for which there isn't an MTU value
- */
-static void afs_cull_interfaces(struct afs_rtm_desc *desc)
-{
-	struct afs_interface *bufs = desc->bufs;
-	size_t nbufs = desc->nbufs;
-	int loop, point = 0;
-
-	_enter("{%zu}", nbufs);
-
-	for (loop = 0; loop < nbufs; loop++) {
-		if (desc->bufs[loop].mtu != 0) {
-			if (loop != point) {
-				ASSERTCMP(loop, >, point);
-				bufs[point] = bufs[loop];
-			}
-			point++;
-		}
-	}
-
-	desc->nbufs = point;
-	_leave(" [%zu/%zu]", desc->nbufs, nbufs);
-}
-
-/*
- * get a list of this system's interface IPv4 addresses, netmasks and MTUs
- * - returns the number of interface records in the buffer
- */
-int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
-			    bool wantloopback)
-{
-	struct afs_rtm_desc desc;
-	int ret, loop;
-
-	_enter("");
-
-	memset(&desc, 0, sizeof(desc));
-	desc.bufs = bufs;
-	desc.maxbufs = maxbufs;
-	desc.wantloopback = wantloopback;
-
-	ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
-			       &desc.nlsock);
-	if (ret < 0) {
-		_leave(" = %d [sock]", ret);
-		return ret;
-	}
-
-	/* issue RTM_GETADDR */
-	desc.parse = afs_rtm_getaddr_parse;
-	ret = afs_rtm_getaddr(&desc);
-	if (ret < 0)
-		goto error;
-	ret = afs_read_rtm(&desc);
-	if (ret < 0)
-		goto error;
-
-	/* issue RTM_GETLINK */
-	desc.parse = afs_rtm_getlink_if_parse;
-	ret = afs_rtm_getlink(&desc);
-	if (ret < 0)
-		goto error;
-	ret = afs_read_rtm(&desc);
-	if (ret < 0)
-		goto error;
-
-	afs_cull_interfaces(&desc);
-	ret = desc.nbufs;
-
-	for (loop = 0; loop < ret; loop++)
-		_debug("[%d] "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
-		       bufs[loop].index,
-		       NIPQUAD(bufs[loop].address),
-		       NIPQUAD(bufs[loop].netmask),
-		       bufs[loop].mtu);
-
-error:
-	kfree(desc.data);
-	sock_release(desc.nlsock);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * get a MAC address from a random ethernet interface that has a real one
- * - the buffer should be 6 bytes in size
- */
-int afs_get_MAC_address(u8 mac[6])
-{
-	struct afs_rtm_desc desc;
-	int ret;
-
-	_enter("");
-
-	memset(&desc, 0, sizeof(desc));
-	desc.mac = mac;
-	desc.mac_index = UINT_MAX;
-
-	ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
-			       &desc.nlsock);
-	if (ret < 0) {
-		_leave(" = %d [sock]", ret);
-		return ret;
-	}
-
-	/* issue RTM_GETLINK */
-	desc.parse = afs_rtm_getlink_mac_parse;
-	ret = afs_rtm_getlink(&desc);
-	if (ret < 0)
-		goto error;
-	ret = afs_read_rtm(&desc);
-	if (ret < 0)
-		goto error;
-
-	if (desc.mac_index < UINT_MAX) {
-		/* got a MAC address */
-		_debug("[%d] %02x:%02x:%02x:%02x:%02x:%02x",
-		       desc.mac_index,
-		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
-	} else {
-		ret = -ENONET;
-	}
-
-error:
-	sock_release(desc.nlsock);
-	_leave(" = %d", ret);
-	return ret;
-}
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6c8e95a7c2c..3370cdb7256 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -602,7 +602,7 @@ int __init afs_vlocation_update_init(void)
 /*
  * discard all the volume location records for rmmod
  */
-void __exit afs_vlocation_purge(void)
+void afs_vlocation_purge(void)
 {
 	afs_vlocation_timeout = 0;
 
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6d49d..b97ab8028b6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,10 +68,8 @@ static void aio_queue_work(struct kioctx *);
  */
 static int __init aio_setup(void)
 {
-	kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
-				0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
-	kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
-				0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index cc6cc8ed2e3..fe96108a788 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -293,8 +293,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
         struct befs_inode_info *bi = (struct befs_inode_info *) foo;
 	
-	        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-		            SLAB_CTOR_CONSTRUCTOR) {
+	        if (flags & SLAB_CTOR_CONSTRUCTOR) {
 			inode_init_once(&bi->vfs_inode);
 		}
 }
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 93d6219243a..edc08d89aab 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,8 +248,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct bfs_inode_info *bi = foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&bi->vfs_inode);
 }
  
diff --git a/fs/bio.c b/fs/bio.c
index 693940da409..093345f0012 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1193,8 +1193,7 @@ static void __init biovec_init_slabs(void)
 
 static int __init init_bio(void)
 {
-	bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
-				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	biovec_init_slabs();
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 575076c018f..f02b7bdd986 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,10 +55,12 @@ static sector_t max_block(struct block_device *bdev)
 	return retval;
 }
 
-/* Kill _all_ buffers, dirty or not.. */
+/* Kill _all_ buffers and pagecache , dirty or not.. */
 static void kill_bdev(struct block_device *bdev)
 {
-	invalidate_bdev(bdev, 1);
+	if (bdev->bd_inode->i_mapping->nrpages == 0)
+		return;
+	invalidate_bh_lrus();
 	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
 }	
 
@@ -455,9 +457,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
 	struct block_device *bdev = &ei->bdev;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-	{
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		memset(bdev, 0, sizeof(*bdev));
 		mutex_init(&bdev->bd_mutex);
 		sema_init(&bdev->bd_mount_sem, 1);
@@ -1478,7 +1478,7 @@ int __invalidate_device(struct block_device *bdev)
 		res = invalidate_inodes(sb);
 		drop_super(sb);
 	}
-	invalidate_bdev(bdev, 0);
+	invalidate_bdev(bdev);
 	return res;
 }
 EXPORT_SYMBOL(__invalidate_device);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1d0852fa728..7db24b9e544 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -44,7 +44,6 @@
 #include <linux/bit_spinlock.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static void invalidate_bh_lrus(void);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -333,7 +332,7 @@ out:
    we think the disk contains more recent information than the buffercache.
    The update == 1 pass marks the buffers we need to update, the update == 2
    pass does the actual I/O. */
-void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
+void invalidate_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
@@ -341,11 +340,6 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 		return;
 
 	invalidate_bh_lrus();
-	/*
-	 * FIXME: what about destroy_dirty_buffers?
-	 * We really want to use invalidate_inode_pages2() for
-	 * that, but not until that's cleaned up.
-	 */
 	invalidate_mapping_pages(mapping, 0, -1);
 }
 
@@ -1408,7 +1402,7 @@ static void invalidate_bh_lru(void *arg)
 	put_cpu_var(bh_lrus);
 }
 	
-static void invalidate_bh_lrus(void)
+void invalidate_bh_lrus(void)
 {
 	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
 }
@@ -1700,17 +1694,8 @@ done:
 		 * clean.  Someone wrote them back by hand with
 		 * ll_rw_block/submit_bh.  A rare case.
 		 */
-		int uptodate = 1;
-		do {
-			if (!buffer_uptodate(bh)) {
-				uptodate = 0;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		if (uptodate)
-			SetPageUptodate(page);
 		end_page_writeback(page);
+
 		/*
 		 * The page and buffer_heads can be released at any time from
 		 * here on.
@@ -2968,8 +2953,7 @@ EXPORT_SYMBOL(free_buffer_head);
 static void
 init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
 {
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-			    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		struct buffer_head * bh = (struct buffer_head *)data;
 
 		memset(bh, 0, sizeof(*bh));
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 5d1f4873d70..a9b6bc5157b 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,4 +1,16 @@
-Verison 1.48
+Version 1.49
+------------
+IPv6 support.  Enable ipv6 addresses to be passed on mount (put the ipv6
+address after the "ip=" mount option, at least until mount.cifs is fixed to
+handle DNS host to ipv6 name translation).  Accept override of uid or gid
+on mount even when Unix Extensions are negotiated (it used to be ignored
+when Unix Extensions were ignored).  This allows users to override the
+default uid and gid for files when they are certain that the uids or
+gids on the server do not match those of the client.  Make "sec=none"
+mount override username (so that null user connection is attempted)
+to match what documentation said.
+
+Version 1.48
 ------------
 Fix mtime bouncing around from local idea of last write times to remote time.
 Fix hang (in i_size_read) when simultaneous size update of same remote file
@@ -9,7 +21,13 @@ from read-only back to read-write, reflect this change in default file mode
 (we had been leaving a file's mode read-only until the inode were reloaded).
 Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
 when archive dos attribute not set and we are changing mode back to writeable
-on server which does not support the Unix Extensions).
+on server which does not support the Unix Extensions).  Remove read only dos
+attribute on chmod when adding any write permission (ie on any of
+user/group/other (not all of user/group/other ie  0222) when
+mounted to windows.  Add support for POSIX MkDir (slight performance
+enhancement and eliminates the network race between the mkdir and set 
+path info of the mode).
+
 
 Version 1.47
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 080c5eba112..4d01697722c 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -257,13 +257,19 @@ A partial list of the supported mount options follows:
 		mount.	
   domain	Set the SMB/CIFS workgroup name prepended to the
 		username during CIFS session establishment
-  uid		If CIFS Unix extensions are not supported by the server
-		this overrides the default uid for inodes. For mounts to
-		servers which do support the CIFS Unix extensions, such
-		as a properly configured Samba server, the server provides
-		the uid, gid and mode.  For servers which do not support
-		the Unix extensions, the default uid (and gid) returned on
-		lookup of existing files is the uid (gid) of the person
+  uid		Set the default uid for inodes. For mounts to servers
+		which do support the CIFS Unix extensions, such as a
+		properly configured Samba server, the server provides
+		the uid, gid and mode so this parameter should  not be
+		specified unless the server and clients uid and gid
+		numbering differ.  If the server and client are in the
+		same domain (e.g. running winbind or nss_ldap) and
+		the server supports the Unix Extensions then the uid
+		and gid can be retrieved from the server (and uid
+		and gid would not have to be specifed on the mount. 
+		For servers which do not support the CIFS Unix
+		extensions, the default uid (and gid) returned on lookup
+		of existing files will be the uid (gid) of the person
 		who executed the mount (root, except when mount.cifs
 		is configured setuid for user mounts) unless the "uid=" 
 		(gid) mount option is specified.  For the uid (gid) of newly
@@ -281,8 +287,7 @@ A partial list of the supported mount options follows:
 		the client.  Note that the mount.cifs helper must be
 		at version 1.10 or higher to support specifying the uid
 		(or gid) in non-numberic form.
-  gid		If CIFS Unix extensions are not supported by the server
-		this overrides the default gid for inodes.
+  gid		Set the default gid for inodes (similar to above).
   file_mode     If CIFS Unix extensions are not supported by the server
 		this overrides the default mode for file inodes.
   dir_mode      If CIFS Unix extensions are not supported by the server 
@@ -467,7 +472,7 @@ including:
 	-V      print mount.cifs version
 	-?      display simple usage information
 
-With recent 2.6 kernel versions of modutils, the version of the cifs kernel
+With most 2.6 kernel versions of modutils, the version of the cifs kernel
 module can be displayed via modinfo.
 
 Misc /proc/fs/cifs Flags and Debug Info
@@ -516,8 +521,22 @@ SecurityFlags		Flags which control security negotiation and
 			must use plaintext passwords			0x20020
 			(reserved for future packet encryption)		0x00040
 
-cifsFYI			If set to one, additional debug information is
-			logged to the system error log. (default 0)
+cifsFYI			If set to non-zero value, additional debug information
+			will be logged to the system error log.  This field
+			contains three flags controlling different classes of
+			debugging entries.  The maximum value it can be set
+			to is 7 which enables all debugging points (default 0).
+			Some debugging statements are not compiled into the
+			cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
+			kernel configuration. cifsFYI may be set to one or
+			nore of the following flags (7 sets them all):
+
+			log cifs informational messages			0x01
+			log return codes from cifs entry points		0x02
+			log slow responses (ie which take longer than 1 second)
+			  CONFIG_CIFS_STATS2 must be enabled in .config	0x04
+				
+				
 traceSMB		If set to one, debug information is logged to the
 			system error log with the start of smb requests
 			and responses (default 0)
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index d7b9c27c942..78b620e332b 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
-Version 1.39 November 30, 2005
+Version 1.49 April 26, 2007
 
 A Partial List of Missing Features
 ==================================
@@ -18,7 +18,7 @@ better)
 
 d) Kerberos/SPNEGO session setup support - (started)
 
-e) NTLMv2 authentication (mostly implemented - double check
+e) More testing of NTLMv2 authentication (mostly implemented - double check
 that NTLMv2 signing works, also need to cleanup now unneeded SessSetup code in
 fs/cifs/connect.c)
 
@@ -27,55 +27,44 @@ used (Kerberos or NTLMSSP). Signing alreadyimplemented for NTLM
 and raw NTLMSSP already. This is important when enabling
 extended security and mounting to Windows 2003 Servers
 
-f) Directory entry caching relies on a 1 second timer, rather than 
+g) Directory entry caching relies on a 1 second timer, rather than 
 using FindNotify or equivalent.  - (started)
 
-g) A few byte range testcases fail due to POSIX vs. Windows/CIFS
-style byte range lock differences.  Save byte range locks so
-reconnect can replay them.  
-
-h) Support unlock all (unlock 0,MAX_OFFSET)
-by unlocking all known byte range locks that we locked on the file.
-
-i) quota support (needs minor kernel change since quota calls
+h) quota support (needs minor kernel change since quota calls
 to make it to network filesystems or deviceless filesystems)
 
-j) investigate sync behavior (including syncpage) and check  
+i) investigate sync behavior (including syncpage) and check  
 for proper behavior of intr/nointr
 
-k) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
+j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
 extra copy in/out of the socket buffers in some cases.
 
-l) finish support for IPv6.  This is mostly complete but
-needs a simple conversion of ipv6 to sin6_addr from the
-address in string representation.
-
-m) Better optimize open (and pathbased setfilesize) to reduce the
+k) Better optimize open (and pathbased setfilesize) to reduce the
 oplock breaks coming from windows srv.  Piggyback identical file
 opens on top of each other by incrementing reference count rather
 than resending (helps reduce server resource utilization and avoid
 spurious oplock breaks).
 
-o) Improve performance of readpages by sending more than one read
+l) Improve performance of readpages by sending more than one read
 at a time when 8 pages or more are requested. In conjuntion
 add support for async_cifs_readpages.
 
-p) Add support for storing symlink info to Windows servers 
+m) Add support for storing symlink info to Windows servers 
 in the Extended Attribute format their SFU clients would recognize.
 
-q) Finish fcntl D_NOTIFY support so kde and gnome file list windows
+n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
 will autorefresh (partially complete by Asser). Needs minor kernel
 vfs change to support removing D_NOTIFY on a file.   
 
-r) Add GUI tool to configure /proc/fs/cifs settings and for display of
+o) Add GUI tool to configure /proc/fs/cifs settings and for display of
 the CIFS statistics (started)
 
-s) implement support for security and trusted categories of xattrs
+p) implement support for security and trusted categories of xattrs
 (requires minor protocol extension) to enable better support for SELINUX
 
-t) Implement O_DIRECT flag on open (already supported on mount)
+q) Implement O_DIRECT flag on open (already supported on mount)
 
-u) Create UID mapping facility so server UIDs can be mapped on a per
+r) Create UID mapping facility so server UIDs can be mapped on a per
 mount or a per server basis to client UIDs or nobody if no mapping
 exists.  This is helpful when Unix extensions are negotiated to
 allow better permission checking when UIDs differ on the server
@@ -83,19 +72,26 @@ and client.  Add new protocol request to the CIFS protocol
 standard for asking the server for the corresponding name of a
 particular uid.
 
-v) Add support for CIFS Unix and also the newer POSIX extensions to the
+s) Add support for CIFS Unix and also the newer POSIX extensions to the
 server side for Samba 4.
 
-w) Finish up the dos time conversion routines needed to return old server
-time to the client (default time, of now or time 0 is used now for these 
-very old servers)
-
-x) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) 
+t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) 
 need to add ability to set time to server (utimes command)
 
-y) Finish testing of Windows 9x/Windows ME server support (started).
+u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+
+v) mount check for unmatched uids
+
+w) Add mount option for Linux extension disable per mount, and partial
+disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?)
 
-KNOWN BUGS (updated February 26, 2007)
+x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 
+processes can proceed better in parallel (on the server)
+
+y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
+restriction of wsize max being 127K) 
+
+KNOWN BUGS (updated April 24, 2007)
 ====================================
 See http://bugzilla.samba.org - search on product "CifsVFS" for
 current bug list.
@@ -127,10 +123,3 @@ negotiated size) and send larger write sizes to modern servers.
 4) More exhaustively test against less common servers.  More testing
 against Windows 9x, Windows ME servers.
 
-DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
-
-mount check for unmatched uids - and uid override
-
-Add mount option for Linux extension disable per mount, and partial disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?) 
-
-Free threads at umount --force that are stuck on the sesSem
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index fd1e52ebcee..4cc2012e932 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -22,12 +22,14 @@
 #define CIFS_MOUNT_SET_UID      2 /* set current->euid in create etc. */
 #define CIFS_MOUNT_SERVER_INUM  4 /* inode numbers from uniqueid from server */
 #define CIFS_MOUNT_DIRECT_IO    8 /* do not write nor read through page cache */
-#define CIFS_MOUNT_NO_XATTR  0x10 /* if set - disable xattr support */
-#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */
-#define CIFS_MOUNT_POSIX_PATHS	0x40 /* Negotiate posix pathnames if possible. */
-#define CIFS_MOUNT_UNX_EMUL	0x80 /* Network compat with SFUnix emulation */
-#define CIFS_MOUNT_NO_BRL	0x100 /* No sending byte range locks to srv */
-#define CIFS_MOUNT_CIFS_ACL	0x200 /* send ACL requests to non-POSIX srv */
+#define CIFS_MOUNT_NO_XATTR     0x10  /* if set - disable xattr support       */
+#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames   */
+#define CIFS_MOUNT_POSIX_PATHS  0x40  /* Negotiate posix pathnames if possible*/
+#define CIFS_MOUNT_UNX_EMUL     0x80  /* Network compat with SFUnix emulation */
+#define CIFS_MOUNT_NO_BRL       0x100 /* No sending byte range locks to srv   */
+#define CIFS_MOUNT_CIFS_ACL     0x200 /* send ACL requests to non-POSIX srv   */
+#define CIFS_MOUNT_OVERR_UID    0x400 /* override uid returned from server    */
+#define CIFS_MOUNT_OVERR_GID    0x800 /* override gid returned from server    */
 
 struct cifs_sb_info {
 	struct cifsTconInfo *tcon;	/* primary mount */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d2a8b2941fc..793c4b95c16 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -74,8 +74,8 @@ cifs_strtoUCS(__le16 * to, const char *from, int len,
 		charlen = codepage->char2uni(from, len, &wchar_to[i]);
 		if (charlen < 1) {
 			cERROR(1,
-			       ("cifs_strtoUCS: char2uni returned %d",
-				charlen));
+			       ("strtoUCS: char2uni of %d returned %d",
+				(int)*from, charlen));
 			/* A question mark */
 			to[i] = cpu_to_le16(0x003f);
 			charlen = 1;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index faba4d69fe9..8568e100953 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -100,7 +100,7 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
 	sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL);
 	cifs_sb = CIFS_SB(sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return -ENOMEM;
 
 	rc = cifs_mount(sb, cifs_sb, data, devname);
@@ -115,10 +115,10 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_magic = CIFS_MAGIC_NUMBER;
 	sb->s_op = &cifs_super_ops;
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	if(experimEnabled != 0)
+	if (experimEnabled != 0)
 		sb->s_export_op = &cifs_export_ops;
 #endif /* EXPERIMENTAL */	
-/*	if(cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
+/*	if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
 	    sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
 #ifdef CONFIG_CIFS_QUOTA
 	sb->s_qcop = &cifs_quotactl_ops;
@@ -147,8 +147,8 @@ out_no_root:
 		iput(inode);
 
 out_mount_failed:
-	if(cifs_sb) {
-		if(cifs_sb->local_nls)
+	if (cifs_sb) {
+		if (cifs_sb->local_nls)
 			unload_nls(cifs_sb->local_nls);	
 		kfree(cifs_sb);
 	}
@@ -163,7 +163,7 @@ cifs_put_super(struct super_block *sb)
 
 	cFYI(1, ("In cifs_put_super"));
 	cifs_sb = CIFS_SB(sb);
-	if(cifs_sb == NULL) {
+	if (cifs_sb == NULL) {
 		cFYI(1,("Empty cifs superblock info passed to unmount"));
 		return;
 	}
@@ -208,14 +208,14 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
     /* Only need to call the old QFSInfo if failed
     on newer one */
-    if(rc)
-	if(pTcon->ses->capabilities & CAP_NT_SMBS)
+    if (rc)
+	if (pTcon->ses->capabilities & CAP_NT_SMBS)
 		rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
 
 	/* Some old Windows servers also do not support level 103, retry with
 	   older level one if old server failed the previous call or we
 	   bypassed it because we detected that this was an older LANMAN sess */
-	if(rc)
+	if (rc)
 		rc = SMBOldQFSInfo(xid, pTcon, buf);
 	/*     
 	   int f_type;
@@ -301,11 +301,19 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 				if (cifs_sb->tcon->ses->userName)
 					seq_printf(s, ",username=%s",
 					   cifs_sb->tcon->ses->userName);
-				if(cifs_sb->tcon->ses->domainName)
+				if (cifs_sb->tcon->ses->domainName)
 					seq_printf(s, ",domain=%s",
 					   cifs_sb->tcon->ses->domainName);
 			}
 		}
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
+			seq_printf(s, ",posixpaths");
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
+		   !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
+			seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
+		   !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
+			seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
 		seq_printf(s, ",rsize=%d",cifs_sb->rsize);
 		seq_printf(s, ",wsize=%d",cifs_sb->wsize);
 	}
@@ -321,14 +329,14 @@ int cifs_xquota_set(struct super_block * sb, int quota_type, qid_t qid,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 	
-	if(cifs_sb)
+	if (cifs_sb)
 		pTcon = cifs_sb->tcon;
 	else
 		return -EIO;
 
 
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
 		cFYI(1,("set type: 0x%x id: %d",quota_type,qid));		
 	} else {
 		return -EIO;
@@ -346,13 +354,13 @@ int cifs_xquota_get(struct super_block * sb, int quota_type, qid_t qid,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 
-	if(cifs_sb)
+	if (cifs_sb)
 		pTcon = cifs_sb->tcon;
 	else
 		return -EIO;
 
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
                 cFYI(1,("set type: 0x%x id: %d",quota_type,qid));
 	} else {
 		rc = -EIO;
@@ -369,13 +377,13 @@ int cifs_xstate_set(struct super_block * sb, unsigned int flags, int operation)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 
-	if(cifs_sb)
+	if (cifs_sb)
 		pTcon = cifs_sb->tcon;
 	else
 		return -EIO;
 
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
                 cFYI(1,("flags: 0x%x operation: 0x%x",flags,operation));
 	} else {
 		rc = -EIO;
@@ -392,13 +400,13 @@ int cifs_xstate_get(struct super_block * sb, struct fs_quota_stat *qstats)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 
-	if(cifs_sb) {
+	if (cifs_sb) {
 		pTcon = cifs_sb->tcon;
 	} else {
 		return -EIO;
 	}
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
 		cFYI(1,("pqstats %p",qstats));		
 	} else {
 		rc = -EIO;
@@ -424,11 +432,11 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 	if (!(flags & MNT_FORCE))
 		return;
 	cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return;
 
 	tcon = cifs_sb->tcon;
-	if(tcon == NULL)
+	if (tcon == NULL)
 		return;
 	down(&tcon->tconSem);
 	if (atomic_read(&tcon->useCount) == 1)
@@ -437,7 +445,7 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
-	if(tcon->ses && tcon->ses->server)
+	if (tcon->ses && tcon->ses->server)
 	{
 		cFYI(1,("wake up tasks now - umount begin not complete"));
 		wake_up_all(&tcon->ses->server->request_q);
@@ -529,8 +537,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 		/* some applications poll for the file length in this strange
 		   way so we must seek to end on non-oplocked files by
 		   setting the revalidate time to zero */
-		if(file->f_path.dentry->d_inode)		
-			CIFS_I(file->f_path.dentry->d_inode)->time = 0;
+		CIFS_I(file->f_path.dentry->d_inode)->time = 0;
 
 		retval = cifs_revalidate(file->f_path.dentry);
 		if (retval < 0)
@@ -694,8 +701,7 @@ cifs_init_once(void *inode, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct cifsInodeInfo *cifsi = inode;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		inode_init_once(&cifsi->vfs_inode);
 		INIT_LIST_HEAD(&cifsi->lockList);
 	}
@@ -724,7 +730,7 @@ cifs_destroy_inodecache(void)
 static int
 cifs_init_request_bufs(void)
 {
-	if(CIFSMaxBufSize < 8192) {
+	if (CIFSMaxBufSize < 8192) {
 	/* Buffer size can not be smaller than 2 * PATH_MAX since maximum
 	Unicode path name has to fit in any SMB/CIFS path based frames */
 		CIFSMaxBufSize = 8192;
@@ -741,7 +747,7 @@ cifs_init_request_bufs(void)
 	if (cifs_req_cachep == NULL)
 		return -ENOMEM;
 
-	if(cifs_min_rcv < 1)
+	if (cifs_min_rcv < 1)
 		cifs_min_rcv = 1;
 	else if (cifs_min_rcv > 64) {
 		cifs_min_rcv = 64;
@@ -751,7 +757,7 @@ cifs_init_request_bufs(void)
 	cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
 						  cifs_req_cachep);
 
-	if(cifs_req_poolp == NULL) {
+	if (cifs_req_poolp == NULL) {
 		kmem_cache_destroy(cifs_req_cachep);
 		return -ENOMEM;
 	}
@@ -772,7 +778,7 @@ cifs_init_request_bufs(void)
 		return -ENOMEM;              
 	}
 
-	if(cifs_min_small < 2)
+	if (cifs_min_small < 2)
 		cifs_min_small = 2;
 	else if (cifs_min_small > 256) {
 		cifs_min_small = 256;
@@ -782,7 +788,7 @@ cifs_init_request_bufs(void)
 	cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
 						     cifs_sm_req_cachep);
 
-	if(cifs_sm_req_poolp == NULL) {
+	if (cifs_sm_req_poolp == NULL) {
 		mempool_destroy(cifs_req_poolp);
 		kmem_cache_destroy(cifs_req_cachep);
 		kmem_cache_destroy(cifs_sm_req_cachep);
@@ -812,7 +818,7 @@ cifs_init_mids(void)
 
 	/* 3 is a reasonable minimum number of simultaneous operations */
 	cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
-	if(cifs_mid_poolp == NULL) {
+	if (cifs_mid_poolp == NULL) {
 		kmem_cache_destroy(cifs_mid_cachep);
 		return -ENOMEM;
 	}
@@ -850,14 +856,14 @@ static int cifs_oplock_thread(void * dummyarg)
 			continue;
 		
 		spin_lock(&GlobalMid_Lock);
-		if(list_empty(&GlobalOplock_Q)) {
+		if (list_empty(&GlobalOplock_Q)) {
 			spin_unlock(&GlobalMid_Lock);
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(39*HZ);
 		} else {
 			oplock_item = list_entry(GlobalOplock_Q.next, 
 				struct oplock_q_entry, qhead);
-			if(oplock_item) {
+			if (oplock_item) {
 				cFYI(1,("found oplock item to write out")); 
 				pTcon = oplock_item->tcon;
 				inode = oplock_item->pinode;
@@ -871,7 +877,7 @@ static int cifs_oplock_thread(void * dummyarg)
 				/* mutex_lock(&inode->i_mutex);*/
 				if (S_ISREG(inode->i_mode)) {
 					rc = filemap_fdatawrite(inode->i_mapping);
-					if(CIFS_I(inode)->clientCanCacheRead == 0) {
+					if (CIFS_I(inode)->clientCanCacheRead == 0) {
 						filemap_fdatawait(inode->i_mapping);
 						invalidate_remote_inode(inode);
 					}
@@ -888,7 +894,7 @@ static int cifs_oplock_thread(void * dummyarg)
 				not bother sending an oplock release if session 
 				to server still is disconnected since oplock 
 				already released by the server in that case */
-				if(pTcon->tidStatus != CifsNeedReconnect) {
+				if (pTcon->tidStatus != CifsNeedReconnect) {
 				    rc = CIFSSMBLock(0, pTcon, netfid,
 					    0 /* len */ , 0 /* offset */, 0, 
 					    0, LOCKING_ANDX_OPLOCK_RELEASE,
@@ -922,7 +928,7 @@ static int cifs_dnotify_thread(void * dummyarg)
 		list_for_each(tmp, &GlobalSMBSessionList) {
 			ses = list_entry(tmp, struct cifsSesInfo, 
 				cifsSessionList);
-			if(ses && ses->server && 
+			if (ses && ses->server && 
 			     atomic_read(&ses->server->inFlight))
 				wake_up_all(&ses->server->response_q);
 		}
@@ -971,10 +977,10 @@ init_cifs(void)
 	rwlock_init(&GlobalSMBSeslock);
 	spin_lock_init(&GlobalMid_Lock);
 
-	if(cifs_max_pending < 2) {
+	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
 		cFYI(1,("cifs_max_pending set to min of 2"));
-	} else if(cifs_max_pending > 256) {
+	} else if (cifs_max_pending > 256) {
 		cifs_max_pending = 256;
 		cFYI(1,("cifs_max_pending set to max of 256"));
 	}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2c2c384894d..c235d32ad4a 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern int cifs_ioctl (struct inode * inode, struct file * filep,
 		       unsigned int command, unsigned long arg);
-#define CIFS_VERSION   "1.48"
+#define CIFS_VERSION   "1.49"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e4de8eba478..23655de2f4a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -311,7 +311,7 @@ struct cifsFileInfo {
 	/* lock scope id (0 if none) */
 	struct file * pfile; /* needed for writepage */
 	struct inode * pInode; /* needed for oplock break */
-	struct semaphore lock_sem;
+	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
 	unsigned closePend:1;	/* file is marked to close */
 	unsigned invalidHandle:1;  /* file closed via session abend */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 4d8948e8762..d619ca7d141 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1388,7 +1388,7 @@ struct smb_t2_rsp {
 #define SMB_SET_POSIX_LOCK              0x208
 #define SMB_POSIX_OPEN                  0x209
 #define SMB_POSIX_UNLINK                0x20a
-#define SMB_SET_FILE_UNIX_INFO2
+#define SMB_SET_FILE_UNIX_INFO2         0x20b
 #define SMB_SET_FILE_BASIC_INFO2        0x3ec
 #define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */
 #define SMB_FILE_ALL_INFO2              0x3fa
@@ -2109,22 +2109,40 @@ struct cifs_posix_acl { /* access conrol list  (ACL) */
 
 /* end of POSIX ACL definitions */
 
+/* POSIX Open Flags */
+#define SMB_O_RDONLY 	 0x1
+#define SMB_O_WRONLY 	0x2
+#define SMB_O_RDWR 	0x4
+#define SMB_O_CREAT 	0x10
+#define SMB_O_EXCL 	0x20
+#define SMB_O_TRUNC 	0x40
+#define SMB_O_APPEND 	0x80
+#define SMB_O_SYNC 	0x100
+#define SMB_O_DIRECTORY 0x200
+#define SMB_O_NOFOLLOW 	0x400
+#define SMB_O_DIRECT 	0x800
+
 typedef struct {
-	__u32 OpenFlags; /* same as NT CreateX */
-	__u32 PosixOpenFlags;
-	__u32 Mode;
-	__u16 Level; /* reply level requested (see QPathInfo levels) */
-	__u16 Pad;  /* reserved - MBZ */
+	__le32 OpenFlags; /* same as NT CreateX */
+	__le32 PosixOpenFlags;
+	__le64 Permissions;
+	__le16 Level; /* reply level requested (see QPathInfo levels) */
 } __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */
 
 typedef struct {
-	/* reply varies based on requested level */
+	__le16 OplockFlags;
+	__u16 Fid;
+	__le32 CreateAction;
+	__le16 ReturnedLevel;
+	__le16 Pad;
+	/* struct following varies based on requested level */
 } __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */
 
 
 struct file_internal_info {
 	__u64  UniqueId; /* inode number */
 } __attribute__((packed));      /* level 0x3ee */
+
 struct file_mode_info {
 	__le32	Mode;
 } __attribute__((packed));      /* level 0x3f8 */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 32eb1acab63..5d163e2b614 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifsproto.h
  *
- *   Copyright (c) International Business Machines  Corp., 2002,2006
+ *   Copyright (c) International Business Machines  Corp., 2002,2007
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -244,6 +244,11 @@ extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
 			const int access_flags, const int omode,
 			__u16 * netfid, int *pOplock, FILE_ALL_INFO *,
 			const struct nls_table *nls_codepage, int remap);
+extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, 
+			u32 posix_flags, __u64 mode, __u16 * netfid,
+			FILE_UNIX_BASIC_INFO *pRetData,
+			__u32 *pOplock, const char *name,
+			const struct nls_table *nls_codepage, int remap);			
 extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
 			const int smb_file_id);
 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 48fc0c2ab0e..14de58fa143 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifssmb.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2006
+ *   Copyright (C) International Business Machines  Corp., 2002,2007
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   Contains the routines for constructing the SMB PDUs themselves
@@ -24,8 +24,8 @@
  /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c   */
  /* These are mostly routines that operate on a pathname, or on a tree id     */
  /* (mounted volume), but there are eight handle based routines which must be */
- /* treated slightly different for reconnection purposes since we never want  */
- /* to reuse a stale file handle and the caller knows the file handle */
+ /* treated slightly differently for reconnection purposes since we never     */
+ /* want to reuse a stale file handle and only the caller knows the file info */
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -913,6 +913,130 @@ MkDirRetry:
 	return rc;
 }
 
+int
+CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
+		__u64 mode, __u16 * netfid, FILE_UNIX_BASIC_INFO *pRetData,
+		__u32 *pOplock, const char *name, 
+		const struct nls_table *nls_codepage, int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	char *data_offset;
+	__u16 params, param_offset, offset, byte_count, count;
+	OPEN_PSX_REQ * pdata;
+	OPEN_PSX_RSP * psx_rsp;
+
+	cFYI(1, ("In POSIX Create"));
+PsxCreat:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUCS((__le16 *) pSMB->FileName, name,
+				     PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, name, name_len);
+	}
+
+	params = 6 + name_len;
+	count = sizeof(OPEN_PSX_REQ);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(1000);	/* large enough */
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+                                     InformationLevel) - 4;
+	offset = param_offset + params;
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+	pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset);
+	pdata->Level = SMB_QUERY_FILE_UNIX_BASIC;
+	pdata->Permissions = cpu_to_le64(mode);
+	pdata->PosixOpenFlags = cpu_to_le32(posix_flags); 
+	pdata->OpenFlags =  cpu_to_le32(*pOplock);
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
+	pSMB->Reserved4 = 0;
+	pSMB->hdr.smb_buf_length += byte_count; 
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Posix create returned %d", rc));
+		goto psx_create_err;
+	}
+
+	cFYI(1,("copying inode info"));
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
+		rc = -EIO;	/* bad smb */
+		goto psx_create_err;
+	}
+
+	/* copy return information to pRetData */
+	psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol 
+			+ le16_to_cpu(pSMBr->t2.DataOffset));
+		
+	*pOplock = le16_to_cpu(psx_rsp->OplockFlags);
+	if(netfid)
+		*netfid = psx_rsp->Fid;   /* cifs fid stays in le */
+	/* Let caller know file was created so we can set the mode. */
+	/* Do we care about the CreateAction in any other cases? */
+	if(cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction)
+		*pOplock |= CIFS_CREATE_ACTION;
+	/* check to make sure response data is there */
+	if(psx_rsp->ReturnedLevel != SMB_QUERY_FILE_UNIX_BASIC) {
+		pRetData->Type = -1; /* unknown */
+#ifdef CONFIG_CIFS_DEBUG2
+		cFYI(1,("unknown type"));
+#endif
+	} else {
+		if(pSMBr->ByteCount < sizeof(OPEN_PSX_RSP) 
+					+ sizeof(FILE_UNIX_BASIC_INFO)) {
+			cERROR(1,("Open response data too small"));
+			pRetData->Type = -1;
+			goto psx_create_err;
+		}
+		memcpy((char *) pRetData, 
+			(char *)psx_rsp + sizeof(OPEN_PSX_RSP),
+			sizeof (FILE_UNIX_BASIC_INFO));
+	}
+			
+
+psx_create_err:
+	cifs_buf_release(pSMB);
+
+	cifs_stats_inc(&tcon->num_mkdirs);
+
+	if (rc == -EAGAIN)
+		goto PsxCreat;
+
+	return rc;	
+}
+
 static __u16 convert_disposition(int disposition)
 {
 	__u16 ofun = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 20ba7dcc995..216fb625843 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -30,6 +30,7 @@
 #include <linux/mempool.h>
 #include <linux/delay.h>
 #include <linux/completion.h>
+#include <linux/kthread.h>
 #include <linux/pagevec.h>
 #include <linux/freezer.h>
 #include <asm/uaccess.h>
@@ -74,6 +75,8 @@ struct smb_vol {
 	unsigned retry:1;
 	unsigned intr:1;
 	unsigned setuids:1;
+	unsigned override_uid:1;
+	unsigned override_gid:1;
 	unsigned noperm:1;
 	unsigned no_psx_acl:1; /* set if posix acl support should be disabled */
 	unsigned cifs_acl:1;
@@ -120,7 +123,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	struct mid_q_entry * mid_entry;
 	
 	spin_lock(&GlobalMid_Lock);
-	if(server->tcpStatus == CifsExiting) {
+	if( kthread_should_stop() ) {
 		/* the demux thread will exit normally 
 		next time through the loop */
 		spin_unlock(&GlobalMid_Lock);
@@ -182,7 +185,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	up(&server->tcpSem); 
 
-	while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood))
+	while ( (!kthread_should_stop()) && (server->tcpStatus != CifsGood))
 	{
 		try_to_freeze();
 		if(server->protocolType == IPV6) {
@@ -199,7 +202,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
 			spin_lock(&GlobalMid_Lock);
-			if(server->tcpStatus != CifsExiting)
+			if( !kthread_should_stop() )
 				server->tcpStatus = CifsGood;
 			server->sequence_number = 0;
 			spin_unlock(&GlobalMid_Lock);			
@@ -345,7 +348,6 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	int isMultiRsp;
 	int reconnect;
 
-	daemonize("cifsd");
 	allow_signal(SIGKILL);
 	current->flags |= PF_MEMALLOC;
 	server->tsk = current;	/* save process info to wake at shutdown */
@@ -361,7 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 			GFP_KERNEL);
 	}
 
-	while (server->tcpStatus != CifsExiting) {
+	while (!kthread_should_stop()) {
 		if (try_to_freeze())
 			continue;
 		if (bigbuf == NULL) {
@@ -400,7 +402,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		    kernel_recvmsg(csocket, &smb_msg,
 				 &iov, 1, 4, 0 /* BB see socket.h flags */);
 
-		if (server->tcpStatus == CifsExiting) {
+		if ( kthread_should_stop() ) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
 			cFYI(1, ("Reconnect after server stopped responding"));
@@ -524,7 +526,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		     total_read += length) {
 			length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
 						pdu_length - total_read, 0);
-			if((server->tcpStatus == CifsExiting) ||
+			if( kthread_should_stop() ||
 			    (length == -EINTR)) {
 				/* then will exit */
 				reconnect = 2;
@@ -757,7 +759,6 @@ multi_t2_fnd:
 			GFP_KERNEL);
 	}
 	
-	complete_and_exit(&cifsd_complete, 0);
 	return 0;
 }
 
@@ -973,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			}
 			if ((temp_len = strnlen(value, 300)) < 300) {
 				vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
-				if(vol->UNC == NULL)
+				if (vol->UNC == NULL)
 					return 1;
 				strcpy(vol->UNC,value);
 				if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1010,12 +1011,12 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
                                 return 1;       /* needs_arg; */
                         }
                         if ((temp_len = strnlen(value, 1024)) < 1024) {
-				if(value[0] != '/')
+				if (value[0] != '/')
 					temp_len++;  /* missing leading slash */
                                 vol->prepath = kmalloc(temp_len+1,GFP_KERNEL);
-                                if(vol->prepath == NULL)
+                                if (vol->prepath == NULL)
                                         return 1;
-				if(value[0] != '/') {
+				if (value[0] != '/') {
 					vol->prepath[0] = '/';
 	                                strcpy(vol->prepath+1,value);
 				} else
@@ -1031,7 +1032,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				return 1;	/* needs_arg; */
 			}
 			if (strnlen(value, 65) < 65) {
-				if(strnicmp(value,"default",7))
+				if (strnicmp(value,"default",7))
 					vol->iocharset = value;
 				/* if iocharset not set load_nls_default used by caller */
 				cFYI(1, ("iocharset set to %s",value));
@@ -1043,11 +1044,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			if (value && *value) {
 				vol->linux_uid =
 					simple_strtoul(value, &value, 0);
+				vol->override_uid = 1;
 			}
 		} else if (strnicmp(data, "gid", 3) == 0) {
 			if (value && *value) {
 				vol->linux_gid =
 					simple_strtoul(value, &value, 0);
+				vol->override_gid = 1;
 			}
 		} else if (strnicmp(data, "file_mode", 4) == 0) {
 			if (value && *value) {
@@ -1102,7 +1105,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				}
 				/* The string has 16th byte zero still from
 				set at top of the function  */
-				if((i==15) && (value[i] != 0))
+				if ((i==15) && (value[i] != 0))
 					printk(KERN_WARNING "CIFS: netbiosname longer than 15 truncated.\n");
 			}
 		} else if (strnicmp(data, "servern", 7) == 0) {
@@ -1126,7 +1129,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				}
 				/* The string has 16th byte zero still from
 				   set at top of the function  */
-				if((i==15) && (value[i] != 0))
+				if ((i==15) && (value[i] != 0))
 					printk(KERN_WARNING "CIFS: server netbiosname longer than 15 truncated.\n");
 			}
 		} else if (strnicmp(data, "credentials", 4) == 0) {
@@ -1233,13 +1236,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			printk(KERN_WARNING "CIFS: Unknown mount option %s\n",data);
 	}
 	if (vol->UNC == NULL) {
-		if(devname == NULL) {
+		if (devname == NULL) {
 			printk(KERN_WARNING "CIFS: Missing UNC name for mount target\n");
 			return 1;
 		}
 		if ((temp_len = strnlen(devname, 300)) < 300) {
 			vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
-			if(vol->UNC == NULL)
+			if (vol->UNC == NULL)
 				return 1;
 			strcpy(vol->UNC,devname);
 			if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1663,7 +1666,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo * tcon,
 				CIFS_SB(sb)->mnt_cifs_flags |= 
 					CIFS_MOUNT_POSIX_PATHS;
 		}
-			
+	
+		/* We might be setting the path sep back to a different
+		form if we are reconnecting and the server switched its
+		posix path capability for this share */	
+		if(sb && (CIFS_SB(sb)->prepathlen > 0))
+			CIFS_SB(sb)->prepath[0] = CIFS_DIR_SEP(CIFS_SB(sb));
+	
 		cFYI(1,("Negotiate caps 0x%x",(int)cap));
 #ifdef CONFIG_CIFS_DEBUG2
 		if(cap & CIFS_UNIX_FCNTL_CAP)
@@ -1712,12 +1721,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		return -EINVAL;
 	}
 
-	if (volume_info.username) {
+	if (volume_info.nullauth) {
+		cFYI(1,("null user"));
+		volume_info.username = NULL;
+	} else if (volume_info.username) {
 		/* BB fixme parse for domain name here */
 		cFYI(1, ("Username: %s ", volume_info.username));
-
-	} else if (volume_info.nullauth) {
-		cFYI(1,("null user"));
 	} else {
 		cifserror("No username specified");
         /* In userspace mount helper we can get user name from alternate
@@ -1791,11 +1800,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
 			NULL /* no ipv6 addr */,
 			volume_info.username, &srvTcp);
-	else if(address_type == AF_INET6)
+	else if(address_type == AF_INET6) {
+		cFYI(1,("looking for ipv6 address"));
 		existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
 			&sin_server6.sin6_addr,
 			volume_info.username, &srvTcp);
-	else {
+	} else {
 		kfree(volume_info.UNC);
 		kfree(volume_info.password);
 		kfree(volume_info.prepath);
@@ -1807,17 +1817,23 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (srvTcp) {
 		cFYI(1, ("Existing tcp session with server found"));                
 	} else {	/* create socket */
-		if(volume_info.port)
+		if (volume_info.port)
 			sin_server.sin_port = htons(volume_info.port);
 		else
 			sin_server.sin_port = 0;
-		rc = ipv4_connect(&sin_server,&csocket,
+		if (address_type == AF_INET6) {
+			cFYI(1,("attempting ipv6 connect"));
+			/* BB should we allow ipv6 on port 139? */
+			/* other OS never observed in Wild doing 139 with v6 */
+			rc = ipv6_connect(&sin_server6,&csocket);
+		} else 
+			rc = ipv4_connect(&sin_server,&csocket,
 				  volume_info.source_rfc1001_name,
 				  volume_info.target_rfc1001_name);
 		if (rc < 0) {
 			cERROR(1,
-			       ("Error connecting to IPv4 socket. Aborting operation"));
-			if(csocket != NULL)
+			       ("Error connecting to IPv4 socket. Aborting operation"));			       
+			if (csocket != NULL)
 				sock_release(csocket);
 			kfree(volume_info.UNC);
 			kfree(volume_info.password);
@@ -1850,10 +1866,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			so no need to spinlock this init of tcpStatus */
 			srvTcp->tcpStatus = CifsNew;
 			init_MUTEX(&srvTcp->tcpSem);
-			rc = (int)kernel_thread((void *)(void *)cifs_demultiplex_thread, srvTcp,
-				      CLONE_FS | CLONE_FILES | CLONE_VM);
-			if(rc < 0) {
-				rc = -ENOMEM;
+			srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
+			if ( IS_ERR(srvTcp->tsk) ) {
+				rc = PTR_ERR(srvTcp->tsk);
+				cERROR(1,("error %d create cifsd thread", rc));
+				srvTcp->tsk = NULL;
 				sock_release(csocket);
 				kfree(volume_info.UNC);
 				kfree(volume_info.password);
@@ -1896,7 +1913,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 				int len = strlen(volume_info.domainname);
 				pSesInfo->domainName = 
 					kmalloc(len + 1, GFP_KERNEL);
-				if(pSesInfo->domainName)
+				if (pSesInfo->domainName)
 					strcpy(pSesInfo->domainName,
 						volume_info.domainname);
 			}
@@ -1906,7 +1923,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			/* BB FIXME need to pass vol->secFlgs BB */
 			rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
 			up(&pSesInfo->sesSem);
-			if(!rc)
+			if (!rc)
 				atomic_inc(&srvTcp->socketUseCount);
 		} else
 			kfree(volume_info.password);
@@ -1914,7 +1931,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
     
 	/* search for existing tcon to this server share */
 	if (!rc) {
-		if(volume_info.rsize > CIFSMaxBufSize) {
+		if (volume_info.rsize > CIFSMaxBufSize) {
 			cERROR(1,("rsize %d too large, using MaxBufSize",
 				volume_info.rsize));
 			cifs_sb->rsize = CIFSMaxBufSize;
@@ -1923,11 +1940,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		else /* default */
 			cifs_sb->rsize = CIFSMaxBufSize;
 
-		if(volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
+		if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
 			cERROR(1,("wsize %d too large using 4096 instead",
 				  volume_info.wsize));
 			cifs_sb->wsize = 4096;
-		} else if(volume_info.wsize)
+		} else if (volume_info.wsize)
 			cifs_sb->wsize = volume_info.wsize;
 		else
 			cifs_sb->wsize = 
@@ -1940,14 +1957,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			   conjunction with 52K kvec constraint on arch with 4K
 			   page size  */
 
-		if(cifs_sb->rsize < 2048) {
+		if (cifs_sb->rsize < 2048) {
 			cifs_sb->rsize = 2048; 
 			/* Windows ME may prefer this */
 			cFYI(1,("readsize set to minimum 2048"));
 		}
 		/* calculate prepath */
 		cifs_sb->prepath = volume_info.prepath;
-		if(cifs_sb->prepath) {
+		if (cifs_sb->prepath) {
 			cifs_sb->prepathlen = strlen(cifs_sb->prepath);
 			cifs_sb->prepath[0] = CIFS_DIR_SEP(cifs_sb);
 			volume_info.prepath = NULL;
@@ -1960,24 +1977,27 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		cFYI(1,("file mode: 0x%x  dir mode: 0x%x",
 			cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode));
 
-		if(volume_info.noperm)
+		if (volume_info.noperm)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
-		if(volume_info.setuids)
+		if (volume_info.setuids)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
-		if(volume_info.server_ino)
+		if (volume_info.server_ino)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
-		if(volume_info.remap)
+		if (volume_info.remap)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
-		if(volume_info.no_xattr)
+		if (volume_info.no_xattr)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
-		if(volume_info.sfu_emul)
+		if (volume_info.sfu_emul)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
-		if(volume_info.nobrl)
+		if (volume_info.nobrl)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
-		if(volume_info.cifs_acl)
+		if (volume_info.cifs_acl)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
-
-		if(volume_info.direct_io) {
+		if (volume_info.override_uid)
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
+		if (volume_info.override_gid)
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+		if (volume_info.direct_io) {
 			cFYI(1,("mounting share using direct i/o"));
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 		}
@@ -2030,7 +2050,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			}
 		}
 	}
-	if(pSesInfo) {
+	if (pSesInfo) {
 		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
 			sb->s_maxbytes = (u64) 1 << 63;
 		} else
@@ -2044,13 +2064,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (rc) {
 		/* if session setup failed, use count is zero but
 		we still need to free cifsd thread */
-		if(atomic_read(&srvTcp->socketUseCount) == 0) {
+		if (atomic_read(&srvTcp->socketUseCount) == 0) {
 			spin_lock(&GlobalMid_Lock);
 			srvTcp->tcpStatus = CifsExiting;
 			spin_unlock(&GlobalMid_Lock);
-			if(srvTcp->tsk) {
+			if (srvTcp->tsk) {
 				send_sig(SIGKILL,srvTcp->tsk,1);
-				wait_for_completion(&cifsd_complete);
+				kthread_stop(srvTcp->tsk);
 			}
 		}
 		 /* If find_unc succeeded then rc == 0 so we can not end */
@@ -2063,10 +2083,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 					int temp_rc;
 					temp_rc = CIFSSMBLogoff(xid, pSesInfo);
 					/* if the socketUseCount is now zero */
-					if((temp_rc == -ESHUTDOWN) &&
-					   (pSesInfo->server->tsk)) {
+					if ((temp_rc == -ESHUTDOWN) &&
+					   (pSesInfo->server) && (pSesInfo->server->tsk)) {
 						send_sig(SIGKILL,pSesInfo->server->tsk,1);
-						wait_for_completion(&cifsd_complete);
+						kthread_stop(pSesInfo->server->tsk);
 					}
 				} else
 					cFYI(1, ("No session or bad tcon"));
@@ -2127,7 +2147,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	__u16 count;
 
 	cFYI(1, ("In sesssetup"));
-	if(ses == NULL)
+	if (ses == NULL)
 		return -EINVAL;
 	user = ses->userName;
 	domain = ses->domainName;
@@ -2182,7 +2202,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			*bcc_ptr = 0;
 			bcc_ptr++;
 		}
-		if(user == NULL)
+		if (user == NULL)
 			bytes_returned = 0; /* skip null user */
 	        else
 			bytes_returned =
@@ -2216,7 +2236,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 		bcc_ptr += 2 * bytes_returned;
 		bcc_ptr += 2;
 	} else {
-		if(user != NULL) {                
+		if (user != NULL) {                
 		    strncpy(bcc_ptr, user, 200);
 		    bcc_ptr += strnlen(user, 200);
 		}
@@ -3316,7 +3336,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 				cFYI(1,("Waking up socket by sending it signal"));
 				if(cifsd_task) {
 					send_sig(SIGKILL,cifsd_task,1);
-					wait_for_completion(&cifsd_complete);
+					kthread_stop(cifsd_task);
 				}
 				rc = 0;
 			} /* else - we have an smb session
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3fad638d26d..e5210519ac4 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -274,7 +274,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			pCifsFile->invalidHandle = FALSE;
 			pCifsFile->closePend     = FALSE;
 			init_MUTEX(&pCifsFile->fh_sem);
-			init_MUTEX(&pCifsFile->lock_sem);
+			mutex_init(&pCifsFile->lock_mutex);
 			INIT_LIST_HEAD(&pCifsFile->llist);
 			atomic_set(&pCifsFile->wrtPending,0);
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 2d3275bedb5..b570530f97b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -48,7 +48,7 @@ static inline struct cifsFileInfo *cifs_init_private(
 	private_data->netfid = netfid;
 	private_data->pid = current->tgid;	
 	init_MUTEX(&private_data->fh_sem);
-	init_MUTEX(&private_data->lock_sem);
+	mutex_init(&private_data->lock_mutex);
 	INIT_LIST_HEAD(&private_data->llist);
 	private_data->pfile = file; /* needed for writepage */
 	private_data->pInode = inode;
@@ -338,8 +338,7 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
 	return rc;
 }
 
-static int cifs_reopen_file(struct inode *inode, struct file *file, 
-	int can_flush)
+static int cifs_reopen_file(struct file *file, int can_flush)
 {
 	int rc = -EACCES;
 	int xid, oplock;
@@ -347,13 +346,12 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
 	struct cifsTconInfo *pTcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
+	struct inode * inode;
 	char *full_path = NULL;
 	int desiredAccess;
 	int disposition = FILE_OPEN;
 	__u16 netfid;
 
-	if (inode == NULL)
-		return -EBADF;
 	if (file->private_data) {
 		pCifsFile = (struct cifsFileInfo *)file->private_data;
 	} else
@@ -368,25 +366,37 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
 	}
 
 	if (file->f_path.dentry == NULL) {
-		up(&pCifsFile->fh_sem);
-		cFYI(1, ("failed file reopen, no valid name if dentry freed"));
-		FreeXid(xid);
-		return -EBADF;
+		cERROR(1, ("no valid name if dentry freed"));
+		dump_stack();
+		rc = -EBADF;
+		goto reopen_error_exit;
 	}
+
+	inode = file->f_path.dentry->d_inode;
+	if(inode == NULL) {
+		cERROR(1, ("inode not valid"));
+		dump_stack();
+		rc = -EBADF;
+		goto reopen_error_exit;
+	}
+		
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
+
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
    to get called and if the server was down that means we end up here,
    and we can never tell if the caller already has the rename_sem */
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
+		rc = -ENOMEM;
+reopen_error_exit:
 		up(&pCifsFile->fh_sem);
 		FreeXid(xid);
-		return -ENOMEM;
+		return rc;
 	}
 
-	cFYI(1, (" inode = 0x%p file flags are 0x%x for %s",
+	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
 		 inode, file->f_flags,full_path));
 	desiredAccess = cifs_convert_flags(file->f_flags);
 
@@ -401,13 +411,6 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
 	   and server version of file size can be stale. If we knew for sure
 	   that inode was not dirty locally we could do this */
 
-/*	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-	if (buf == 0) {
-		up(&pCifsFile->fh_sem);
-		kfree(full_path);
-		FreeXid(xid);
-		return -ENOMEM;
-	} */
 	rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 
@@ -508,12 +511,12 @@ int cifs_close(struct inode *inode, struct file *file)
 
 		/* Delete any outstanding lock records.
 		   We'll lose them when the file is closed anyway. */
-		down(&pSMBFile->lock_sem);
+		mutex_lock(&pSMBFile->lock_mutex);
 		list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
 			list_del(&li->llist);
 			kfree(li);
 		}
-		up(&pSMBFile->lock_sem);
+		mutex_unlock(&pSMBFile->lock_mutex);
 
 		write_lock(&GlobalSMBSeslock);
 		list_del(&pSMBFile->flist);
@@ -598,9 +601,9 @@ static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
 	li->offset = offset;
 	li->length = len;
 	li->type = lockType;
-	down(&fid->lock_sem);
+	mutex_lock(&fid->lock_mutex);
 	list_add(&li->llist, &fid->llist);
-	up(&fid->lock_sem);
+	mutex_unlock(&fid->lock_mutex);
 	return 0;
 }
 
@@ -757,7 +760,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			struct cifsLockInfo *li, *tmp;
 
 			rc = 0;
-			down(&fid->lock_sem);
+			mutex_lock(&fid->lock_mutex);
 			list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
 				if (pfLock->fl_start <= li->offset &&
 						length >= li->length) {
@@ -771,7 +774,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 					kfree(li);
 				}
 			}
-			up(&fid->lock_sem);
+			mutex_unlock(&fid->lock_mutex);
 		}
 	}
 
@@ -792,12 +795,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	int xid, long_op;
 	struct cifsFileInfo *open_file;
 
-	if (file->f_path.dentry == NULL)
-		return -EBADF;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if (cifs_sb == NULL)
-		return -EBADF;
 
 	pTcon = cifs_sb->tcon;
 
@@ -807,14 +805,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	if (file->private_data == NULL)
 		return -EBADF;
-	else
-		open_file = (struct cifsFileInfo *) file->private_data;
+	open_file = (struct cifsFileInfo *) file->private_data;
 	
 	xid = GetXid();
-	if (file->f_path.dentry->d_inode == NULL) {
-		FreeXid(xid);
-		return -EBADF;
-	}
 
 	if (*poffset > file->f_path.dentry->d_inode->i_size)
 		long_op = 2; /* writes past end of file can take a long time */
@@ -841,17 +834,11 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 					return -EBADF;
 			}
 			if (open_file->invalidHandle) {
-				if ((file->f_path.dentry == NULL) ||
-				    (file->f_path.dentry->d_inode == NULL)) {
-					FreeXid(xid);
-					return total_written;
-				}
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to server
 				   now */
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, FALSE);
+				rc = cifs_reopen_file(file, FALSE);
 				if (rc != 0)
 					break;
 			}
@@ -908,12 +895,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	int xid, long_op;
 	struct cifsFileInfo *open_file;
 
-	if (file->f_path.dentry == NULL)
-		return -EBADF;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if (cifs_sb == NULL)
-		return -EBADF;
 
 	pTcon = cifs_sb->tcon;
 
@@ -922,14 +904,9 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	if (file->private_data == NULL)
 		return -EBADF;
-	else
-		open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = (struct cifsFileInfo *)file->private_data;
 	
 	xid = GetXid();
-	if (file->f_path.dentry->d_inode == NULL) {
-		FreeXid(xid);
-		return -EBADF;
-	}
 
 	if (*poffset > file->f_path.dentry->d_inode->i_size)
 		long_op = 2; /* writes past end of file can take a long time */
@@ -957,17 +934,11 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 					return -EBADF;
 			}
 			if (open_file->invalidHandle) {
-				if ((file->f_path.dentry == NULL) ||
-				   (file->f_path.dentry->d_inode == NULL)) {
-					FreeXid(xid);
-					return total_written;
-				}
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to 
 				   server now */
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, FALSE);
+				rc = cifs_reopen_file(file, FALSE);
 				if (rc != 0)
 					break;
 			}
@@ -1056,8 +1027,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 			read_unlock(&GlobalSMBSeslock);
 			if((open_file->invalidHandle) && 
 			   (!open_file->closePend) /* BB fixme -since the second clause can not be true remove it BB */) {
-				rc = cifs_reopen_file(&cifs_inode->vfs_inode, 
-						      open_file->pfile, FALSE);
+				rc = cifs_reopen_file(open_file->pfile, FALSE);
 				/* if it fails, try another handle - might be */
 				/* dangerous to hold up writepages with retry */
 				if(rc) {
@@ -1404,32 +1374,6 @@ static int cifs_commit_write(struct file *file, struct page *page,
 	spin_lock(&inode->i_lock);
 	if (position > inode->i_size) {
 		i_size_write(inode, position);
-		/* if (file->private_data == NULL) {
-			rc = -EBADF;
-		} else {
-			open_file = (struct cifsFileInfo *)file->private_data;
-			cifs_sb = CIFS_SB(inode->i_sb);
-			rc = -EAGAIN;
-			while (rc == -EAGAIN) {
-				if ((open_file->invalidHandle) && 
-				    (!open_file->closePend)) {
-					rc = cifs_reopen_file(
-						file->f_path.dentry->d_inode, file);
-					if (rc != 0)
-						break;
-				}
-				if (!open_file->closePend) {
-					rc = CIFSSMBSetFileSize(xid,
-						cifs_sb->tcon, position,
-						open_file->netfid,
-						open_file->pid, FALSE);
-				} else {
-					rc = -EBADF;
-					break;
-				}
-			}
-			cFYI(1, (" SetEOF (commit write) rc = %d", rc));
-		} */
 	}
 	spin_unlock(&inode->i_lock);
 	if (!PageUptodate(page)) {
@@ -1573,8 +1517,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			int buf_type = CIFS_NO_BUFFER;
 			if ((open_file->invalidHandle) && 
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, TRUE);
+				rc = cifs_reopen_file(file, TRUE);
 				if (rc != 0)
 					break;
 			}
@@ -1660,8 +1603,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) && 
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, TRUE);
+				rc = cifs_reopen_file(file, TRUE);
 				if (rc != 0)
 					break;
 			}
@@ -1817,8 +1759,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) && 
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, TRUE);
+				rc = cifs_reopen_file(file, TRUE);
 				if (rc != 0)
 					break;
 			}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f414526e476..3e87dad3367 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/inode.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2005
+ *   Copyright (C) International Business Machines  Corp., 2002,2007
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -90,7 +90,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 				(*pinode)->i_ino =
 					(unsigned long)findData.UniqueId;
 			} /* note ino incremented to unique num in new_inode */
-			if(sb->s_flags & MS_NOATIME)
+			if (sb->s_flags & MS_NOATIME)
 				(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
 				
 			insert_inode_hash(*pinode);
@@ -139,8 +139,17 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 			inode->i_mode |= S_IFREG;
 			cFYI(1,("unknown type %d",type));
 		}
-		inode->i_uid = le64_to_cpu(findData.Uid);
-		inode->i_gid = le64_to_cpu(findData.Gid);
+		
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+			inode->i_uid = cifs_sb->mnt_uid;
+		else
+			inode->i_uid = le64_to_cpu(findData.Uid);
+
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+			inode->i_gid = cifs_sb->mnt_gid;
+		else
+			inode->i_gid = le64_to_cpu(findData.Gid);
+			
 		inode->i_nlink = le64_to_cpu(findData.Nlinks);
 
 		spin_lock(&inode->i_lock);
@@ -178,13 +187,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 				else
 					inode->i_fop = &cifs_file_direct_ops;
-			} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				inode->i_fop = &cifs_file_nobrl_ops;
 			else /* not direct, send byte range locks */ 
 				inode->i_fop = &cifs_file_ops;
 
 			/* check if server can support readpages */
-			if(pTcon->ses->server->maxBuf < 
+			if (pTcon->ses->server->maxBuf < 
 			    PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 				inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 			else
@@ -220,7 +229,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 
 	pbuf = buf;
 
-	if(size == 0) {
+	if (size == 0) {
 		inode->i_mode |= S_IFIFO;
 		return 0;
 	} else if (size < 8) {
@@ -239,11 +248,11 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 			         netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
-		if((rc == 0) && (bytes_read >= 8)) {
-			if(memcmp("IntxBLK", pbuf, 8) == 0) {
+		if ((rc == 0) && (bytes_read >= 8)) {
+			if (memcmp("IntxBLK", pbuf, 8) == 0) {
 				cFYI(1,("Block device"));
 				inode->i_mode |= S_IFBLK;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -251,10 +260,10 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
 				}
-			} else if(memcmp("IntxCHR", pbuf, 8) == 0) {
+			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
 				cFYI(1,("Char device"));
 				inode->i_mode |= S_IFCHR;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -262,7 +271,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
                                 }
-			} else if(memcmp("IntxLNK", pbuf, 7) == 0) {
+			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
 				cFYI(1,("Symlink"));
 				inode->i_mode |= S_IFLNK;
 			} else {
@@ -293,7 +302,7 @@ static int get_sfu_uid_mode(struct inode * inode,
 	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			ea_value, 4 /* size of buf */, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if(rc < 0)
+	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
@@ -348,7 +357,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* BB optimize code so we do not make the above call
 		when server claims no NT SMB support and the above call
 		failed at least once - set flag in tcon or mount */
-		if((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
+		if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
 			rc = SMBQueryInformation(xid, pTcon, search_path,
 					pfindData, cifs_sb->local_nls, 
 					cifs_sb->mnt_cifs_flags &
@@ -425,7 +434,7 @@ int cifs_get_inode_info(struct inode **pinode,
 				} else /* do we need cast or hash to ino? */
 					(*pinode)->i_ino = inode_num;
 			} /* else ino incremented to unique num in new_inode*/
-			if(sb->s_flags & MS_NOATIME)
+			if (sb->s_flags & MS_NOATIME)
 				(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
 			insert_inode_hash(*pinode);
 		}
@@ -442,7 +451,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		(pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
 		/* Linux can not store file creation time so ignore it */
-		if(pfindData->LastAccessTime)
+		if (pfindData->LastAccessTime)
 			inode->i_atime = cifs_NTtimeToUnix
 				(le64_to_cpu(pfindData->LastAccessTime));
 		else /* do not need to use current_fs_time - time not stored */
@@ -452,7 +461,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		inode->i_ctime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
 		cFYI(0, ("Attributes came in as 0x%x", attr));
-		if(adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
+		if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
 			inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
 	                inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
 		}
@@ -521,8 +530,10 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		/* BB fill in uid and gid here? with help from winbind? 
 		   or retrieve from NTFS stream extended attribute */
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 			/* fill in uid, gid, mode from server ACL */
+			/* BB FIXME this should also take into account the
+			 * default uid specified on mount if present */
 			get_sfu_uid_mode(inode, search_path, cifs_sb, xid);
 		} else if (atomic_read(&cifsInfo->inUse) == 0) {
 			inode->i_uid = cifs_sb->mnt_uid;
@@ -541,12 +552,12 @@ int cifs_get_inode_info(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 				else
 					inode->i_fop = &cifs_file_direct_ops;
-			} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				inode->i_fop = &cifs_file_nobrl_ops;
 			else /* not direct, send byte range locks */
 				inode->i_fop = &cifs_file_ops;
 
-			if(pTcon->ses->server->maxBuf < 
+			if (pTcon->ses->server->maxBuf < 
 			     PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 				inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 			else
@@ -597,7 +608,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	if(inode)
+	if (inode)
 		cifs_sb = CIFS_SB(inode->i_sb);
 	else
 		cifs_sb = CIFS_SB(direntry->d_sb);
@@ -723,7 +734,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 					   when needed */
 		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
 	}
-	if(inode) {
+	if (inode) {
 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
 		cifsInode->time = 0;	/* force revalidate of dir as well */
@@ -734,6 +745,136 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
+static void posix_fill_in_inode(struct inode *tmp_inode,
+	FILE_UNIX_BASIC_INFO *pData, int *pobject_type, int isNewInode)
+{
+	loff_t local_size;
+	struct timespec local_mtime;
+
+	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
+
+	__u32 type = le32_to_cpu(pData->Type);
+	__u64 num_of_bytes = le64_to_cpu(pData->NumOfBytes);
+	__u64 end_of_file = le64_to_cpu(pData->EndOfFile);
+	cifsInfo->time = jiffies;
+	atomic_inc(&cifsInfo->inUse);
+
+	/* save mtime and size */
+	local_mtime = tmp_inode->i_mtime;
+	local_size  = tmp_inode->i_size;
+
+	tmp_inode->i_atime =
+	    cifs_NTtimeToUnix(le64_to_cpu(pData->LastAccessTime));
+	tmp_inode->i_mtime =
+	    cifs_NTtimeToUnix(le64_to_cpu(pData->LastModificationTime));
+	tmp_inode->i_ctime =
+	    cifs_NTtimeToUnix(le64_to_cpu(pData->LastStatusChange));
+
+	tmp_inode->i_mode = le64_to_cpu(pData->Permissions);
+	/* since we set the inode type below we need to mask off type
+           to avoid strange results if bits above were corrupt */
+        tmp_inode->i_mode &= ~S_IFMT;
+	if (type == UNIX_FILE) {
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+	} else if (type == UNIX_SYMLINK) {
+		*pobject_type = DT_LNK;
+		tmp_inode->i_mode |= S_IFLNK;
+	} else if (type == UNIX_DIR) {
+		*pobject_type = DT_DIR;
+		tmp_inode->i_mode |= S_IFDIR;
+	} else if (type == UNIX_CHARDEV) {
+		*pobject_type = DT_CHR;
+		tmp_inode->i_mode |= S_IFCHR;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_BLOCKDEV) {
+		*pobject_type = DT_BLK;
+		tmp_inode->i_mode |= S_IFBLK;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_FIFO) {
+		*pobject_type = DT_FIFO;
+		tmp_inode->i_mode |= S_IFIFO;
+	} else if (type == UNIX_SOCKET) {
+		*pobject_type = DT_SOCK;
+		tmp_inode->i_mode |= S_IFSOCK;
+	} else {
+		/* safest to just call it a file */
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+		cFYI(1,("unknown inode type %d",type)); 
+	}
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("object type: %d", type));
+#endif
+	tmp_inode->i_uid = le64_to_cpu(pData->Uid);
+	tmp_inode->i_gid = le64_to_cpu(pData->Gid);
+	tmp_inode->i_nlink = le64_to_cpu(pData->Nlinks);
+
+	spin_lock(&tmp_inode->i_lock);
+	if (is_size_safe_to_change(cifsInfo, end_of_file)) {
+		/* can not safely change the file size here if the 
+		client is writing to it due to potential races */
+		i_size_write(tmp_inode, end_of_file);
+
+	/* 512 bytes (2**9) is the fake blocksize that must be used */
+	/* for this calculation, not the real blocksize */
+		tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
+	}
+	spin_unlock(&tmp_inode->i_lock);
+
+	if (S_ISREG(tmp_inode->i_mode)) {
+		cFYI(1, ("File inode"));
+		tmp_inode->i_op = &cifs_file_inode_ops;
+
+		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
+			else
+				tmp_inode->i_fop = &cifs_file_direct_ops;
+		
+		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			tmp_inode->i_fop = &cifs_file_nobrl_ops;
+		else
+			tmp_inode->i_fop = &cifs_file_ops;
+
+		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		   (cifs_sb->tcon->ses->server->maxBuf < 
+			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
+			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+		else
+			tmp_inode->i_data.a_ops = &cifs_addr_ops;
+
+		if(isNewInode)
+			return; /* No sense invalidating pages for new inode since we
+					   have not started caching readahead file data yet */
+
+		if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
+			(local_size == tmp_inode->i_size)) {
+			cFYI(1, ("inode exists but unchanged"));
+		} else {
+			/* file may have changed on server */
+			cFYI(1, ("invalidate inode, readdir detected change"));
+			invalidate_remote_inode(tmp_inode);
+		}
+	} else if (S_ISDIR(tmp_inode->i_mode)) {
+		cFYI(1, ("Directory inode"));
+		tmp_inode->i_op = &cifs_dir_inode_ops;
+		tmp_inode->i_fop = &cifs_dir_ops;
+	} else if (S_ISLNK(tmp_inode->i_mode)) {
+		cFYI(1, ("Symbolic Link inode"));
+		tmp_inode->i_op = &cifs_symlink_inode_ops;
+/* tmp_inode->i_fop = *//* do not need to set to anything */
+	} else {
+		cFYI(1, ("Special inode")); 
+		init_special_inode(tmp_inode, tmp_inode->i_mode,
+				   tmp_inode->i_rdev);
+	}	
+}
+
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
 	int rc = 0;
@@ -755,6 +896,71 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		FreeXid(xid);
 		return -ENOMEM;
 	}
+	
+	if((pTcon->ses->capabilities & CAP_UNIX) && 
+		(CIFS_UNIX_POSIX_PATH_OPS_CAP &	
+			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
+		u32 oplock = 0;
+		FILE_UNIX_BASIC_INFO * pInfo = 
+			kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+		if(pInfo == NULL) {
+			rc = -ENOMEM;
+			goto mkdir_out;
+		}
+			
+		rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
+				mode, NULL /* netfid */, pInfo, &oplock,
+				full_path, cifs_sb->local_nls, 
+				cifs_sb->mnt_cifs_flags & 
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc) {
+			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			d_drop(direntry);
+		} else {
+			int obj_type;
+			if (pInfo->Type == -1) /* no return info - go query */
+				goto mkdir_get_info; 
+/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */
+			inc_nlink(inode);
+			if (pTcon->nocase)
+				direntry->d_op = &cifs_ci_dentry_ops;
+			else
+				direntry->d_op = &cifs_dentry_ops;
+
+			newinode = new_inode(inode->i_sb);
+			if (newinode == NULL)
+				goto mkdir_get_info;
+			/* Is an i_ino of zero legal? */
+			/* Are there sanity checks we can use to ensure that
+			   the server is really filling in that field? */
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+				newinode->i_ino =
+					(unsigned long)pInfo->UniqueId;
+			} /* note ino incremented to unique num in new_inode */
+			if(inode->i_sb->s_flags & MS_NOATIME)
+				newinode->i_flags |= S_NOATIME | S_NOCMTIME;
+			newinode->i_nlink = 2;
+
+			insert_inode_hash(newinode);
+			d_instantiate(direntry, newinode);
+
+			/* we already checked in POSIXCreate whether
+			   frame was long enough */
+			posix_fill_in_inode(direntry->d_inode,
+					pInfo, &obj_type, 1 /* NewInode */);
+#ifdef CONFIG_CIFS_DEBUG2
+			cFYI(1,("instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode));
+
+			if(newinode->i_nlink != 2)
+				cFYI(1,("unexpected number of links %d",
+					newinode->i_nlink));
+#endif
+		}
+		kfree(pInfo);
+		goto mkdir_out;
+	}	
+	
 	/* BB add setting the equivalent of mode via CreateX w/ACLs */
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -762,6 +968,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
+mkdir_get_info:		
 		inc_nlink(inode);
 		if (pTcon->ses->capabilities & CAP_UNIX)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
@@ -775,8 +982,10 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		else
 			direntry->d_op = &cifs_dentry_ops;
 		d_instantiate(direntry, newinode);
-		if (direntry->d_inode)
-			direntry->d_inode->i_nlink = 2;
+		 /* setting nlink not necessary except in cases where we
+		  * failed to get it from the server or was set bogus */ 
+		if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
+				direntry->d_inode->i_nlink = 2; 
 		if (cifs_sb->tcon->ses->capabilities & CAP_UNIX)
 			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
 				CIFSSMBUnixSetPerms(xid, pTcon, full_path,
@@ -812,6 +1021,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			}
 		}
 	}
+mkdir_out:	
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1339,17 +1549,17 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 					cpu_to_le32(cifsInode->cifsAttrs |
 						    ATTR_READONLY);
 			}
-		} else if ((mode & S_IWUGO) == S_IWUGO) {
-			if (cifsInode->cifsAttrs & ATTR_READONLY) {
-				set_dosattr = TRUE;
-				time_buf.Attributes =
-					cpu_to_le32(cifsInode->cifsAttrs &
-						    (~ATTR_READONLY));
-				/* Windows ignores set to zero */
-				if(time_buf.Attributes == 0)
-					time_buf.Attributes |= 
-						cpu_to_le32(ATTR_NORMAL);
-			}
+		} else if (cifsInode->cifsAttrs & ATTR_READONLY) {
+			/* If file is readonly on server, we would
+			not be able to write to it - so if any write
+			bit is enabled for user or group or other we
+			need to at least try to remove r/o dos attr */
+			set_dosattr = TRUE;
+			time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs &
+					    (~ATTR_READONLY));
+			/* Windows ignores set to zero */
+			if(time_buf.Attributes == 0)
+				time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
 		}
 		/* BB to be implemented -
 		   via Windows security descriptors or streams */
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 992e80edc72..53e304d5954 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -30,6 +30,9 @@
 #include <linux/fs.h>
 #include <asm/div64.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+#include <linux/inet.h>
+#endif
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -129,11 +132,27 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
 /* Convert string containing dotted ip address to binary form */
 /* returns 0 if invalid address */
 
-/* BB add address family, change rc to status flag and return union or for ipv6 */
-/*  will need parent to call something like inet_pton to convert ipv6 address  BB */
 int
 cifs_inet_pton(int address_family, char *cp,void *dst)
 {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+	int ret = 0;
+
+	/* calculate length by finding first slash or NULL */
+	/* BB Should we convert '/' slash to '\' here since it seems already done
+	   before this */
+	if( address_family == AF_INET ){
+		ret = in4_pton(cp, -1 /* len */, dst , '\\', NULL);	
+	} else if( address_family == AF_INET6 ){
+		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
+	}
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("address conversion returned %d for %s", ret, cp));
+#endif
+	if (ret > 0)
+		ret = 1;
+	return ret;
+#else
 	int value;
 	int digit;
 	int i;
@@ -192,6 +211,7 @@ cifs_inet_pton(int address_family, char *cp,void *dst)
 
 	*((__be32 *)dst) = *((__be32 *) bytes) | htonl(value);
 	return 1; /* success */
+#endif /* EXPERIMENTAL */	
 }
 
 /*****************************************************************************
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2a374d5215a..b5364f90d55 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -37,19 +37,19 @@ static void dump_cifs_file_struct(struct file *file, char *label)
 {
 	struct cifsFileInfo * cf;
 
-	if(file) {
+	if (file) {
 		cf = file->private_data;
-		if(cf == NULL) {
+		if (cf == NULL) {
 			cFYI(1,("empty cifs private file data"));
 			return;
 		}
-		if(cf->invalidHandle) {
+		if (cf->invalidHandle) {
 			cFYI(1,("invalid handle"));
 		}
-		if(cf->srch_inf.endOfSearch) {
+		if (cf->srch_inf.endOfSearch) {
 			cFYI(1,("end of search"));
 		}
-		if(cf->srch_inf.emptyDir) {
+		if (cf->srch_inf.emptyDir) {
 			cFYI(1,("empty dir"));
 		}
 		
@@ -77,17 +77,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 		cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode));
 		*ptmp_inode = tmp_dentry->d_inode;
 /* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
-		if(*ptmp_inode == NULL) {
+		if (*ptmp_inode == NULL) {
 			*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
-			if(*ptmp_inode == NULL)
+			if (*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
 		}
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 	} else {
 		tmp_dentry = d_alloc(file->f_path.dentry, qstring);
-		if(tmp_dentry == NULL) {
+		if (tmp_dentry == NULL) {
 			cERROR(1,("Failed allocating dentry"));
 			*ptmp_inode = NULL;
 			return rc;
@@ -98,9 +98,9 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			tmp_dentry->d_op = &cifs_ci_dentry_ops;
 		else
 			tmp_dentry->d_op = &cifs_dentry_ops;
-		if(*ptmp_inode == NULL)
+		if (*ptmp_inode == NULL)
 			return rc;
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;			
 		rc = 2;
 	}
@@ -112,7 +112,7 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 
 static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
 {
-	if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+	if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
 		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
@@ -137,7 +137,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	local_mtime = tmp_inode->i_mtime;
 	local_size  = tmp_inode->i_size;
 
-	if(new_buf_type) {
+	if (new_buf_type) {
 		FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
 
 		attr = le32_to_cpu(pfindData->ExtFileAttributes);
@@ -193,7 +193,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (attr & ATTR_DIRECTORY) {
 		*pobject_type = DT_DIR;
 		/* override default perms since we do not lock dirs */
-		if(atomic_read(&cifsInfo->inUse) == 0) {
+		if (atomic_read(&cifsInfo->inUse) == 0) {
 			tmp_inode->i_mode = cifs_sb->mnt_dir_mode;
 		}
 		tmp_inode->i_mode |= S_IFDIR;
@@ -250,25 +250,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (S_ISREG(tmp_inode->i_mode)) {
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
 		
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode
 				   since have not started caching readahead file
 				   data yet */
@@ -357,8 +357,14 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1,("unknown inode type %d",type)); 
 	}
 
-	tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
-	tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		tmp_inode->i_uid = cifs_sb->mnt_uid;
+	else
+		tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		tmp_inode->i_gid = cifs_sb->mnt_gid;
+	else
+		tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
 	tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
 
 	spin_lock(&tmp_inode->i_lock);
@@ -377,25 +383,24 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
 
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
-		
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf < 
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode since we
 					   have not started caching readahead file data yet */
 
@@ -430,34 +435,28 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL) {
 		file->private_data = 
-			kmalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
+			kzalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
 	}
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL)
 		return -ENOMEM;
-	} else {
-		memset(file->private_data,0,sizeof(struct cifsFileInfo));
-	}
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = TRUE;
 	cifsFile->srch_inf.endOfSearch = FALSE;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return -EINVAL;
 
 	pTcon = cifs_sb->tcon;
-	if(pTcon == NULL)
+	if (pTcon == NULL)
 		return -EINVAL;
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 
-	if(full_path == NULL) {
+	if (full_path == NULL) {
 		return -ENOMEM;
 	}
 
@@ -480,9 +479,9 @@ ffirst_retry:
 		&cifsFile->netfid, &cifsFile->srch_inf,
 		cifs_sb->mnt_cifs_flags & 
 			CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
-	if(rc == 0)
+	if (rc == 0)
 		cifsFile->invalidHandle = FALSE;
-	if((rc == -EOPNOTSUPP) && 
+	if ((rc == -EOPNOTSUPP) && 
 		(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
 		goto ffirst_retry;
@@ -498,7 +497,7 @@ static int cifs_unicode_bytelen(char *str)
 	__le16 * ustr = (__le16 *)str;
 
 	for(len=0;len <= PATH_MAX;len++) {
-		if(ustr[len] == 0)
+		if (ustr[len] == 0)
 			return len << 1;
 	}
 	cFYI(1,("Unicode string longer than PATH_MAX found"));
@@ -510,7 +509,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 	char * new_entry;
 	FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
 
-	if(level == SMB_FIND_FILE_INFO_STANDARD) {
+	if (level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pfData;
 		pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
 
@@ -520,12 +519,12 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 		new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
 	cFYI(1,("new entry %p old entry %p",new_entry,old_entry));
 	/* validate that new_entry is not past end of SMB */
-	if(new_entry >= end_of_smb) {
+	if (new_entry >= end_of_smb) {
 		cERROR(1,
 		      ("search entry %p began after end of SMB %p old entry %p",
 			new_entry, end_of_smb, old_entry)); 
 		return NULL;
-	} else if(((level == SMB_FIND_FILE_INFO_STANDARD) &&
+	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) ||
 		  ((level != SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
@@ -546,39 +545,39 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 	char * filename = NULL;
 	int len = 0; 
 
-	if(cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
+	if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
 		FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
-		if(cfile->srch_inf.unicode) {
+		if (cfile->srch_inf.unicode) {
 			len = cifs_unicode_bytelen(filename);
 		} else {
 			/* BB should we make this strnlen of PATH_MAX? */
 			len = strnlen(filename, 5);
 		}
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO * pFindData = 
 			(FILE_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == 
+	} else if (cfile->srch_inf.info_level == 
 			SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
 		FILE_FULL_DIRECTORY_INFO * pFindData = 
 			(FILE_FULL_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_ID_FULL_DIR_INFO) {
 		SEARCH_ID_FULL_DIR_INFO * pFindData = 
 			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == 
+	} else if (cfile->srch_inf.info_level == 
 			SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
 		FILE_BOTH_DIRECTORY_INFO * pFindData = 
 			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pFindData =
 			(FIND_FILE_STANDARD_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
@@ -587,25 +586,25 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 		cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level));
 	}
 
-	if(filename) {
-		if(cfile->srch_inf.unicode) {
+	if (filename) {
+		if (cfile->srch_inf.unicode) {
 			__le16 *ufilename = (__le16 *)filename;
-			if(len == 2) {
+			if (len == 2) {
 				/* check for . */
-				if(ufilename[0] == UNICODE_DOT)
+				if (ufilename[0] == UNICODE_DOT)
 					rc = 1;
-			} else if(len == 4) {
+			} else if (len == 4) {
 				/* check for .. */
-				if((ufilename[0] == UNICODE_DOT)
+				if ((ufilename[0] == UNICODE_DOT)
 				   &&(ufilename[1] == UNICODE_DOT))
 					rc = 2;
 			}
 		} else /* ASCII */ {
-			if(len == 1) {
-				if(filename[0] == '.') 
+			if (len == 1) {
+				if (filename[0] == '.') 
 					rc = 1;
-			} else if(len == 2) {
-				if((filename[0] == '.') && (filename[1] == '.')) 
+			} else if (len == 2) {
+				if((filename[0] == '.') && (filename[1] == '.'))
 					rc = 2;
 			}
 		}
@@ -618,20 +617,10 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
    whether we can use the cached search results from the previous search */
 static int is_dir_changed(struct file * file)
 {
-	struct inode * inode;
-	struct cifsInodeInfo *cifsInfo;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
 
-	if(file->f_path.dentry == NULL)
-		return 0;
-
-	inode = file->f_path.dentry->d_inode;
-
-	if(inode == NULL)
-		return 0;
-
-	cifsInfo = CIFS_I(inode);
-
-	if(cifsInfo->time == 0)
+	if (cifsInfo->time == 0)
 		return 1; /* directory was changed, perhaps due to unlink */
 	else
 		return 0;
@@ -654,7 +643,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	struct cifsFileInfo * cifsFile = file->private_data;
 	/* check if index in the buffer */
 	
-	if((cifsFile == NULL) || (ppCurrentEntry == NULL) || 
+	if ((cifsFile == NULL) || (ppCurrentEntry == NULL) || 
 	   (num_to_ret == NULL))
 		return -ENOENT;
 	
@@ -672,7 +661,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 #ifdef CONFIG_CIFS_DEBUG2
 	dump_cifs_file_struct(file, "In fce ");
 #endif
-	if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
+	if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
 	     is_dir_changed(file)) || 
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
@@ -681,9 +670,9 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		kfree(cifsFile->search_resume_name);
 		cifsFile->search_resume_name = NULL;
-		if(cifsFile->srch_inf.ntwrk_buf_start) {
+		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1,("freeing SMB ff cache buf on search rewind"));
-			if(cifsFile->srch_inf.smallBuf)
+			if (cifsFile->srch_inf.smallBuf)
 				cifs_small_buf_release(cifsFile->srch_inf.
 						ntwrk_buf_start);
 			else
@@ -691,7 +680,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 						ntwrk_buf_start);
 		}
 		rc = initiate_cifs_search(xid,file);
-		if(rc) {
+		if (rc) {
 			cFYI(1,("error %d reinitiating a search on rewind",rc));
 			return rc;
 		}
@@ -702,10 +691,10 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	 	cFYI(1,("calling findnext2"));
 		rc = CIFSFindNext(xid,pTcon,cifsFile->netfid, 
 				  &cifsFile->srch_inf);
-		if(rc)
+		if (rc)
 			return -ENOENT;
 	}
-	if(index_to_find < cifsFile->srch_inf.index_of_last_entry) {
+	if (index_to_find < cifsFile->srch_inf.index_of_last_entry) {
 		/* we found the buffer that contains the entry */
 		/* scan and find it */
 		int i;
@@ -851,9 +840,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL))
 		return -ENOENT;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	rc = cifs_entry_is_dot(pfindEntry,pCifsF);
 	/* skip . and .. since we added them first */
 	if(rc != 0) 
@@ -997,11 +983,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	xid = GetXid();
 
-	if(file->f_path.dentry == NULL) {
-		FreeXid(xid);
-		return -EIO;
-	}
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 	if(pTcon == NULL)
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 614175a3b02..0aaff3651d1 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -62,8 +62,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/compat.c b/fs/compat.c
index 040a8be38a4..72e5e692382 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -371,13 +371,14 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
 			fn = "?";
 	}
 
-	sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
+	sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
 	if (!isprint(buf[1]))
 		sprintf(buf, "%02x", buf[1]);
 	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
-			"cmd(%08x){%s} arg(%08x) on %s\n",
+			"cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
 			current->comm, current->pid,
 			(int)fd, (unsigned int)cmd, buf,
+			(cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
 			(unsigned int)arg, fn);
 
 	if (path)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c68b055fa26..464c04a9541 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2396,6 +2396,14 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 #define ULONG_IOCTL(cmd) \
 	{ (cmd), (ioctl_trans_handler_t)sys_ioctl },
 
+/* ioctl should not be warned about even if it's not implemented.
+   Valid reasons to use this:
+   - It is implemented with ->compat_ioctl on some device, but programs
+   call it on others too.
+   - The ioctl is not implemented in the native kernel, but programs
+   call it commonly anyways.
+   Most other reasons are not valid. */
+#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
 
 struct ioctl_trans ioctl_start[] = {
 #include <linux/compat_ioctl.h>
@@ -2594,6 +2602,8 @@ HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
+/* Not implemented in the native kernel */
+IGNORE_IOCTL(SIOCGIFCOUNT)
 HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
 HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
@@ -2617,6 +2627,15 @@ COMPATIBLE_IOCTL(LPRESET)
 /*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/
 COMPATIBLE_IOCTL(LPGETFLAGS)
 HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
+
+/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
+   but we don't want warnings on other file systems. So declare
+   them as compatible here. */
+#define VFAT_IOCTL_READDIR_BOTH32       _IOR('r', 1, struct compat_dirent[2])
+#define VFAT_IOCTL_READDIR_SHORT32      _IOR('r', 2, struct compat_dirent[2])
+
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
 };
 
 int ioctl_table_size = ARRAY_SIZE(ioctl_start);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 6f573004cd7..b00d962de83 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -140,7 +140,7 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	kset_set_kset_s(&config_subsys, kernel_subsys);
+	kobj_set_kset_s(&config_subsys, kernel_subsys);
 	err = subsystem_register(&config_subsys);
 	if (err) {
 		kmem_cache_destroy(configfs_dir_cachep);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index facd0c89be8..3d194a2be3f 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -180,7 +180,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_mapping_page(mapping, blocknr + i, NULL);
+			page = read_mapping_page_async(mapping, blocknr + i,
+									NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
diff --git a/fs/dcache.c b/fs/dcache.c
index d68631f18df..d1bf5d8aeb5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2052,12 +2052,8 @@ static void __init dcache_init(unsigned long mempages)
 	 * but it is probably not worth it because of the cache nature
 	 * of the dcache. 
 	 */
-	dentry_cache = kmem_cache_create("dentry_cache",
-					 sizeof(struct dentry),
-					 0,
-					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD),
-					 NULL, NULL);
+	dentry_cache = KMEM_CACHE(dentry,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 	
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7b324cfebcb..ec8896b264d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -374,7 +374,7 @@ static int __init debugfs_init(void)
 {
 	int retval;
 
-	kset_set_kset_s(&debug_subsys, kernel_subsys);
+	kobj_set_kset_s(&debug_subsys, kernel_subsys);
 	retval = subsystem_register(&debug_subsys);
 	if (retval)
 		return retval;
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 6fa7b0d5c04..69a94690e49 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,36 +3,19 @@ menu "Distributed Lock Manager"
 
 config DLM
 	tristate "Distributed Lock Manager (DLM)"
-	depends on SYSFS && (IPV6 || IPV6=n)
+	depends on IPV6 || IPV6=n
 	select CONFIGFS_FS
-	select IP_SCTP if DLM_SCTP
+	select IP_SCTP
 	help
-	  A general purpose distributed lock manager for kernel or userspace
-	  applications.
-
-choice
-	prompt "Select DLM communications protocol"
-	depends on DLM
-	default DLM_TCP
-	help
-	  The DLM Can use TCP or SCTP for it's network communications.
-	  SCTP supports multi-homed operations whereas TCP doesn't.
-	  However, SCTP seems to have stability problems at the moment.
-
-config DLM_TCP
-	bool "TCP/IP"
-
-config DLM_SCTP
-	bool "SCTP"
-
-endchoice
+	A general purpose distributed lock manager for kernel or userspace
+	applications.
 
 config DLM_DEBUG
 	bool "DLM debugging"
 	depends on DLM
 	help
-	  Under the debugfs mount point, the name of each lockspace will
-	  appear as a file in the "dlm" directory.  The output is the
-	  list of resource and locks the local node knows about.
+	Under the debugfs mount point, the name of each lockspace will
+	appear as a file in the "dlm" directory.  The output is the
+	list of resource and locks the local node knows about.
 
 endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 65388944eba..604cf7dc5f3 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -8,14 +8,12 @@ dlm-y :=			ast.o \
 				member.o \
 				memory.o \
 				midcomms.o \
+				lowcomms.o \
 				rcom.o \
 				recover.o \
 				recoverd.o \
 				requestqueue.o \
 				user.o \
-				util.o
+				util.o 
 dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
 
-dlm-$(CONFIG_DLM_TCP)   += lowcomms-tcp.o
-
-dlm-$(CONFIG_DLM_SCTP)  += lowcomms-sctp.o
-\ No newline at end of file
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index f91d39cb1e0..6308122890c 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
 #include "dlm_internal.h"
 #include "lock.h"
 #include "user.h"
+#include "ast.h"
 
 #define WAKE_ASTS  0
 
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8665c88e5af..822abdcd143 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -89,6 +89,7 @@ struct cluster {
 	unsigned int cl_toss_secs;
 	unsigned int cl_scan_secs;
 	unsigned int cl_log_debug;
+	unsigned int cl_protocol;
 };
 
 enum {
@@ -101,6 +102,7 @@ enum {
 	CLUSTER_ATTR_TOSS_SECS,
 	CLUSTER_ATTR_SCAN_SECS,
 	CLUSTER_ATTR_LOG_DEBUG,
+	CLUSTER_ATTR_PROTOCOL,
 };
 
 struct cluster_attribute {
@@ -159,6 +161,7 @@ CLUSTER_ATTR(recover_timer, 1);
 CLUSTER_ATTR(toss_secs, 1);
 CLUSTER_ATTR(scan_secs, 1);
 CLUSTER_ATTR(log_debug, 0);
+CLUSTER_ATTR(protocol, 0);
 
 static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -170,6 +173,7 @@ static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
 	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
 	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
+	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
 	NULL,
 };
 
@@ -904,6 +908,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_TOSS_SECS         10
 #define DEFAULT_SCAN_SECS          5
 #define DEFAULT_LOG_DEBUG          0
+#define DEFAULT_PROTOCOL           0
 
 struct dlm_config_info dlm_config = {
 	.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -914,6 +919,7 @@ struct dlm_config_info dlm_config = {
 	.ci_recover_timer = DEFAULT_RECOVER_TIMER,
 	.ci_toss_secs = DEFAULT_TOSS_SECS,
 	.ci_scan_secs = DEFAULT_SCAN_SECS,
-	.ci_log_debug = DEFAULT_LOG_DEBUG
+	.ci_log_debug = DEFAULT_LOG_DEBUG,
+	.ci_protocol = DEFAULT_PROTOCOL
 };
 
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 1e978611a96..967cc3d72e5 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -26,6 +26,7 @@ struct dlm_config_info {
 	int ci_toss_secs;
 	int ci_scan_secs;
 	int ci_log_debug;
+	int ci_protocol;
 };
 
 extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 61d93201e1b..30994d68f6a 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -210,6 +210,9 @@ struct dlm_args {
 #define DLM_IFL_MSTCPY		0x00010000
 #define DLM_IFL_RESEND		0x00020000
 #define DLM_IFL_DEAD		0x00040000
+#define DLM_IFL_OVERLAP_UNLOCK  0x00080000
+#define DLM_IFL_OVERLAP_CANCEL  0x00100000
+#define DLM_IFL_ENDOFLIFE	0x00200000
 #define DLM_IFL_USER		0x00000001
 #define DLM_IFL_ORPHAN		0x00000002
 
@@ -230,8 +233,8 @@ struct dlm_lkb {
 	int8_t			lkb_grmode;	/* granted lock mode */
 	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
-
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
+	int8_t			lkb_wait_count;
 	int8_t			lkb_ast_type;	/* type of ast queued for */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
@@ -339,6 +342,7 @@ struct dlm_header {
 #define DLM_MSG_LOOKUP		11
 #define DLM_MSG_REMOVE		12
 #define DLM_MSG_LOOKUP_REPLY	13
+#define DLM_MSG_PURGE		14
 
 struct dlm_message {
 	struct dlm_header	m_header;
@@ -440,6 +444,9 @@ struct dlm_ls {
 	struct mutex		ls_waiters_mutex;
 	struct list_head	ls_waiters;	/* lkbs needing a reply */
 
+	struct mutex		ls_orphans_mutex;
+	struct list_head	ls_orphans;
+
 	struct list_head	ls_nodes;	/* current nodes in ls */
 	struct list_head	ls_nodes_gone;	/* dead node list, recovery */
 	int			ls_num_nodes;	/* number of nodes in ls */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e725005fafd..d8d6e729f96 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -85,6 +85,7 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 				    struct dlm_message *ms);
 static int receive_extralen(struct dlm_message *ms);
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
 
 /*
  * Lock compatibilty matrix - thanks Steve
@@ -223,6 +224,16 @@ static inline int is_demoted(struct dlm_lkb *lkb)
 	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
 }
 
+static inline int is_altmode(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
+}
+
+static inline int is_granted(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_status == DLM_LKSTS_GRANTED);
+}
+
 static inline int is_remote(struct dlm_rsb *r)
 {
 	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
@@ -254,6 +265,22 @@ static inline int down_conversion(struct dlm_lkb *lkb)
 	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
 }
 
+static inline int is_overlap_unlock(struct dlm_lkb *lkb)
+{
+	return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
+}
+
+static inline int is_overlap_cancel(struct dlm_lkb *lkb)
+{
+	return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
+}
+
+static inline int is_overlap(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
+				  DLM_IFL_OVERLAP_CANCEL));
+}
+
 static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 {
 	if (is_master_copy(lkb))
@@ -267,6 +294,12 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	dlm_add_ast(lkb, AST_COMP);
 }
 
+static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	queue_cast(r, lkb,
+		   is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
+}
+
 static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
 	if (is_master_copy(lkb))
@@ -547,6 +580,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 	lkb->lkb_grmode = DLM_LOCK_IV;
 	kref_init(&lkb->lkb_ref);
 	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
 
 	get_random_bytes(&bucket, sizeof(bucket));
 	bucket &= (ls->ls_lkbtbl_size - 1);
@@ -556,7 +590,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 	/* counter can roll over so we must verify lkid is not in use */
 
 	while (lkid == 0) {
-		lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+		lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++;
 
 		list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
 				    lkb_idtbl_list) {
@@ -577,8 +611,8 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 
 static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
 {
-	uint16_t bucket = lkid & 0xFFFF;
 	struct dlm_lkb *lkb;
+	uint16_t bucket = (lkid >> 16);
 
 	list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
 		if (lkb->lkb_id == lkid)
@@ -590,7 +624,7 @@ static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
 static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
 {
 	struct dlm_lkb *lkb;
-	uint16_t bucket = lkid & 0xFFFF;
+	uint16_t bucket = (lkid >> 16);
 
 	if (bucket >= ls->ls_lkbtbl_size)
 		return -EBADSLT;
@@ -620,7 +654,7 @@ static void kill_lkb(struct kref *kref)
 
 static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
-	uint16_t bucket = lkb->lkb_id & 0xFFFF;
+	uint16_t bucket = (lkb->lkb_id >> 16);
 
 	write_lock(&ls->ls_lkbtbl[bucket].lock);
 	if (kref_put(&lkb->lkb_ref, kill_lkb)) {
@@ -735,23 +769,75 @@ static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
 	unhold_lkb(lkb);
 }
 
+static int msg_reply_type(int mstype)
+{
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+		return DLM_MSG_REQUEST_REPLY;
+	case DLM_MSG_CONVERT:
+		return DLM_MSG_CONVERT_REPLY;
+	case DLM_MSG_UNLOCK:
+		return DLM_MSG_UNLOCK_REPLY;
+	case DLM_MSG_CANCEL:
+		return DLM_MSG_CANCEL_REPLY;
+	case DLM_MSG_LOOKUP:
+		return DLM_MSG_LOOKUP_REPLY;
+	}
+	return -1;
+}
+
 /* add/remove lkb from global waiters list of lkb's waiting for
    a reply from a remote node */
 
-static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error = 0;
 
 	mutex_lock(&ls->ls_waiters_mutex);
-	if (lkb->lkb_wait_type) {
-		log_print("add_to_waiters error %d", lkb->lkb_wait_type);
+
+	if (is_overlap_unlock(lkb) ||
+	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
+		switch (mstype) {
+		case DLM_MSG_UNLOCK:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			break;
+		case DLM_MSG_CANCEL:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			break;
+		default:
+			error = -EBUSY;
+			goto out;
+		}
+		lkb->lkb_wait_count++;
+		hold_lkb(lkb);
+
+		log_debug(ls, "add overlap %x cur %d new %d count %d flags %x",
+			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
+			  lkb->lkb_wait_count, lkb->lkb_flags);
 		goto out;
 	}
+
+	DLM_ASSERT(!lkb->lkb_wait_count,
+		   dlm_print_lkb(lkb);
+		   printk("wait_count %d\n", lkb->lkb_wait_count););
+
+	lkb->lkb_wait_count++;
 	lkb->lkb_wait_type = mstype;
-	kref_get(&lkb->lkb_ref);
+	hold_lkb(lkb);
 	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
  out:
+	if (error)
+		log_error(ls, "add_to_waiters %x error %d flags %x %d %d %s",
+			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
+			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
 	mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
 }
 
 /* We clear the RESEND flag because we might be taking an lkb off the waiters
@@ -759,34 +845,85 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
    request reply on the requestqueue) between dlm_recover_waiters_pre() which
    set RESEND and dlm_recover_waiters_post() */
 
-static int _remove_from_waiters(struct dlm_lkb *lkb)
+static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 {
-	int error = 0;
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int overlap_done = 0;
 
-	if (!lkb->lkb_wait_type) {
-		log_print("remove_from_waiters error");
-		error = -EINVAL;
-		goto out;
+	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		overlap_done = 1;
+		goto out_del;
+	}
+
+	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		overlap_done = 1;
+		goto out_del;
+	}
+
+	/* N.B. type of reply may not always correspond to type of original
+	   msg due to lookup->request optimization, verify others? */
+
+	if (lkb->lkb_wait_type) {
+		lkb->lkb_wait_type = 0;
+		goto out_del;
+	}
+
+	log_error(ls, "remove_from_waiters lkid %x flags %x types %d %d",
+		  lkb->lkb_id, lkb->lkb_flags, mstype, lkb->lkb_wait_type);
+	return -1;
+
+ out_del:
+	/* the force-unlock/cancel has completed and we haven't recvd a reply
+	   to the op that was in progress prior to the unlock/cancel; we
+	   give up on any reply to the earlier op.  FIXME: not sure when/how
+	   this would happen */
+
+	if (overlap_done && lkb->lkb_wait_type) {
+		log_error(ls, "remove_from_waiters %x reply %d give up on %d",
+			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
+		lkb->lkb_wait_count--;
+		lkb->lkb_wait_type = 0;
 	}
-	lkb->lkb_wait_type = 0;
+
+	DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
+
 	lkb->lkb_flags &= ~DLM_IFL_RESEND;
-	list_del(&lkb->lkb_wait_reply);
+	lkb->lkb_wait_count--;
+	if (!lkb->lkb_wait_count)
+		list_del_init(&lkb->lkb_wait_reply);
 	unhold_lkb(lkb);
- out:
-	return error;
+	return 0;
 }
 
-static int remove_from_waiters(struct dlm_lkb *lkb)
+static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int error;
 
 	mutex_lock(&ls->ls_waiters_mutex);
-	error = _remove_from_waiters(lkb);
+	error = _remove_from_waiters(lkb, mstype);
 	mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
 }
 
+/* Handles situations where we might be processing a "fake" or "stub" reply in
+   which we can't try to take waiters_mutex again. */
+
+static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error;
+
+	if (ms != &ls->ls_stub_ms)
+		mutex_lock(&ls->ls_waiters_mutex);
+	error = _remove_from_waiters(lkb, ms->m_type);
+	if (ms != &ls->ls_stub_ms)
+		mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
 static void dir_remove(struct dlm_rsb *r)
 {
 	int to_nodeid;
@@ -988,8 +1125,14 @@ static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	_remove_lock(r, lkb);
 }
 
-static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+/* returns: 0 did nothing
+	    1 moved lock to granted
+	   -1 removed lock */
+
+static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
+	int rv = 0;
+
 	lkb->lkb_rqmode = DLM_LOCK_IV;
 
 	switch (lkb->lkb_status) {
@@ -997,6 +1140,7 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		break;
 	case DLM_LKSTS_CONVERT:
 		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		rv = 1;
 		break;
 	case DLM_LKSTS_WAITING:
 		del_lkb(r, lkb);
@@ -1004,15 +1148,17 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		/* this unhold undoes the original ref from create_lkb()
 		   so this leads to the lkb being freed */
 		unhold_lkb(lkb);
+		rv = -1;
 		break;
 	default:
 		log_print("invalid status for revert %d", lkb->lkb_status);
 	}
+	return rv;
 }
 
-static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
-	revert_lock(r, lkb);
+	return revert_lock(r, lkb);
 }
 
 static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -1055,6 +1201,50 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		queue_cast(r, lkb, 0);
 }
 
+/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
+   change the granted/requested modes.  We're munging things accordingly in
+   the process copy.
+   CONVDEADLK: our grmode may have been forced down to NL to resolve a
+   conversion deadlock
+   ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
+   compatible with other granted locks */
+
+static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
+		log_print("munge_demoted %x invalid reply type %d",
+			  lkb->lkb_id, ms->m_type);
+		return;
+	}
+
+	if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
+		log_print("munge_demoted %x invalid modes gr %d rq %d",
+			  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
+		return;
+	}
+
+	lkb->lkb_grmode = DLM_LOCK_NL;
+}
+
+static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
+	    ms->m_type != DLM_MSG_GRANT) {
+		log_print("munge_altmode %x invalid reply type %d",
+			  lkb->lkb_id, ms->m_type);
+		return;
+	}
+
+	if (lkb->lkb_exflags & DLM_LKF_ALTPR)
+		lkb->lkb_rqmode = DLM_LOCK_PR;
+	else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
+		lkb->lkb_rqmode = DLM_LOCK_CW;
+	else {
+		log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
+		dlm_print_lkb(lkb);
+	}
+}
+
 static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
 {
 	struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
@@ -1499,7 +1689,7 @@ static void process_lookup_list(struct dlm_rsb *r)
 	struct dlm_lkb *lkb, *safe;
 
 	list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
-		list_del(&lkb->lkb_rsb_lookup);
+		list_del_init(&lkb->lkb_rsb_lookup);
 		_request_lock(r, lkb);
 		schedule();
 	}
@@ -1530,7 +1720,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
 		if (!list_empty(&r->res_lookup)) {
 			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
 					 lkb_rsb_lookup);
-			list_del(&lkb->lkb_rsb_lookup);
+			list_del_init(&lkb->lkb_rsb_lookup);
 			r->res_first_lkid = lkb->lkb_id;
 			_request_lock(r, lkb);
 		} else
@@ -1614,6 +1804,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
  		      DLM_LKF_FORCEUNLOCK))
 		return -EINVAL;
 
+	if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
+		return -EINVAL;
+
 	args->flags = flags;
 	args->astparam = (long) astarg;
 	return 0;
@@ -1638,6 +1831,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 
 		if (lkb->lkb_wait_type)
 			goto out;
+
+		if (is_overlap(lkb))
+			goto out;
 	}
 
 	lkb->lkb_exflags = args->flags;
@@ -1654,35 +1850,126 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	return rv;
 }
 
+/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
+   for success */
+
+/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
+   because there may be a lookup in progress and it's valid to do
+   cancel/unlockf on it */
+
 static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 {
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int rv = -EINVAL;
 
-	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
+		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
+		dlm_print_lkb(lkb);
 		goto out;
+	}
 
-	if (args->flags & DLM_LKF_FORCEUNLOCK)
-		goto out_ok;
+	/* an lkb may still exist even though the lock is EOL'ed due to a
+	   cancel, unlock or failed noqueue request; an app can't use these
+	   locks; return same error as if the lkid had not been found at all */
 
-	if (args->flags & DLM_LKF_CANCEL &&
-	    lkb->lkb_status == DLM_LKSTS_GRANTED)
+	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
+		rv = -ENOENT;
 		goto out;
+	}
 
-	if (!(args->flags & DLM_LKF_CANCEL) &&
-	    lkb->lkb_status != DLM_LKSTS_GRANTED)
-		goto out;
+	/* an lkb may be waiting for an rsb lookup to complete where the
+	   lookup was initiated by another lock */
+
+	if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
+		if (!list_empty(&lkb->lkb_rsb_lookup)) {
+			log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
+			list_del_init(&lkb->lkb_rsb_lookup);
+			queue_cast(lkb->lkb_resource, lkb,
+				   args->flags & DLM_LKF_CANCEL ?
+				   -DLM_ECANCEL : -DLM_EUNLOCK);
+			unhold_lkb(lkb); /* undoes create_lkb() */
+			rv = -EBUSY;
+			goto out;
+		}
+	}
+
+	/* cancel not allowed with another cancel/unlock in progress */
+
+	if (args->flags & DLM_LKF_CANCEL) {
+		if (lkb->lkb_exflags & DLM_LKF_CANCEL)
+			goto out;
+
+		if (is_overlap(lkb))
+			goto out;
+
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			rv = -EBUSY;
+			goto out;
+		}
+
+		switch (lkb->lkb_wait_type) {
+		case DLM_MSG_LOOKUP:
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			rv = -EBUSY;
+			goto out;
+		case DLM_MSG_UNLOCK:
+		case DLM_MSG_CANCEL:
+			goto out;
+		}
+		/* add_to_waiters() will set OVERLAP_CANCEL */
+		goto out_ok;
+	}
+
+	/* do we need to allow a force-unlock if there's a normal unlock
+	   already in progress?  in what conditions could the normal unlock
+	   fail such that we'd want to send a force-unlock to be sure? */
+
+	if (args->flags & DLM_LKF_FORCEUNLOCK) {
+		if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
+			goto out;
+
+		if (is_overlap_unlock(lkb))
+			goto out;
 
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			rv = -EBUSY;
+			goto out;
+		}
+
+		switch (lkb->lkb_wait_type) {
+		case DLM_MSG_LOOKUP:
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			rv = -EBUSY;
+			goto out;
+		case DLM_MSG_UNLOCK:
+			goto out;
+		}
+		/* add_to_waiters() will set OVERLAP_UNLOCK */
+		goto out_ok;
+	}
+
+	/* normal unlock not allowed if there's any op in progress */
 	rv = -EBUSY;
-	if (lkb->lkb_wait_type)
+	if (lkb->lkb_wait_type || lkb->lkb_wait_count)
 		goto out;
 
  out_ok:
-	lkb->lkb_exflags = args->flags;
+	/* an overlapping op shouldn't blow away exflags from other op */
+	lkb->lkb_exflags |= args->flags;
 	lkb->lkb_sbflags = 0;
 	lkb->lkb_astparam = args->astparam;
-
 	rv = 0;
  out:
+	if (rv)
+		log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
+			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
+			  args->flags, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
 	return rv;
 }
 
@@ -1732,9 +2019,24 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		goto out;
 	}
 
-	if (can_be_queued(lkb)) {
-		if (is_demoted(lkb))
+	/* is_demoted() means the can_be_granted() above set the grmode
+	   to NL, and left us on the granted queue.  This auto-demotion
+	   (due to CONVDEADLK) might mean other locks, and/or this lock, are
+	   now grantable.  We have to try to grant other converting locks
+	   before we try again to grant this one. */
+
+	if (is_demoted(lkb)) {
+		grant_pending_convert(r, DLM_LOCK_IV);
+		if (_can_be_granted(r, lkb, 1)) {
+			grant_lock(r, lkb);
+			queue_cast(r, lkb, 0);
 			grant_pending_locks(r);
+			goto out;
+		}
+		/* else fall through and move to convert queue */
+	}
+
+	if (can_be_queued(lkb)) {
 		error = -EINPROGRESS;
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
@@ -1759,17 +2061,19 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	return -DLM_EUNLOCK;
 }
 
-/* FIXME: if revert_lock() finds that the lkb is granted, we should
-   skip the queue_cast(ECANCEL).  It indicates that the request/convert
-   completed (and queued a normal ast) just before the cancel; we don't
-   want to clobber the sb_result for the normal ast with ECANCEL. */
+/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
  
 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
-	revert_lock(r, lkb);
-	queue_cast(r, lkb, -DLM_ECANCEL);
-	grant_pending_locks(r);
-	return -DLM_ECANCEL;
+	int error;
+
+	error = revert_lock(r, lkb);
+	if (error) {
+		queue_cast(r, lkb, -DLM_ECANCEL);
+		grant_pending_locks(r);
+		return -DLM_ECANCEL;
+	}
+	return 0;
 }
 
 /*
@@ -2035,6 +2339,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
 
 	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
 		error = 0;
+	if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
+		error = 0;
  out_put:
 	dlm_put_lkb(lkb);
  out:
@@ -2065,31 +2371,14 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
  * receive_lookup_reply		send_lookup_reply
  */
 
-static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
-			  int to_nodeid, int mstype,
-			  struct dlm_message **ms_ret,
-			  struct dlm_mhandle **mh_ret)
+static int _create_message(struct dlm_ls *ls, int mb_len,
+			   int to_nodeid, int mstype,
+			   struct dlm_message **ms_ret,
+			   struct dlm_mhandle **mh_ret)
 {
 	struct dlm_message *ms;
 	struct dlm_mhandle *mh;
 	char *mb;
-	int mb_len = sizeof(struct dlm_message);
-
-	switch (mstype) {
-	case DLM_MSG_REQUEST:
-	case DLM_MSG_LOOKUP:
-	case DLM_MSG_REMOVE:
-		mb_len += r->res_length;
-		break;
-	case DLM_MSG_CONVERT:
-	case DLM_MSG_UNLOCK:
-	case DLM_MSG_REQUEST_REPLY:
-	case DLM_MSG_CONVERT_REPLY:
-	case DLM_MSG_GRANT:
-		if (lkb && lkb->lkb_lvbptr)
-			mb_len += r->res_ls->ls_lvblen;
-		break;
-	}
 
 	/* get_buffer gives us a message handle (mh) that we need to
 	   pass into lowcomms_commit and a message buffer (mb) that we
@@ -2104,7 +2393,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	ms = (struct dlm_message *) mb;
 
 	ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
-	ms->m_header.h_lockspace = r->res_ls->ls_global_id;
+	ms->m_header.h_lockspace = ls->ls_global_id;
 	ms->m_header.h_nodeid = dlm_our_nodeid();
 	ms->m_header.h_length = mb_len;
 	ms->m_header.h_cmd = DLM_MSG;
@@ -2116,6 +2405,33 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	return 0;
 }
 
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  int to_nodeid, int mstype,
+			  struct dlm_message **ms_ret,
+			  struct dlm_mhandle **mh_ret)
+{
+	int mb_len = sizeof(struct dlm_message);
+
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+	case DLM_MSG_REMOVE:
+		mb_len += r->res_length;
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (lkb && lkb->lkb_lvbptr)
+			mb_len += r->res_ls->ls_lvblen;
+		break;
+	}
+
+	return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
+			       ms_ret, mh_ret);
+}
+
 /* further lowcomms enhancements or alternate implementations may make
    the return value from this function useful at some point */
 
@@ -2176,7 +2492,9 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
 	struct dlm_mhandle *mh;
 	int to_nodeid, error;
 
-	add_to_waiters(lkb, mstype);
+	error = add_to_waiters(lkb, mstype);
+	if (error)
+		return error;
 
 	to_nodeid = r->res_nodeid;
 
@@ -2192,7 +2510,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
 	return 0;
 
  fail:
-	remove_from_waiters(lkb);
+	remove_from_waiters(lkb, msg_reply_type(mstype));
 	return error;
 }
 
@@ -2209,7 +2527,8 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 
 	/* down conversions go without a reply from the master */
 	if (!error && down_conversion(lkb)) {
-		remove_from_waiters(lkb);
+		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+		r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
 		r->res_ls->ls_stub_ms.m_result = 0;
 		r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
@@ -2280,7 +2599,9 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	struct dlm_mhandle *mh;
 	int to_nodeid, error;
 
-	add_to_waiters(lkb, DLM_MSG_LOOKUP);
+	error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
+	if (error)
+		return error;
 
 	to_nodeid = dlm_dir_nodeid(r);
 
@@ -2296,7 +2617,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	return 0;
 
  fail:
-	remove_from_waiters(lkb);
+	remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
 	return error;
 }
 
@@ -2656,6 +2977,8 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
 	lock_rsb(r);
 
 	receive_flags_reply(lkb, ms);
+	if (is_altmode(lkb))
+		munge_altmode(lkb, ms);
 	grant_lock_pc(r, lkb, ms);
 	queue_cast(r, lkb, 0);
 
@@ -2736,11 +3059,16 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
 	dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
 }
 
+static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	do_purge(ls, ms->m_nodeid, ms->m_pid);
+}
+
 static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 {
 	struct dlm_lkb *lkb;
 	struct dlm_rsb *r;
-	int error, mstype;
+	int error, mstype, result;
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
@@ -2749,20 +3077,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	mstype = lkb->lkb_wait_type;
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_request_reply not on waiters");
-		goto out;
-	}
-
-	/* this is the value returned from do_request() on the master */
-	error = ms->m_result;
-
 	r = lkb->lkb_resource;
 	hold_rsb(r);
 	lock_rsb(r);
 
+	mstype = lkb->lkb_wait_type;
+	error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
+	if (error)
+		goto out;
+
 	/* Optimization: the dir node was also the master, so it took our
 	   lookup as a request and sent request reply instead of lookup reply */
 	if (mstype == DLM_MSG_LOOKUP) {
@@ -2770,14 +3093,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		lkb->lkb_nodeid = r->res_nodeid;
 	}
 
-	switch (error) {
+	/* this is the value returned from do_request() on the master */
+	result = ms->m_result;
+
+	switch (result) {
 	case -EAGAIN:
-		/* request would block (be queued) on remote master;
-		   the unhold undoes the original ref from create_lkb()
-		   so it leads to the lkb being freed */
+		/* request would block (be queued) on remote master */
 		queue_cast(r, lkb, -EAGAIN);
 		confirm_master(r, -EAGAIN);
-		unhold_lkb(lkb);
+		unhold_lkb(lkb); /* undoes create_lkb() */
 		break;
 
 	case -EINPROGRESS:
@@ -2785,41 +3109,64 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		/* request was queued or granted on remote master */
 		receive_flags_reply(lkb, ms);
 		lkb->lkb_remid = ms->m_lkid;
-		if (error)
+		if (is_altmode(lkb))
+			munge_altmode(lkb, ms);
+		if (result)
 			add_lkb(r, lkb, DLM_LKSTS_WAITING);
 		else {
 			grant_lock_pc(r, lkb, ms);
 			queue_cast(r, lkb, 0);
 		}
-		confirm_master(r, error);
+		confirm_master(r, result);
 		break;
 
 	case -EBADR:
 	case -ENOTBLK:
 		/* find_rsb failed to find rsb or rsb wasn't master */
+		log_debug(ls, "receive_request_reply %x %x master diff %d %d",
+			  lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result);
 		r->res_nodeid = -1;
 		lkb->lkb_nodeid = -1;
-		_request_lock(r, lkb);
+
+		if (is_overlap(lkb)) {
+			/* we'll ignore error in cancel/unlock reply */
+			queue_cast_overlap(r, lkb);
+			unhold_lkb(lkb); /* undoes create_lkb() */
+		} else
+			_request_lock(r, lkb);
 		break;
 
 	default:
-		log_error(ls, "receive_request_reply error %d", error);
+		log_error(ls, "receive_request_reply %x error %d",
+			  lkb->lkb_id, result);
 	}
 
+	if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
+		log_debug(ls, "receive_request_reply %x result %d unlock",
+			  lkb->lkb_id, result);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		send_unlock(r, lkb);
+	} else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
+		log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		send_cancel(r, lkb);
+	} else {
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+	}
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
- out:
 	dlm_put_lkb(lkb);
 }
 
 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 				    struct dlm_message *ms)
 {
-	int error = ms->m_result;
-
 	/* this is the value returned from do_convert() on the master */
-
-	switch (error) {
+	switch (ms->m_result) {
 	case -EAGAIN:
 		/* convert would block (be queued) on remote master */
 		queue_cast(r, lkb, -EAGAIN);
@@ -2827,6 +3174,9 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 
 	case -EINPROGRESS:
 		/* convert was queued on remote master */
+		receive_flags_reply(lkb, ms);
+		if (is_demoted(lkb))
+			munge_demoted(lkb, ms);
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
 		break;
@@ -2834,24 +3184,33 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	case 0:
 		/* convert was granted on remote master */
 		receive_flags_reply(lkb, ms);
+		if (is_demoted(lkb))
+			munge_demoted(lkb, ms);
 		grant_lock_pc(r, lkb, ms);
 		queue_cast(r, lkb, 0);
 		break;
 
 	default:
-		log_error(r->res_ls, "receive_convert_reply error %d", error);
+		log_error(r->res_ls, "receive_convert_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
 	}
 }
 
 static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
-	__receive_convert_reply(r, lkb, ms);
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
 
+	__receive_convert_reply(r, lkb, ms);
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 }
@@ -2868,37 +3227,38 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_convert_reply not on waiters");
-		goto out;
-	}
-
 	_receive_convert_reply(lkb, ms);
- out:
 	dlm_put_lkb(lkb);
 }
 
 static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
-	int error = ms->m_result;
+	int error;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
+
 	/* this is the value returned from do_unlock() on the master */
 
-	switch (error) {
+	switch (ms->m_result) {
 	case -DLM_EUNLOCK:
 		receive_flags_reply(lkb, ms);
 		remove_lock_pc(r, lkb);
 		queue_cast(r, lkb, -DLM_EUNLOCK);
 		break;
+	case -ENOENT:
+		break;
 	default:
-		log_error(r->res_ls, "receive_unlock_reply error %d", error);
+		log_error(r->res_ls, "receive_unlock_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
 	}
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 }
@@ -2915,37 +3275,39 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_unlock_reply not on waiters");
-		goto out;
-	}
-
 	_receive_unlock_reply(lkb, ms);
- out:
 	dlm_put_lkb(lkb);
 }
 
 static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
-	int error = ms->m_result;
+	int error;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
+
 	/* this is the value returned from do_cancel() on the master */
 
-	switch (error) {
+	switch (ms->m_result) {
 	case -DLM_ECANCEL:
 		receive_flags_reply(lkb, ms);
 		revert_lock_pc(r, lkb);
-		queue_cast(r, lkb, -DLM_ECANCEL);
+		if (ms->m_result)
+			queue_cast(r, lkb, -DLM_ECANCEL);
+		break;
+	case 0:
 		break;
 	default:
-		log_error(r->res_ls, "receive_cancel_reply error %d", error);
+		log_error(r->res_ls, "receive_cancel_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
 	}
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 }
@@ -2962,14 +3324,7 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_cancel_reply not on waiters");
-		goto out;
-	}
-
 	_receive_cancel_reply(lkb, ms);
- out:
 	dlm_put_lkb(lkb);
 }
 
@@ -2985,20 +3340,17 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		return;
 	}
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_lookup_reply not on waiters");
-		goto out;
-	}
-
-	/* this is the value returned by dlm_dir_lookup on dir node
+	/* ms->m_result is the value returned by dlm_dir_lookup on dir node
 	   FIXME: will a non-zero error ever be returned? */
-	error = ms->m_result;
 
 	r = lkb->lkb_resource;
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
+	if (error)
+		goto out;
+
 	ret_nodeid = ms->m_nodeid;
 	if (ret_nodeid == dlm_our_nodeid()) {
 		r->res_nodeid = 0;
@@ -3009,14 +3361,22 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		r->res_nodeid = ret_nodeid;
 	}
 
+	if (is_overlap(lkb)) {
+		log_debug(ls, "receive_lookup_reply %x unlock %x",
+			  lkb->lkb_id, lkb->lkb_flags);
+		queue_cast_overlap(r, lkb);
+		unhold_lkb(lkb); /* undoes create_lkb() */
+		goto out_list;
+	}
+
 	_request_lock(r, lkb);
 
+ out_list:
 	if (!ret_nodeid)
 		process_lookup_list(r);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
- out:
 	dlm_put_lkb(lkb);
 }
 
@@ -3133,6 +3493,12 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
 		receive_lookup_reply(ls, ms);
 		break;
 
+	/* other messages */
+
+	case DLM_MSG_PURGE:
+		receive_purge(ls, ms);
+		break;
+
 	default:
 		log_error(ls, "unknown message type %d", ms->m_type);
 	}
@@ -3153,9 +3519,9 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
 	if (middle_conversion(lkb)) {
 		hold_lkb(lkb);
+		ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
 		ls->ls_stub_ms.m_result = -EINPROGRESS;
 		ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-		_remove_from_waiters(lkb);
 		_receive_convert_reply(lkb, &ls->ls_stub_ms);
 
 		/* Same special case as in receive_rcom_lock_args() */
@@ -3227,18 +3593,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
+			ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
 			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-			_remove_from_waiters(lkb);
 			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
 			break;
 
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
+			ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
 			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-			_remove_from_waiters(lkb);
 			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
 			break;
@@ -3252,37 +3618,47 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 	mutex_unlock(&ls->ls_waiters_mutex);
 }
 
-static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
-	int rv = 0;
+	int found = 0;
 
 	mutex_lock(&ls->ls_waiters_mutex);
 	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
-			rv = lkb->lkb_wait_type;
-			_remove_from_waiters(lkb);
-			lkb->lkb_flags &= ~DLM_IFL_RESEND;
+			hold_lkb(lkb);
+			found = 1;
 			break;
 		}
 	}
 	mutex_unlock(&ls->ls_waiters_mutex);
 
-	if (!rv)
+	if (!found)
 		lkb = NULL;
-	*lkb_ret = lkb;
-	return rv;
+	return lkb;
 }
 
 /* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
    master or dir-node for r.  Processing the lkb may result in it being placed
    back on waiters. */
 
+/* We do this after normal locking has been enabled and any saved messages
+   (in requestqueue) have been processed.  We should be confident that at
+   this point we won't get or process a reply to any of these waiting
+   operations.  But, new ops may be coming in on the rsbs/locks here from
+   userspace or remotely. */
+
+/* there may have been an overlap unlock/cancel prior to recovery or after
+   recovery.  if before, the lkb may still have a pos wait_count; if after, the
+   overlap flag would just have been set and nothing new sent.  we can be
+   confident here than any replies to either the initial op or overlap ops
+   prior to recovery have been received. */
+
 int dlm_recover_waiters_post(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
 	struct dlm_rsb *r;
-	int error = 0, mstype;
+	int error = 0, mstype, err, oc, ou;
 
 	while (1) {
 		if (dlm_locking_stopped(ls)) {
@@ -3291,48 +3667,78 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
 			break;
 		}
 
-		mstype = remove_resend_waiter(ls, &lkb);
-		if (!mstype)
+		lkb = find_resend_waiter(ls);
+		if (!lkb)
 			break;
 
 		r = lkb->lkb_resource;
+		hold_rsb(r);
+		lock_rsb(r);
+
+		mstype = lkb->lkb_wait_type;
+		oc = is_overlap_cancel(lkb);
+		ou = is_overlap_unlock(lkb);
+		err = 0;
 
 		log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
 			  lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
 
-		switch (mstype) {
-
-		case DLM_MSG_LOOKUP:
-			hold_rsb(r);
-			lock_rsb(r);
-			_request_lock(r, lkb);
-			if (is_master(r))
-				confirm_master(r, 0);
-			unlock_rsb(r);
-			put_rsb(r);
-			break;
-
-		case DLM_MSG_REQUEST:
-			hold_rsb(r);
-			lock_rsb(r);
-			_request_lock(r, lkb);
-			if (is_master(r))
-				confirm_master(r, 0);
-			unlock_rsb(r);
-			put_rsb(r);
-			break;
-
-		case DLM_MSG_CONVERT:
-			hold_rsb(r);
-			lock_rsb(r);
-			_convert_lock(r, lkb);
-			unlock_rsb(r);
-			put_rsb(r);
-			break;
-
-		default:
-			log_error(ls, "recover_waiters_post type %d", mstype);
+		/* At this point we assume that we won't get a reply to any
+		   previous op or overlap op on this lock.  First, do a big
+		   remove_from_waiters() for all previous ops. */
+
+		lkb->lkb_flags &= ~DLM_IFL_RESEND;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_wait_type = 0;
+		lkb->lkb_wait_count = 0;
+		mutex_lock(&ls->ls_waiters_mutex);
+		list_del_init(&lkb->lkb_wait_reply);
+		mutex_unlock(&ls->ls_waiters_mutex);
+		unhold_lkb(lkb); /* for waiters list */
+
+		if (oc || ou) {
+			/* do an unlock or cancel instead of resending */
+			switch (mstype) {
+			case DLM_MSG_LOOKUP:
+			case DLM_MSG_REQUEST:
+				queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
+							-DLM_ECANCEL);
+				unhold_lkb(lkb); /* undoes create_lkb() */
+				break;
+			case DLM_MSG_CONVERT:
+				if (oc) {
+					queue_cast(r, lkb, -DLM_ECANCEL);
+				} else {
+					lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
+					_unlock_lock(r, lkb);
+				}
+				break;
+			default:
+				err = 1;
+			}
+		} else {
+			switch (mstype) {
+			case DLM_MSG_LOOKUP:
+			case DLM_MSG_REQUEST:
+				_request_lock(r, lkb);
+				if (is_master(r))
+					confirm_master(r, 0);
+				break;
+			case DLM_MSG_CONVERT:
+				_convert_lock(r, lkb);
+				break;
+			default:
+				err = 1;
+			}
 		}
+
+		if (err)
+			log_error(ls, "recover_waiters_post %x %d %x %d %d",
+			  	  lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
+		unlock_rsb(r);
+		put_rsb(r);
+		dlm_put_lkb(lkb);
 	}
 
 	return error;
@@ -3684,7 +4090,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 
 	/* add this new lkb to the per-process list of locks */
 	spin_lock(&ua->proc->locks_spin);
-	kref_get(&lkb->lkb_ref);
+	hold_lkb(lkb);
 	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
 	spin_unlock(&ua->proc->locks_spin);
  out:
@@ -3774,6 +4180,9 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 
 	if (error == -DLM_EUNLOCK)
 		error = 0;
+	/* from validate_unlock_args() */
+	if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
+		error = 0;
 	if (error)
 		goto out_put;
 
@@ -3786,6 +4195,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	dlm_put_lkb(lkb);
  out:
 	unlock_recovery(ls);
+	kfree(ua_tmp);
 	return error;
 }
 
@@ -3815,33 +4225,37 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 
 	if (error == -DLM_ECANCEL)
 		error = 0;
-	if (error)
-		goto out_put;
-
-	/* this lkb was removed from the WAITING queue */
-	if (lkb->lkb_grmode == DLM_LOCK_IV) {
-		spin_lock(&ua->proc->locks_spin);
-		list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
-		spin_unlock(&ua->proc->locks_spin);
-	}
+	/* from validate_unlock_args() */
+	if (error == -EBUSY)
+		error = 0;
  out_put:
 	dlm_put_lkb(lkb);
  out:
 	unlock_recovery(ls);
+	kfree(ua_tmp);
 	return error;
 }
 
+/* lkb's that are removed from the waiters list by revert are just left on the
+   orphans list with the granted orphan locks, to be freed by purge */
+
 static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
 	struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+	struct dlm_args args;
+	int error;
 
-	if (ua->lksb.sb_lvbptr)
-		kfree(ua->lksb.sb_lvbptr);
-	kfree(ua);
-	lkb->lkb_astparam = (long)NULL;
+	hold_lkb(lkb);
+	mutex_lock(&ls->ls_orphans_mutex);
+	list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
+	mutex_unlock(&ls->ls_orphans_mutex);
 
-	/* TODO: propogate to master if needed */
-	return 0;
+	set_unlock_args(0, ua, &args);
+
+	error = cancel_lock(ls, lkb, &args);
+	if (error == -DLM_ECANCEL)
+		error = 0;
+	return error;
 }
 
 /* The force flag allows the unlock to go ahead even if the lkb isn't granted.
@@ -3853,10 +4267,6 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 	struct dlm_args args;
 	int error;
 
-	/* FIXME: we need to handle the case where the lkb is in limbo
-	   while the rsb is being looked up, currently we assert in
-	   _unlock_lock/is_remote because rsb nodeid is -1. */
-
 	set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
 
 	error = unlock_lock(ls, lkb, &args);
@@ -3865,6 +4275,31 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 	return error;
 }
 
+/* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
+   (which does lock_rsb) due to deadlock with receiving a message that does
+   lock_rsb followed by dlm_user_add_ast() */
+
+static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
+				     struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb = NULL;
+
+	mutex_lock(&ls->ls_clear_proc_locks);
+	if (list_empty(&proc->locks))
+		goto out;
+
+	lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
+	list_del_init(&lkb->lkb_ownqueue);
+
+	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
+		lkb->lkb_flags |= DLM_IFL_ORPHAN;
+	else
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+ out:
+	mutex_unlock(&ls->ls_clear_proc_locks);
+	return lkb;
+}
+
 /* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
    1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
    which we clear here. */
@@ -3880,18 +4315,15 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 	struct dlm_lkb *lkb, *safe;
 
 	lock_recovery(ls);
-	mutex_lock(&ls->ls_clear_proc_locks);
 
-	list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
-		list_del_init(&lkb->lkb_ownqueue);
-
-		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
-			lkb->lkb_flags |= DLM_IFL_ORPHAN;
+	while (1) {
+		lkb = del_proc_lock(ls, proc);
+		if (!lkb)
+			break;
+		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
 			orphan_proc_lock(ls, lkb);
-		} else {
-			lkb->lkb_flags |= DLM_IFL_DEAD;
+		else
 			unlock_proc_lock(ls, lkb);
-		}
 
 		/* this removes the reference for the proc->locks list
 		   added by dlm_user_request, it may result in the lkb
@@ -3900,6 +4332,8 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 		dlm_put_lkb(lkb);
 	}
 
+	mutex_lock(&ls->ls_clear_proc_locks);
+
 	/* in-progress unlocks */
 	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
 		list_del_init(&lkb->lkb_ownqueue);
@@ -3916,3 +4350,92 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 	unlock_recovery(ls);
 }
 
+static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	while (1) {
+		lkb = NULL;
+		spin_lock(&proc->locks_spin);
+		if (!list_empty(&proc->locks)) {
+			lkb = list_entry(proc->locks.next, struct dlm_lkb,
+					 lkb_ownqueue);
+			list_del_init(&lkb->lkb_ownqueue);
+		}
+		spin_unlock(&proc->locks_spin);
+
+		if (!lkb)
+			break;
+
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+		unlock_proc_lock(ls, lkb);
+		dlm_put_lkb(lkb); /* ref from proc->locks list */
+	}
+
+	spin_lock(&proc->locks_spin);
+	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
+		list_del_init(&lkb->lkb_ownqueue);
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+		dlm_put_lkb(lkb);
+	}
+	spin_unlock(&proc->locks_spin);
+
+	spin_lock(&proc->asts_spin);
+	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
+		list_del(&lkb->lkb_astqueue);
+		dlm_put_lkb(lkb);
+	}
+	spin_unlock(&proc->asts_spin);
+}
+
+/* pid of 0 means purge all orphans */
+
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	mutex_lock(&ls->ls_orphans_mutex);
+	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
+		if (pid && lkb->lkb_ownpid != pid)
+			continue;
+		unlock_proc_lock(ls, lkb);
+		list_del_init(&lkb->lkb_ownqueue);
+		dlm_put_lkb(lkb);
+	}
+	mutex_unlock(&ls->ls_orphans_mutex);
+}
+
+static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int error;
+
+	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
+				DLM_MSG_PURGE, &ms, &mh);
+	if (error)
+		return error;
+	ms->m_nodeid = nodeid;
+	ms->m_pid = pid;
+
+	return send_message(mh, ms);
+}
+
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+		   int nodeid, int pid)
+{
+	int error = 0;
+
+	if (nodeid != dlm_our_nodeid()) {
+		error = send_purge(ls, nodeid, pid);
+	} else {
+		lock_recovery(ls);
+		if (pid == current->pid)
+			purge_proc_locks(ls, proc);
+		else
+			do_purge(ls, nodeid, pid);
+		unlock_recovery(ls);
+	}
+	return error;
+}
+
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 0843a3073ec..64fc4ec4066 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -41,6 +41,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	uint32_t flags, uint32_t lkid, char *lvb_in);
 int dlm_user_cancel(struct dlm_ls *ls,  struct dlm_user_args *ua_tmp,
 	uint32_t flags, uint32_t lkid);
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+	int nodeid, int pid);
 void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
 
 static inline int is_master(struct dlm_rsb *r)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f40817b53c6..a677b2a5eed 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -167,7 +167,6 @@ static struct kobj_type dlm_ktype = {
 };
 
 static struct kset dlm_kset = {
-	.subsys = &kernel_subsys,
 	.kobj   = {.name = "dlm",},
 	.ktype  = &dlm_ktype,
 };
@@ -218,6 +217,7 @@ int dlm_lockspace_init(void)
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
+	kobj_set_kset_s(&dlm_kset, kernel_subsys);
 	error = kset_register(&dlm_kset);
 	if (error)
 		printk("dlm_lockspace_init: cannot register kset %d\n", error);
@@ -459,6 +459,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 
 	INIT_LIST_HEAD(&ls->ls_waiters);
 	mutex_init(&ls->ls_waiters_mutex);
+	INIT_LIST_HEAD(&ls->ls_orphans);
+	mutex_init(&ls->ls_orphans_mutex);
 
 	INIT_LIST_HEAD(&ls->ls_nodes);
 	INIT_LIST_HEAD(&ls->ls_nodes_gone);
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
deleted file mode 100644
index dc83a9d979b..00000000000
--- a/fs/dlm/lowcomms-sctp.c
+++ /dev/null
@@ -1,1210 +0,0 @@
-/******************************************************************************
-*******************************************************************************
-**
-**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
-**
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
-**
-*******************************************************************************
-******************************************************************************/
-
-/*
- * lowcomms.c
- *
- * This is the "low-level" comms layer.
- *
- * It is responsible for sending/receiving messages
- * from other nodes in the cluster.
- *
- * Cluster nodes are referred to by their nodeids. nodeids are
- * simply 32 bit numbers to the locking module - if they need to
- * be expanded for the cluster infrastructure then that is it's
- * responsibility. It is this layer's
- * responsibility to resolve these into IP address or
- * whatever it needs for inter-node communication.
- *
- * The comms level is two kernel threads that deal mainly with
- * the receiving of messages from other nodes and passing them
- * up to the mid-level comms layer (which understands the
- * message format) for execution by the locking core, and
- * a send thread which does all the setting up of connections
- * to remote nodes and the sending of data. Threads are not allowed
- * to send their own data because it may cause them to wait in times
- * of high load. Also, this way, the sending thread can collect together
- * messages bound for one node and send them in one block.
- *
- * I don't see any problem with the recv thread executing the locking
- * code on behalf of remote processes as the locking code is
- * short, efficient and never (well, hardly ever) waits.
- *
- */
-
-#include <asm/ioctls.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-#include <net/sctp/user.h>
-#include <linux/pagemap.h>
-#include <linux/socket.h>
-#include <linux/idr.h>
-
-#include "dlm_internal.h"
-#include "lowcomms.h"
-#include "config.h"
-#include "midcomms.h"
-
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
-static int			dlm_local_count;
-static int			dlm_local_nodeid;
-
-/* One of these per connected node */
-
-#define NI_INIT_PENDING 1
-#define NI_WRITE_PENDING 2
-
-struct nodeinfo {
-	spinlock_t		lock;
-	sctp_assoc_t		assoc_id;
-	unsigned long		flags;
-	struct list_head	write_list; /* nodes with pending writes */
-	struct list_head	writequeue; /* outgoing writequeue_entries */
-	spinlock_t		writequeue_lock;
-	int			nodeid;
-	struct work_struct      swork; /* Send workqueue */
-	struct work_struct      lwork; /* Locking workqueue */
-};
-
-static DEFINE_IDR(nodeinfo_idr);
-static DECLARE_RWSEM(nodeinfo_lock);
-static int max_nodeid;
-
-struct cbuf {
-	unsigned int base;
-	unsigned int len;
-	unsigned int mask;
-};
-
-/* Just the one of these, now. But this struct keeps
-   the connection-specific variables together */
-
-#define CF_READ_PENDING 1
-
-struct connection {
-	struct socket           *sock;
-	unsigned long		flags;
-	struct page             *rx_page;
-	atomic_t		waiting_requests;
-	struct cbuf		cb;
-	int                     eagain_flag;
-	struct work_struct      work; /* Send workqueue */
-};
-
-/* An entry waiting to be sent */
-
-struct writequeue_entry {
-	struct list_head	list;
-	struct page             *page;
-	int			offset;
-	int			len;
-	int			end;
-	int			users;
-	struct nodeinfo         *ni;
-};
-
-static void cbuf_add(struct cbuf *cb, int n)
-{
-	cb->len += n;
-}
-
-static int cbuf_data(struct cbuf *cb)
-{
-	return ((cb->base + cb->len) & cb->mask);
-}
-
-static void cbuf_init(struct cbuf *cb, int size)
-{
-	cb->base = cb->len = 0;
-	cb->mask = size-1;
-}
-
-static void cbuf_eat(struct cbuf *cb, int n)
-{
-	cb->len  -= n;
-	cb->base += n;
-	cb->base &= cb->mask;
-}
-
-/* List of nodes which have writes pending */
-static LIST_HEAD(write_nodes);
-static DEFINE_SPINLOCK(write_nodes_lock);
-
-
-/* Maximum number of incoming messages to process before
- * doing a schedule()
- */
-#define MAX_RX_MSG_COUNT 25
-
-/* Work queues */
-static struct workqueue_struct *recv_workqueue;
-static struct workqueue_struct *send_workqueue;
-static struct workqueue_struct *lock_workqueue;
-
-/* The SCTP connection */
-static struct connection sctp_con;
-
-static void process_send_sockets(struct work_struct *work);
-static void process_recv_sockets(struct work_struct *work);
-static void process_lock_request(struct work_struct *work);
-
-static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
-{
-	struct sockaddr_storage addr;
-	int error;
-
-	if (!dlm_local_count)
-		return -1;
-
-	error = dlm_nodeid_to_addr(nodeid, &addr);
-	if (error)
-		return error;
-
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-		struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
-		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
-		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
-	} else {
-		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
-		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
-		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
-		       sizeof(in6->sin6_addr));
-	}
-
-	return 0;
-}
-
-/* If alloc is 0 here we will not attempt to allocate a new
-   nodeinfo struct */
-static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
-{
-	struct nodeinfo *ni;
-	int r;
-	int n;
-
-	down_read(&nodeinfo_lock);
-	ni = idr_find(&nodeinfo_idr, nodeid);
-	up_read(&nodeinfo_lock);
-
-	if (ni || !alloc)
-		return ni;
-
-	down_write(&nodeinfo_lock);
-
-	ni = idr_find(&nodeinfo_idr, nodeid);
-	if (ni)
-		goto out_up;
-
-	r = idr_pre_get(&nodeinfo_idr, alloc);
-	if (!r)
-		goto out_up;
-
-	ni = kmalloc(sizeof(struct nodeinfo), alloc);
-	if (!ni)
-		goto out_up;
-
-	r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
-	if (r) {
-		kfree(ni);
-		ni = NULL;
-		goto out_up;
-	}
-	if (n != nodeid) {
-		idr_remove(&nodeinfo_idr, n);
-		kfree(ni);
-		ni = NULL;
-		goto out_up;
-	}
-	memset(ni, 0, sizeof(struct nodeinfo));
-	spin_lock_init(&ni->lock);
-	INIT_LIST_HEAD(&ni->writequeue);
-	spin_lock_init(&ni->writequeue_lock);
-	INIT_WORK(&ni->lwork, process_lock_request);
-	INIT_WORK(&ni->swork, process_send_sockets);
-	ni->nodeid = nodeid;
-
-	if (nodeid > max_nodeid)
-		max_nodeid = nodeid;
-out_up:
-	up_write(&nodeinfo_lock);
-
-	return ni;
-}
-
-/* Don't call this too often... */
-static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (ni && ni->assoc_id == assoc)
-			return ni;
-	}
-	return NULL;
-}
-
-/* Data or notification available on socket */
-static void lowcomms_data_ready(struct sock *sk, int count_unused)
-{
-	if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
-		queue_work(recv_workqueue, &sctp_con.work);
-}
-
-
-/* Add the port number to an IP6 or 4 sockaddr and return the address length.
-   Also padd out the struct with zeros to make comparisons meaningful */
-
-static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
-			  int *addr_len)
-{
-	struct sockaddr_in *local4_addr;
-	struct sockaddr_in6 *local6_addr;
-
-	if (!dlm_local_count)
-		return;
-
-	if (!port) {
-		if (dlm_local_addr[0]->ss_family == AF_INET) {
-			local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
-			port = be16_to_cpu(local4_addr->sin_port);
-		} else {
-			local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
-			port = be16_to_cpu(local6_addr->sin6_port);
-		}
-	}
-
-	saddr->ss_family = dlm_local_addr[0]->ss_family;
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
-		in4_addr->sin_port = cpu_to_be16(port);
-		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
-		memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
-		       sizeof(struct sockaddr_in));
-		*addr_len = sizeof(struct sockaddr_in);
-	} else {
-		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
-		in6_addr->sin6_port = cpu_to_be16(port);
-		memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
-		       sizeof(struct sockaddr_in6));
-		*addr_len = sizeof(struct sockaddr_in6);
-	}
-}
-
-/* Close the connection and tidy up */
-static void close_connection(void)
-{
-	if (sctp_con.sock) {
-		sock_release(sctp_con.sock);
-		sctp_con.sock = NULL;
-	}
-
-	if (sctp_con.rx_page) {
-		__free_page(sctp_con.rx_page);
-		sctp_con.rx_page = NULL;
-	}
-}
-
-/* We only send shutdown messages to nodes that are not part of the cluster */
-static void send_shutdown(sctp_assoc_t associd)
-{
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-
-	outmessage.msg_name = NULL;
-	outmessage.msg_namelen = 0;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-
-	sinfo->sinfo_flags |= MSG_EOF;
-	sinfo->sinfo_assoc_id = associd;
-
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
-
-	if (ret != 0)
-		log_print("send EOF to node failed: %d", ret);
-}
-
-
-/* INIT failed but we don't know which node...
-   restart INIT on all pending nodes */
-static void init_failed(void)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (!ni)
-			continue;
-
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-			ni->assoc_id = 0;
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-	}
-}
-
-/* Something happened to an association */
-static void process_sctp_notification(struct msghdr *msg, char *buf)
-{
-	union sctp_notification *sn = (union sctp_notification *)buf;
-
-	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
-		switch (sn->sn_assoc_change.sac_state) {
-
-		case SCTP_COMM_UP:
-		case SCTP_RESTART:
-		{
-			/* Check that the new node is in the lockspace */
-			struct sctp_prim prim;
-			mm_segment_t fs;
-			int nodeid;
-			int prim_len, ret;
-			int addr_len;
-			struct nodeinfo *ni;
-
-			/* This seems to happen when we received a connection
-			 * too early... or something...  anyway, it happens but
-			 * we always seem to get a real message too, see
-			 * receive_from_sock */
-
-			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
-				log_print("COMM_UP for invalid assoc ID %d",
-					  (int)sn->sn_assoc_change.sac_assoc_id);
-				init_failed();
-				return;
-			}
-			memset(&prim, 0, sizeof(struct sctp_prim));
-			prim_len = sizeof(struct sctp_prim);
-			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
-			fs = get_fs();
-			set_fs(get_ds());
-			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
-							     IPPROTO_SCTP,
-							     SCTP_PRIMARY_ADDR,
-							     (char*)&prim,
-							     &prim_len);
-			set_fs(fs);
-			if (ret < 0) {
-				struct nodeinfo *ni;
-
-				log_print("getsockopt/sctp_primary_addr on "
-					  "new assoc %d failed : %d",
-					  (int)sn->sn_assoc_change.sac_assoc_id,
-					  ret);
-
-				/* Retry INIT later */
-				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-				if (ni)
-					clear_bit(NI_INIT_PENDING, &ni->flags);
-				return;
-			}
-			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
-			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
-				log_print("reject connect from unknown addr");
-				send_shutdown(prim.ssp_assoc_id);
-				return;
-			}
-
-			ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-			if (!ni)
-				return;
-
-			/* Save the assoc ID */
-			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
-			log_print("got new/restarted association %d nodeid %d",
-				  (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
-
-			/* Send any pending writes */
-			clear_bit(NI_INIT_PENDING, &ni->flags);
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-		break;
-
-		case SCTP_COMM_LOST:
-		case SCTP_SHUTDOWN_COMP:
-		{
-			struct nodeinfo *ni;
-
-			ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-			if (ni) {
-				spin_lock(&ni->lock);
-				ni->assoc_id = 0;
-				spin_unlock(&ni->lock);
-			}
-		}
-		break;
-
-		/* We don't know which INIT failed, so clear the PENDING flags
-		 * on them all.  if assoc_id is zero then it will then try
-		 * again */
-
-		case SCTP_CANT_STR_ASSOC:
-		{
-			log_print("Can't start SCTP association - retrying");
-			init_failed();
-		}
-		break;
-
-		default:
-			log_print("unexpected SCTP assoc change id=%d state=%d",
-				  (int)sn->sn_assoc_change.sac_assoc_id,
-				  sn->sn_assoc_change.sac_state);
-		}
-	}
-}
-
-/* Data received from remote end */
-static int receive_from_sock(void)
-{
-	int ret = 0;
-	struct msghdr msg;
-	struct kvec iov[2];
-	unsigned len;
-	int r;
-	struct sctp_sndrcvinfo *sinfo;
-	struct cmsghdr *cmsg;
-	struct nodeinfo *ni;
-
-	/* These two are marginally too big for stack allocation, but this
-	 * function is (currently) only called by dlm_recvd so static should be
-	 * OK.
-	 */
-	static struct sockaddr_storage msgname;
-	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-
-	if (sctp_con.sock == NULL)
-		goto out;
-
-	if (sctp_con.rx_page == NULL) {
-		/*
-		 * This doesn't need to be atomic, but I think it should
-		 * improve performance if it is.
-		 */
-		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
-		if (sctp_con.rx_page == NULL)
-			goto out_resched;
-		cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE);
-	}
-
-	memset(&incmsg, 0, sizeof(incmsg));
-	memset(&msgname, 0, sizeof(msgname));
-
-	msg.msg_name = &msgname;
-	msg.msg_namelen = sizeof(msgname);
-	msg.msg_flags = 0;
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	msg.msg_iovlen = 1;
-
-	/* I don't see why this circular buffer stuff is necessary for SCTP
-	 * which is a packet-based protocol, but the whole thing breaks under
-	 * load without it! The overhead is minimal (and is in the TCP lowcomms
-	 * anyway, of course) so I'll leave it in until I can figure out what's
-	 * really happening.
-	 */
-
-	/*
-	 * iov[0] is the bit of the circular buffer between the current end
-	 * point (cb.base + cb.len) and the end of the buffer.
-	 */
-	iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb);
-	iov[0].iov_base = page_address(sctp_con.rx_page) +
-		cbuf_data(&sctp_con.cb);
-	iov[1].iov_len = 0;
-
-	/*
-	 * iov[1] is the bit of the circular buffer between the start of the
-	 * buffer and the start of the currently used section (cb.base)
-	 */
-	if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) {
-		iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb);
-		iov[1].iov_len = sctp_con.cb.base;
-		iov[1].iov_base = page_address(sctp_con.rx_page);
-		msg.msg_iovlen = 2;
-	}
-	len = iov[0].iov_len + iov[1].iov_len;
-
-	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
-				 MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (ret <= 0)
-		goto out_close;
-
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	sinfo = CMSG_DATA(cmsg);
-
-	if (msg.msg_flags & MSG_NOTIFICATION) {
-		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
-		return 0;
-	}
-
-	/* Is this a new association ? */
-	ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
-	if (ni) {
-		ni->assoc_id = sinfo->sinfo_assoc_id;
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-	}
-
-	/* INIT sends a message with length of 1 - ignore it */
-	if (r == 1)
-		return 0;
-
-	cbuf_add(&sctp_con.cb, ret);
-	// PJC: TODO: Add to node's workqueue....can we ??
-	ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
-					  page_address(sctp_con.rx_page),
-					  sctp_con.cb.base, sctp_con.cb.len,
-					  PAGE_CACHE_SIZE);
-	if (ret < 0)
-		goto out_close;
-	cbuf_eat(&sctp_con.cb, ret);
-
-out:
-	ret = 0;
-	goto out_ret;
-
-out_resched:
-	lowcomms_data_ready(sctp_con.sock->sk, 0);
-	ret = 0;
-	cond_resched();
-	goto out_ret;
-
-out_close:
-	if (ret != -EAGAIN)
-		log_print("error reading from sctp socket: %d", ret);
-out_ret:
-	return ret;
-}
-
-/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
-static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
-{
-	mm_segment_t fs;
-	int result = 0;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	if (num == 1)
-		result = sctp_con.sock->ops->bind(sctp_con.sock,
-						  (struct sockaddr *) addr,
-						  addr_len);
-	else
-		result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
-							SCTP_SOCKOPT_BINDX_ADD,
-							(char *)addr, addr_len);
-	set_fs(fs);
-
-	if (result < 0)
-		log_print("Can't bind to port %d addr number %d",
-			  dlm_config.ci_tcp_port, num);
-
-	return result;
-}
-
-static void init_local(void)
-{
-	struct sockaddr_storage sas, *addr;
-	int i;
-
-	dlm_local_nodeid = dlm_our_nodeid();
-
-	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
-		if (dlm_our_addr(&sas, i))
-			break;
-
-		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
-		if (!addr)
-			break;
-		memcpy(addr, &sas, sizeof(*addr));
-		dlm_local_addr[dlm_local_count++] = addr;
-	}
-}
-
-/* Initialise SCTP socket and bind to all interfaces */
-static int init_sock(void)
-{
-	mm_segment_t fs;
-	struct socket *sock = NULL;
-	struct sockaddr_storage localaddr;
-	struct sctp_event_subscribe subscribe;
-	int result = -EINVAL, num = 1, i, addr_len;
-
-	if (!dlm_local_count) {
-		init_local();
-		if (!dlm_local_count) {
-			log_print("no local IP address has been set");
-			goto out;
-		}
-	}
-
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
-				  IPPROTO_SCTP, &sock);
-	if (result < 0) {
-		log_print("Can't create comms socket, check SCTP is loaded");
-		goto out;
-	}
-
-	/* Listen for events */
-	memset(&subscribe, 0, sizeof(subscribe));
-	subscribe.sctp_data_io_event = 1;
-	subscribe.sctp_association_event = 1;
-	subscribe.sctp_send_failure_event = 1;
-	subscribe.sctp_shutdown_event = 1;
-	subscribe.sctp_partial_delivery_event = 1;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
-				       (char *)&subscribe, sizeof(subscribe));
-	set_fs(fs);
-
-	if (result < 0) {
-		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
-			  result);
-		goto create_delsock;
-	}
-
-	/* Init con struct */
-	sock->sk->sk_user_data = &sctp_con;
-	sctp_con.sock = sock;
-	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
-
-	/* Bind to all interfaces. */
-	for (i = 0; i < dlm_local_count; i++) {
-		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
-		make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
-
-		result = add_bind_addr(&localaddr, addr_len, num);
-		if (result)
-			goto create_delsock;
-		++num;
-	}
-
-	result = sock->ops->listen(sock, 5);
-	if (result < 0) {
-		log_print("Can't set socket listening");
-		goto create_delsock;
-	}
-
-	return 0;
-
-create_delsock:
-	sock_release(sock);
-	sctp_con.sock = NULL;
-out:
-	return result;
-}
-
-
-static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
-{
-	struct writequeue_entry *entry;
-
-	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
-	if (!entry)
-		return NULL;
-
-	entry->page = alloc_page(allocation);
-	if (!entry->page) {
-		kfree(entry);
-		return NULL;
-	}
-
-	entry->offset = 0;
-	entry->len = 0;
-	entry->end = 0;
-	entry->users = 0;
-
-	return entry;
-}
-
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
-{
-	struct writequeue_entry *e;
-	int offset = 0;
-	int users = 0;
-	struct nodeinfo *ni;
-
-	ni = nodeid2nodeinfo(nodeid, allocation);
-	if (!ni)
-		return NULL;
-
-	spin_lock(&ni->writequeue_lock);
-	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
-	if ((&e->list == &ni->writequeue) ||
-	    (PAGE_CACHE_SIZE - e->end < len)) {
-		e = NULL;
-	} else {
-		offset = e->end;
-		e->end += len;
-		users = e->users++;
-	}
-	spin_unlock(&ni->writequeue_lock);
-
-	if (e) {
-	got_one:
-		if (users == 0)
-			kmap(e->page);
-		*ppc = page_address(e->page) + offset;
-		return e;
-	}
-
-	e = new_writequeue_entry(allocation);
-	if (e) {
-		spin_lock(&ni->writequeue_lock);
-		offset = e->end;
-		e->end += len;
-		e->ni = ni;
-		users = e->users++;
-		list_add_tail(&e->list, &ni->writequeue);
-		spin_unlock(&ni->writequeue_lock);
-		goto got_one;
-	}
-	return NULL;
-}
-
-void dlm_lowcomms_commit_buffer(void *arg)
-{
-	struct writequeue_entry *e = (struct writequeue_entry *) arg;
-	int users;
-	struct nodeinfo *ni = e->ni;
-
-	spin_lock(&ni->writequeue_lock);
-	users = --e->users;
-	if (users)
-		goto out;
-	e->len = e->end - e->offset;
-	kunmap(e->page);
-	spin_unlock(&ni->writequeue_lock);
-
-	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-		spin_lock_bh(&write_nodes_lock);
-		list_add_tail(&ni->write_list, &write_nodes);
-		spin_unlock_bh(&write_nodes_lock);
-
-		queue_work(send_workqueue, &ni->swork);
-	}
-	return;
-
-out:
-	spin_unlock(&ni->writequeue_lock);
-	return;
-}
-
-static void free_entry(struct writequeue_entry *e)
-{
-	__free_page(e->page);
-	kfree(e);
-}
-
-/* Initiate an SCTP association. In theory we could just use sendmsg() on
-   the first IP address and it should work, but this allows us to set up the
-   association before sending any valuable data that we can't afford to lose.
-   It also keeps the send path clean as it can now always use the association ID */
-static void initiate_association(int nodeid)
-{
-	struct sockaddr_storage rem_addr;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-	int addrlen;
-	char buf[1];
-	struct kvec iov[1];
-	struct nodeinfo *ni;
-
-	log_print("Initiating association with node %d", nodeid);
-
-	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-	if (!ni)
-		return;
-
-	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
-		log_print("no address for nodeid %d", nodeid);
-		return;
-	}
-
-	make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
-
-	outmessage.msg_name = &rem_addr;
-	outmessage.msg_namelen = addrlen;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	iov[0].iov_base = buf;
-	iov[0].iov_len = 1;
-
-	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
-	   we can afford to lose */
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
-	if (ret < 0) {
-		log_print("send INIT to node failed: %d", ret);
-		/* Try again later */
-		clear_bit(NI_INIT_PENDING, &ni->flags);
-	}
-}
-
-/* Send a message */
-static void send_to_sock(struct nodeinfo *ni)
-{
-	int ret = 0;
-	struct writequeue_entry *e;
-	int len, offset;
-	struct msghdr outmsg;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	struct kvec iov;
-
-	/* See if we need to init an association before we start
-	   sending precious messages */
-	spin_lock(&ni->lock);
-	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-		return;
-	}
-	spin_unlock(&ni->lock);
-
-	outmsg.msg_name = NULL; /* We use assoc_id */
-	outmsg.msg_namelen = 0;
-	outmsg.msg_control = outcmsg;
-	outmsg.msg_controllen = sizeof(outcmsg);
-	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmsg);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-	sinfo->sinfo_assoc_id = ni->assoc_id;
-	outmsg.msg_controllen = cmsg->cmsg_len;
-
-	spin_lock(&ni->writequeue_lock);
-	for (;;) {
-		if (list_empty(&ni->writequeue))
-			break;
-		e = list_entry(ni->writequeue.next, struct writequeue_entry,
-			       list);
-		len = e->len;
-		offset = e->offset;
-		BUG_ON(len == 0 && e->users == 0);
-		spin_unlock(&ni->writequeue_lock);
-		kmap(e->page);
-
-		ret = 0;
-		if (len) {
-			iov.iov_base = page_address(e->page)+offset;
-			iov.iov_len = len;
-
-			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
-					     len);
-			if (ret == -EAGAIN) {
-				sctp_con.eagain_flag = 1;
-				goto out;
-			} else if (ret < 0)
-				goto send_error;
-		} else {
-			/* Don't starve people filling buffers */
-			cond_resched();
-		}
-
-		spin_lock(&ni->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			kunmap(e->page);
-			free_entry(e);
-			continue;
-		}
-	}
-	spin_unlock(&ni->writequeue_lock);
-out:
-	return;
-
-send_error:
-	log_print("Error sending to node %d %d", ni->nodeid, ret);
-	spin_lock(&ni->lock);
-	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		ni->assoc_id = 0;
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-	} else
-		spin_unlock(&ni->lock);
-
-	return;
-}
-
-/* Try to send any messages that are pending */
-static void process_output_queue(void)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock_bh(&write_nodes_lock);
-	list_for_each_safe(list, temp, &write_nodes) {
-		struct nodeinfo *ni =
-			list_entry(list, struct nodeinfo, write_list);
-		clear_bit(NI_WRITE_PENDING, &ni->flags);
-		list_del(&ni->write_list);
-
-		spin_unlock_bh(&write_nodes_lock);
-
-		send_to_sock(ni);
-		spin_lock_bh(&write_nodes_lock);
-	}
-	spin_unlock_bh(&write_nodes_lock);
-}
-
-/* Called after we've had -EAGAIN and been woken up */
-static void refill_write_queue(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-
-		if (ni) {
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-			}
-		}
-	}
-}
-
-static void clean_one_writequeue(struct nodeinfo *ni)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock(&ni->writequeue_lock);
-	list_for_each_safe(list, temp, &ni->writequeue) {
-		struct writequeue_entry *e =
-			list_entry(list, struct writequeue_entry, list);
-		list_del(&e->list);
-		free_entry(e);
-	}
-	spin_unlock(&ni->writequeue_lock);
-}
-
-static void clean_writequeues(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni)
-			clean_one_writequeue(ni);
-	}
-}
-
-
-static void dealloc_nodeinfo(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni) {
-			idr_remove(&nodeinfo_idr, i);
-			kfree(ni);
-		}
-	}
-}
-
-int dlm_lowcomms_close(int nodeid)
-{
-	struct nodeinfo *ni;
-
-	ni = nodeid2nodeinfo(nodeid, 0);
-	if (!ni)
-		return -1;
-
-	spin_lock(&ni->lock);
-	if (ni->assoc_id) {
-		ni->assoc_id = 0;
-		/* Don't send shutdown here, sctp will just queue it
-		   till the node comes back up! */
-	}
-	spin_unlock(&ni->lock);
-
-	clean_one_writequeue(ni);
-	clear_bit(NI_INIT_PENDING, &ni->flags);
-	return 0;
-}
-
-// PJC: The work queue function for receiving.
-static void process_recv_sockets(struct work_struct *work)
-{
-	if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
-		int ret;
-		int count = 0;
-
-		do {
-			ret = receive_from_sock();
-
-			/* Don't starve out everyone else */
-			if (++count >= MAX_RX_MSG_COUNT) {
-				cond_resched();
-				count = 0;
-			}
-		} while (!kthread_should_stop() && ret >=0);
-	}
-	cond_resched();
-}
-
-// PJC: the work queue function for sending
-static void process_send_sockets(struct work_struct *work)
-{
-	if (sctp_con.eagain_flag) {
-		sctp_con.eagain_flag = 0;
-		refill_write_queue();
-	}
-	process_output_queue();
-}
-
-// PJC: Process lock requests from a particular node.
-// TODO: can we optimise this out on UP ??
-static void process_lock_request(struct work_struct *work)
-{
-}
-
-static void daemons_stop(void)
-{
-	destroy_workqueue(recv_workqueue);
-	destroy_workqueue(send_workqueue);
-	destroy_workqueue(lock_workqueue);
-}
-
-static int daemons_start(void)
-{
-	int error;
-	recv_workqueue = create_workqueue("dlm_recv");
-	error = IS_ERR(recv_workqueue);
-	if (error) {
-		log_print("can't start dlm_recv %d", error);
-		return error;
-	}
-
-	send_workqueue = create_singlethread_workqueue("dlm_send");
-	error = IS_ERR(send_workqueue);
-	if (error) {
-		log_print("can't start dlm_send %d", error);
-		destroy_workqueue(recv_workqueue);
-		return error;
-	}
-
-	lock_workqueue = create_workqueue("dlm_rlock");
-	error = IS_ERR(lock_workqueue);
-	if (error) {
-		log_print("can't start dlm_rlock %d", error);
-		destroy_workqueue(send_workqueue);
-		destroy_workqueue(recv_workqueue);
-		return error;
-	}
-
-	return 0;
-}
-
-/*
- * This is quite likely to sleep...
- */
-int dlm_lowcomms_start(void)
-{
-	int error;
-
-	INIT_WORK(&sctp_con.work, process_recv_sockets);
-
-	error = init_sock();
-	if (error)
-		goto fail_sock;
-	error = daemons_start();
-	if (error)
-		goto fail_sock;
-	return 0;
-
-fail_sock:
-	close_connection();
-	return error;
-}
-
-void dlm_lowcomms_stop(void)
-{
-	int i;
-
-	sctp_con.flags = 0x7;
-	daemons_stop();
-	clean_writequeues();
-	close_connection();
-	dealloc_nodeinfo();
-	max_nodeid = 0;
-
-	dlm_local_count = 0;
-	dlm_local_nodeid = 0;
-
-	for (i = 0; i < dlm_local_count; i++)
-		kfree(dlm_local_addr[i]);
-}
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms.c
index 07e0a122c32..27970a58d29 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms.c
@@ -36,30 +36,36 @@
  * of high load. Also, this way, the sending thread can collect together
  * messages bound for one node and send them in one block.
  *
- * I don't see any problem with the recv thread executing the locking
- * code on behalf of remote processes as the locking code is
- * short, efficient and never waits.
+ * lowcomms will choose to use wither TCP or SCTP as its transport layer
+ * depending on the configuration variable 'protocol'. This should be set
+ * to 0 (default) for TCP or 1 for SCTP. It shouldbe configured using a
+ * cluster-wide mechanism as it must be the same on all nodes of the cluster
+ * for the DLM to function.
  *
  */
 
-
 #include <asm/ioctls.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/pagemap.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/sctp.h>
+#include <net/sctp/user.h>
 
 #include "dlm_internal.h"
 #include "lowcomms.h"
 #include "midcomms.h"
 #include "config.h"
 
+#define NEEDED_RMEM (4*1024*1024)
+
 struct cbuf {
 	unsigned int base;
 	unsigned int len;
 	unsigned int mask;
 };
 
-#define NODE_INCREMENT 32
 static void cbuf_add(struct cbuf *cb, int n)
 {
 	cb->len += n;
@@ -88,28 +94,25 @@ static bool cbuf_empty(struct cbuf *cb)
 	return cb->len == 0;
 }
 
-/* Maximum number of incoming messages to process before
-   doing a cond_resched()
-*/
-#define MAX_RX_MSG_COUNT 25
-
 struct connection {
 	struct socket *sock;	/* NULL if not connected */
 	uint32_t nodeid;	/* So we know who we are in the list */
 	struct mutex sock_mutex;
-	unsigned long flags;	/* bit 1,2 = We are on the read/write lists */
+	unsigned long flags;
 #define CF_READ_PENDING 1
 #define CF_WRITE_PENDING 2
 #define CF_CONNECT_PENDING 3
-#define CF_IS_OTHERCON 4
+#define CF_INIT_PENDING 4
+#define CF_IS_OTHERCON 5
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
-	struct list_head listenlist;  /* List of allocated listening sockets */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
+	void (*connect_action) (struct connection *);	/* What to do to connect */
 	struct page *rx_page;
 	struct cbuf cb;
 	int retries;
 #define MAX_CONNECT_RETRIES 3
+	int sctp_assoc;
 	struct connection *othercon;
 	struct work_struct rwork; /* Receive workqueue */
 	struct work_struct swork; /* Send workqueue */
@@ -127,68 +130,136 @@ struct writequeue_entry {
 	struct connection *con;
 };
 
-static struct sockaddr_storage dlm_local_addr;
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int dlm_local_count;
 
 /* Work queues */
 static struct workqueue_struct *recv_workqueue;
 static struct workqueue_struct *send_workqueue;
 
-/* An array of pointers to connections, indexed by NODEID */
-static struct connection **connections;
+static DEFINE_IDR(connections_idr);
 static DECLARE_MUTEX(connections_lock);
+static int max_nodeid;
 static struct kmem_cache *con_cache;
-static int conn_array_size;
 
 static void process_recv_sockets(struct work_struct *work);
 static void process_send_sockets(struct work_struct *work);
 
-static struct connection *nodeid2con(int nodeid, gfp_t allocation)
+/*
+ * If 'allocation' is zero then we don't attempt to create a new
+ * connection structure for this node.
+ */
+static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
 {
 	struct connection *con = NULL;
+	int r;
+	int n;
 
-	down(&connections_lock);
-	if (nodeid >= conn_array_size) {
-		int new_size = nodeid + NODE_INCREMENT;
-		struct connection **new_conns;
+	con = idr_find(&connections_idr, nodeid);
+	if (con || !alloc)
+		return con;
 
-		new_conns = kzalloc(sizeof(struct connection *) *
-				    new_size, allocation);
-		if (!new_conns)
-			goto finish;
+	r = idr_pre_get(&connections_idr, alloc);
+	if (!r)
+		return NULL;
+
+	con = kmem_cache_zalloc(con_cache, alloc);
+	if (!con)
+		return NULL;
 
-		memcpy(new_conns, connections,  sizeof(struct connection *) * conn_array_size);
-		conn_array_size = new_size;
-		kfree(connections);
-		connections = new_conns;
+	r = idr_get_new_above(&connections_idr, con, nodeid, &n);
+	if (r) {
+		kmem_cache_free(con_cache, con);
+		return NULL;
+	}
 
+	if (n != nodeid) {
+		idr_remove(&connections_idr, n);
+		kmem_cache_free(con_cache, con);
+		return NULL;
 	}
 
-	con = connections[nodeid];
-	if (con == NULL && allocation) {
-		con = kmem_cache_zalloc(con_cache, allocation);
-		if (!con)
-			goto finish;
+	con->nodeid = nodeid;
+	mutex_init(&con->sock_mutex);
+	INIT_LIST_HEAD(&con->writequeue);
+	spin_lock_init(&con->writequeue_lock);
+	INIT_WORK(&con->swork, process_send_sockets);
+	INIT_WORK(&con->rwork, process_recv_sockets);
 
-		con->nodeid = nodeid;
-		mutex_init(&con->sock_mutex);
-		INIT_LIST_HEAD(&con->writequeue);
-		spin_lock_init(&con->writequeue_lock);
-		INIT_WORK(&con->swork, process_send_sockets);
-		INIT_WORK(&con->rwork, process_recv_sockets);
+	/* Setup action pointers for child sockets */
+	if (con->nodeid) {
+		struct connection *zerocon = idr_find(&connections_idr, 0);
 
-		connections[nodeid] = con;
+		con->connect_action = zerocon->connect_action;
+		if (!con->rx_action)
+			con->rx_action = zerocon->rx_action;
 	}
 
-finish:
+	if (nodeid > max_nodeid)
+		max_nodeid = nodeid;
+
+	return con;
+}
+
+static struct connection *nodeid2con(int nodeid, gfp_t allocation)
+{
+	struct connection *con;
+
+	down(&connections_lock);
+	con = __nodeid2con(nodeid, allocation);
 	up(&connections_lock);
+
 	return con;
 }
 
+/* This is a bit drastic, but only called when things go wrong */
+static struct connection *assoc2con(int assoc_id)
+{
+	int i;
+	struct connection *con;
+
+	down(&connections_lock);
+	for (i=0; i<=max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (con && con->sctp_assoc == assoc_id) {
+			up(&connections_lock);
+			return con;
+		}
+	}
+	up(&connections_lock);
+	return NULL;
+}
+
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+	struct sockaddr_storage addr;
+	int error;
+
+	if (!dlm_local_count)
+		return -1;
+
+	error = dlm_nodeid_to_addr(nodeid, &addr);
+	if (error)
+		return error;
+
+	if (dlm_local_addr[0]->ss_family == AF_INET) {
+		struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
+		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
+		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+	} else {
+		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
+		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
+		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
+		       sizeof(in6->sin6_addr));
+	}
+
+	return 0;
+}
+
 /* Data available on socket or listen socket received a connect */
 static void lowcomms_data_ready(struct sock *sk, int count_unused)
 {
 	struct connection *con = sock2con(sk);
-
 	if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
 		queue_work(recv_workqueue, &con->rwork);
 }
@@ -222,20 +293,21 @@ static int add_sock(struct socket *sock, struct connection *con)
 	con->sock->sk->sk_data_ready = lowcomms_data_ready;
 	con->sock->sk->sk_write_space = lowcomms_write_space;
 	con->sock->sk->sk_state_change = lowcomms_state_change;
-
+	con->sock->sk->sk_user_data = con;
 	return 0;
 }
 
-/* Add the port number to an IP6 or 4 sockaddr and return the address
+/* Add the port number to an IPv6 or 4 sockaddr and return the address
    length */
 static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 			  int *addr_len)
 {
-	saddr->ss_family =  dlm_local_addr.ss_family;
+	saddr->ss_family =  dlm_local_addr[0]->ss_family;
 	if (saddr->ss_family == AF_INET) {
 		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
 		in4_addr->sin_port = cpu_to_be16(port);
 		*addr_len = sizeof(struct sockaddr_in);
+		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
 	} else {
 		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
 		in6_addr->sin6_port = cpu_to_be16(port);
@@ -264,6 +336,193 @@ static void close_connection(struct connection *con, bool and_other)
 	mutex_unlock(&con->sock_mutex);
 }
 
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void sctp_send_shutdown(sctp_assoc_t associd)
+{
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+	struct connection *con;
+
+	con = nodeid2con(0,0);
+	BUG_ON(con == NULL);
+
+	outmessage.msg_name = NULL;
+	outmessage.msg_namelen = 0;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo = CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+	sinfo->sinfo_flags |= MSG_EOF;
+	sinfo->sinfo_assoc_id = associd;
+
+	ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0);
+
+	if (ret != 0)
+		log_print("send EOF to node failed: %d", ret);
+}
+
+/* INIT failed but we don't know which node...
+   restart INIT on all pending nodes */
+static void sctp_init_failed(void)
+{
+	int i;
+	struct connection *con;
+
+	down(&connections_lock);
+	for (i=1; i<=max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (!con)
+			continue;
+		con->sctp_assoc = 0;
+		if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+			if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
+				queue_work(send_workqueue, &con->swork);
+			}
+		}
+	}
+	up(&connections_lock);
+}
+
+/* Something happened to an association */
+static void process_sctp_notification(struct connection *con,
+				      struct msghdr *msg, char *buf)
+{
+	union sctp_notification *sn = (union sctp_notification *)buf;
+
+	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+		switch (sn->sn_assoc_change.sac_state) {
+
+		case SCTP_COMM_UP:
+		case SCTP_RESTART:
+		{
+			/* Check that the new node is in the lockspace */
+			struct sctp_prim prim;
+			int nodeid;
+			int prim_len, ret;
+			int addr_len;
+			struct connection *new_con;
+			struct file *file;
+			sctp_peeloff_arg_t parg;
+			int parglen = sizeof(parg);
+
+			/*
+			 * We get this before any data for an association.
+			 * We verify that the node is in the cluster and
+			 * then peel off a socket for it.
+			 */
+			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+				log_print("COMM_UP for invalid assoc ID %d",
+					 (int)sn->sn_assoc_change.sac_assoc_id);
+				sctp_init_failed();
+				return;
+			}
+			memset(&prim, 0, sizeof(struct sctp_prim));
+			prim_len = sizeof(struct sctp_prim);
+			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+			ret = kernel_getsockopt(con->sock,
+						IPPROTO_SCTP,
+						SCTP_PRIMARY_ADDR,
+						(char*)&prim,
+						&prim_len);
+			if (ret < 0) {
+				log_print("getsockopt/sctp_primary_addr on "
+					  "new assoc %d failed : %d",
+					  (int)sn->sn_assoc_change.sac_assoc_id,
+					  ret);
+
+				/* Retry INIT later */
+				new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
+				if (new_con)
+					clear_bit(CF_CONNECT_PENDING, &con->flags);
+				return;
+			}
+			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+				int i;
+				unsigned char *b=(unsigned char *)&prim.ssp_addr;
+				log_print("reject connect from unknown addr");
+				for (i=0; i<sizeof(struct sockaddr_storage);i++)
+					printk("%02x ", b[i]);
+				printk("\n");
+				sctp_send_shutdown(prim.ssp_assoc_id);
+				return;
+			}
+
+			new_con = nodeid2con(nodeid, GFP_KERNEL);
+			if (!new_con)
+				return;
+
+			/* Peel off a new sock */
+			parg.associd = sn->sn_assoc_change.sac_assoc_id;
+			ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
+						SCTP_SOCKOPT_PEELOFF,
+						(void *)&parg, &parglen);
+			if (ret) {
+				log_print("Can't peel off a socket for "
+					  "connection %d to node %d: err=%d\n",
+					  parg.associd, nodeid, ret);
+			}
+			file = fget(parg.sd);
+			new_con->sock = SOCKET_I(file->f_dentry->d_inode);
+			add_sock(new_con->sock, new_con);
+			fput(file);
+			put_unused_fd(parg.sd);
+
+			log_print("got new/restarted association %d nodeid %d",
+				 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+
+			/* Send any pending writes */
+			clear_bit(CF_CONNECT_PENDING, &new_con->flags);
+			clear_bit(CF_INIT_PENDING, &con->flags);
+			if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
+				queue_work(send_workqueue, &new_con->swork);
+			}
+			if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags))
+				queue_work(recv_workqueue, &new_con->rwork);
+		}
+		break;
+
+		case SCTP_COMM_LOST:
+		case SCTP_SHUTDOWN_COMP:
+		{
+			con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
+			if (con) {
+				con->sctp_assoc = 0;
+			}
+		}
+		break;
+
+		/* We don't know which INIT failed, so clear the PENDING flags
+		 * on them all.  if assoc_id is zero then it will then try
+		 * again */
+
+		case SCTP_CANT_STR_ASSOC:
+		{
+			log_print("Can't start SCTP association - retrying");
+			sctp_init_failed();
+		}
+		break;
+
+		default:
+			log_print("unexpected SCTP assoc change id=%d state=%d",
+				  (int)sn->sn_assoc_change.sac_assoc_id,
+				  sn->sn_assoc_change.sac_state);
+		}
+	}
+}
+
 /* Data received from remote end */
 static int receive_from_sock(struct connection *con)
 {
@@ -274,6 +533,7 @@ static int receive_from_sock(struct connection *con)
 	int r;
 	int call_again_soon = 0;
 	int nvec;
+	char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
 
 	mutex_lock(&con->sock_mutex);
 
@@ -293,12 +553,18 @@ static int receive_from_sock(struct connection *con)
 		cbuf_init(&con->cb, PAGE_CACHE_SIZE);
 	}
 
+	/* Only SCTP needs these really */
+	memset(&incmsg, 0, sizeof(incmsg));
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+
 	/*
 	 * iov[0] is the bit of the circular buffer between the current end
 	 * point (cb.base + cb.len) and the end of the buffer.
 	 */
 	iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
 	iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
+	iov[1].iov_len = 0;
 	nvec = 1;
 
 	/*
@@ -315,11 +581,20 @@ static int receive_from_sock(struct connection *con)
 
 	r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len,
 			       MSG_DONTWAIT | MSG_NOSIGNAL);
-
 	if (ret <= 0)
 		goto out_close;
-	if (ret == -EAGAIN)
-		goto out_resched;
+
+	/* Process SCTP notifications */
+	if (msg.msg_flags & MSG_NOTIFICATION) {
+		msg.msg_control = incmsg;
+		msg.msg_controllen = sizeof(incmsg);
+
+		process_sctp_notification(con, &msg,
+				page_address(con->rx_page) + con->cb.base);
+		mutex_unlock(&con->sock_mutex);
+		return 0;
+	}
+	BUG_ON(con->nodeid == 0);
 
 	if (ret == len)
 		call_again_soon = 1;
@@ -329,10 +604,10 @@ static int receive_from_sock(struct connection *con)
 					  con->cb.base, con->cb.len,
 					  PAGE_CACHE_SIZE);
 	if (ret == -EBADMSG) {
-		printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
-		       "iov_len=%u, iov_base[0]=%p, read=%d\n",
-		       page_address(con->rx_page), con->cb.base, con->cb.len,
-		       len, iov[0].iov_base, r);
+		log_print("lowcomms: addr=%p, base=%u, len=%u, "
+			  "iov_len=%u, iov_base[0]=%p, read=%d",
+			  page_address(con->rx_page), con->cb.base, con->cb.len,
+			  len, iov[0].iov_base, r);
 	}
 	if (ret < 0)
 		goto out_close;
@@ -368,7 +643,7 @@ out_close:
 }
 
 /* Listening socket is busy, accept a connection */
-static int accept_from_sock(struct connection *con)
+static int tcp_accept_from_sock(struct connection *con)
 {
 	int result;
 	struct sockaddr_storage peeraddr;
@@ -379,7 +654,7 @@ static int accept_from_sock(struct connection *con)
 	struct connection *addcon;
 
 	memset(&peeraddr, 0, sizeof(peeraddr));
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
 				  IPPROTO_TCP, &newsock);
 	if (result < 0)
 		return -ENOMEM;
@@ -408,7 +683,7 @@ static int accept_from_sock(struct connection *con)
 	/* Get the new node's NODEID */
 	make_sockaddr(&peeraddr, 0, &len);
 	if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
-		printk("dlm: connect from non cluster node\n");
+		log_print("connect from non cluster node");
 		sock_release(newsock);
 		mutex_unlock(&con->sock_mutex);
 		return -1;
@@ -419,7 +694,6 @@ static int accept_from_sock(struct connection *con)
 	/*  Check to see if we already have a connection to this node. This
 	 *  could happen if the two nodes initiate a connection at roughly
 	 *  the same time and the connections cross on the wire.
-	 * TEMPORARY FIX:
 	 *  In this case we store the incoming one in "othercon"
 	 */
 	newcon = nodeid2con(nodeid, GFP_KERNEL);
@@ -434,7 +708,7 @@ static int accept_from_sock(struct connection *con)
 		if (!othercon) {
 			othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
 			if (!othercon) {
-				printk("dlm: failed to allocate incoming socket\n");
+				log_print("failed to allocate incoming socket");
 				mutex_unlock(&newcon->sock_mutex);
 				result = -ENOMEM;
 				goto accept_err;
@@ -477,12 +751,107 @@ accept_err:
 	sock_release(newsock);
 
 	if (result != -EAGAIN)
-		printk("dlm: error accepting connection from node: %d\n", result);
+		log_print("error accepting connection from node: %d", result);
 	return result;
 }
 
+static void free_entry(struct writequeue_entry *e)
+{
+	__free_page(e->page);
+	kfree(e);
+}
+
+/* Initiate an SCTP association.
+   This is a special case of send_to_sock() in that we don't yet have a
+   peeled-off socket for this association, so we use the listening socket
+   and add the primary IP address of the remote node.
+ */
+static void sctp_init_assoc(struct connection *con)
+{
+	struct sockaddr_storage rem_addr;
+	char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	struct connection *base_con;
+	struct writequeue_entry *e;
+	int len, offset;
+	int ret;
+	int addrlen;
+	struct kvec iov[1];
+
+	if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
+		return;
+
+	if (con->retries++ > MAX_CONNECT_RETRIES)
+		return;
+
+	log_print("Initiating association with node %d", con->nodeid);
+
+	if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
+		log_print("no address for nodeid %d", con->nodeid);
+		return;
+	}
+	base_con = nodeid2con(0, 0);
+	BUG_ON(base_con == NULL);
+
+	make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
+
+	outmessage.msg_name = &rem_addr;
+	outmessage.msg_namelen = addrlen;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	spin_lock(&con->writequeue_lock);
+	e = list_entry(con->writequeue.next, struct writequeue_entry,
+		       list);
+
+	BUG_ON((struct list_head *) e == &con->writequeue);
+
+	len = e->len;
+	offset = e->offset;
+	spin_unlock(&con->writequeue_lock);
+	kmap(e->page);
+
+	/* Send the first block off the write queue */
+	iov[0].iov_base = page_address(e->page)+offset;
+	iov[0].iov_len = len;
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+	outmessage.msg_controllen = cmsg->cmsg_len;
+
+	ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
+	if (ret < 0) {
+		log_print("Send first packet to node %d failed: %d",
+			  con->nodeid, ret);
+
+		/* Try again later */
+		clear_bit(CF_CONNECT_PENDING, &con->flags);
+		clear_bit(CF_INIT_PENDING, &con->flags);
+	}
+	else {
+		spin_lock(&con->writequeue_lock);
+		e->offset += ret;
+		e->len -= ret;
+
+		if (e->len == 0 && e->users == 0) {
+			list_del(&e->list);
+			kunmap(e->page);
+			free_entry(e);
+		}
+		spin_unlock(&con->writequeue_lock);
+	}
+}
+
 /* Connect a new socket to its peer */
-static void connect_to_sock(struct connection *con)
+static void tcp_connect_to_sock(struct connection *con)
 {
 	int result = -EHOSTUNREACH;
 	struct sockaddr_storage saddr;
@@ -505,7 +874,7 @@ static void connect_to_sock(struct connection *con)
 	}
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
 				  IPPROTO_TCP, &sock);
 	if (result < 0)
 		goto out_err;
@@ -516,11 +885,11 @@ static void connect_to_sock(struct connection *con)
 
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
+	con->connect_action = tcp_connect_to_sock;
+	add_sock(sock, con);
 
 	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
-	add_sock(sock, con);
-
 	log_print("connecting to %d", con->nodeid);
 	result =
 		sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
@@ -550,64 +919,57 @@ out:
 	return;
 }
 
-static struct socket *create_listen_sock(struct connection *con,
-					 struct sockaddr_storage *saddr)
+static struct socket *tcp_create_listen_sock(struct connection *con,
+					     struct sockaddr_storage *saddr)
 {
 	struct socket *sock = NULL;
-	mm_segment_t fs;
 	int result = 0;
 	int one = 1;
 	int addr_len;
 
-	if (dlm_local_addr.ss_family == AF_INET)
+	if (dlm_local_addr[0]->ss_family == AF_INET)
 		addr_len = sizeof(struct sockaddr_in);
 	else
 		addr_len = sizeof(struct sockaddr_in6);
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
+				  IPPROTO_TCP, &sock);
 	if (result < 0) {
-		printk("dlm: Can't create listening comms socket\n");
+		log_print("Can't create listening comms socket");
 		goto create_out;
 	}
 
-	fs = get_fs();
-	set_fs(get_ds());
-	result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-				 (char *)&one, sizeof(one));
-	set_fs(fs);
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+				   (char *)&one, sizeof(one));
+
 	if (result < 0) {
-		printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",
-		       result);
+		log_print("Failed to set SO_REUSEADDR on socket: %d", result);
 	}
 	sock->sk->sk_user_data = con;
-	con->rx_action = accept_from_sock;
+	con->rx_action = tcp_accept_from_sock;
+	con->connect_action = tcp_connect_to_sock;
 	con->sock = sock;
 
 	/* Bind to our port */
 	make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
 	result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
 	if (result < 0) {
-		printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port);
+		log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
 		sock_release(sock);
 		sock = NULL;
 		con->sock = NULL;
 		goto create_out;
 	}
-
-	fs = get_fs();
-	set_fs(get_ds());
-
-	result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 				 (char *)&one, sizeof(one));
-	set_fs(fs);
 	if (result < 0) {
-		printk("dlm: Set keepalive failed: %d\n", result);
+		log_print("Set keepalive failed: %d", result);
 	}
 
 	result = sock->ops->listen(sock, 5);
 	if (result < 0) {
-		printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port);
+		log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
 		sock_release(sock);
 		sock = NULL;
 		goto create_out;
@@ -617,18 +979,146 @@ create_out:
 	return sock;
 }
 
+/* Get local addresses */
+static void init_local(void)
+{
+	struct sockaddr_storage sas, *addr;
+	int i;
+
+	dlm_local_count = 0;
+	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+		if (dlm_our_addr(&sas, i))
+			break;
+
+		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+		if (!addr)
+			break;
+		memcpy(addr, &sas, sizeof(*addr));
+		dlm_local_addr[dlm_local_count++] = addr;
+	}
+}
+
+/* Bind to an IP address. SCTP allows multiple address so it can do
+   multi-homing */
+static int add_sctp_bind_addr(struct connection *sctp_con,
+			      struct sockaddr_storage *addr,
+			      int addr_len, int num)
+{
+	int result = 0;
+
+	if (num == 1)
+		result = kernel_bind(sctp_con->sock,
+				     (struct sockaddr *) addr,
+				     addr_len);
+	else
+		result = kernel_setsockopt(sctp_con->sock, SOL_SCTP,
+					   SCTP_SOCKOPT_BINDX_ADD,
+					   (char *)addr, addr_len);
+
+	if (result < 0)
+		log_print("Can't bind to port %d addr number %d",
+			  dlm_config.ci_tcp_port, num);
+
+	return result;
+}
 
-/* Listen on all interfaces */
-static int listen_for_all(void)
+/* Initialise SCTP socket and bind to all interfaces */
+static int sctp_listen_for_all(void)
+{
+	struct socket *sock = NULL;
+	struct sockaddr_storage localaddr;
+	struct sctp_event_subscribe subscribe;
+	int result = -EINVAL, num = 1, i, addr_len;
+	struct connection *con = nodeid2con(0, GFP_KERNEL);
+	int bufsize = NEEDED_RMEM;
+
+	if (!con)
+		return -ENOMEM;
+
+	log_print("Using SCTP for communications");
+
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+				  IPPROTO_SCTP, &sock);
+	if (result < 0) {
+		log_print("Can't create comms socket, check SCTP is loaded");
+		goto out;
+	}
+
+	/* Listen for events */
+	memset(&subscribe, 0, sizeof(subscribe));
+	subscribe.sctp_data_io_event = 1;
+	subscribe.sctp_association_event = 1;
+	subscribe.sctp_send_failure_event = 1;
+	subscribe.sctp_shutdown_event = 1;
+	subscribe.sctp_partial_delivery_event = 1;
+
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+				 (char *)&bufsize, sizeof(bufsize));
+	if (result)
+		log_print("Error increasing buffer space on socket %d", result);
+
+	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+				   (char *)&subscribe, sizeof(subscribe));
+	if (result < 0) {
+		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+			  result);
+		goto create_delsock;
+	}
+
+	/* Init con struct */
+	sock->sk->sk_user_data = con;
+	con->sock = sock;
+	con->sock->sk->sk_data_ready = lowcomms_data_ready;
+	con->rx_action = receive_from_sock;
+	con->connect_action = sctp_init_assoc;
+
+	/* Bind to all interfaces. */
+	for (i = 0; i < dlm_local_count; i++) {
+		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+		make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
+
+		result = add_sctp_bind_addr(con, &localaddr, addr_len, num);
+		if (result)
+			goto create_delsock;
+		++num;
+	}
+
+	result = sock->ops->listen(sock, 5);
+	if (result < 0) {
+		log_print("Can't set socket listening");
+		goto create_delsock;
+	}
+
+	return 0;
+
+create_delsock:
+	sock_release(sock);
+	con->sock = NULL;
+out:
+	return result;
+}
+
+static int tcp_listen_for_all(void)
 {
 	struct socket *sock = NULL;
 	struct connection *con = nodeid2con(0, GFP_KERNEL);
 	int result = -EINVAL;
 
+	if (!con)
+		return -ENOMEM;
+
 	/* We don't support multi-homed hosts */
+	if (dlm_local_addr[1] != NULL) {
+		log_print("TCP protocol can't handle multi-homed hosts, "
+			  "try SCTP");
+		return -EINVAL;
+	}
+
+	log_print("Using TCP for communications");
+
 	set_bit(CF_IS_OTHERCON, &con->flags);
 
-	sock = create_listen_sock(con, &dlm_local_addr);
+	sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
 	if (sock) {
 		add_sock(sock, con);
 		result = 0;
@@ -666,8 +1156,7 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con,
 	return entry;
 }
 
-void *dlm_lowcomms_get_buffer(int nodeid, int len,
-			      gfp_t allocation, char **ppc)
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 {
 	struct connection *con;
 	struct writequeue_entry *e;
@@ -735,12 +1224,6 @@ out:
 	return;
 }
 
-static void free_entry(struct writequeue_entry *e)
-{
-	__free_page(e->page);
-	kfree(e);
-}
-
 /* Send a message */
 static void send_to_sock(struct connection *con)
 {
@@ -777,8 +1260,7 @@ static void send_to_sock(struct connection *con)
 				goto out;
 			if (ret <= 0)
 				goto send_error;
-		}
-		else {
+		} else {
 			/* Don't starve people filling buffers */
 			cond_resched();
 		}
@@ -807,7 +1289,8 @@ send_error:
 
 out_connect:
 	mutex_unlock(&con->sock_mutex);
-	connect_to_sock(con);
+	if (!test_bit(CF_INIT_PENDING, &con->flags))
+		lowcomms_connect_sock(con);
 	return;
 }
 
@@ -832,9 +1315,6 @@ int dlm_lowcomms_close(int nodeid)
 {
 	struct connection *con;
 
-	if (!connections)
-		goto out;
-
 	log_print("closing connection to node %d", nodeid);
 	con = nodeid2con(nodeid, 0);
 	if (con) {
@@ -842,12 +1322,9 @@ int dlm_lowcomms_close(int nodeid)
 		close_connection(con, true);
 	}
 	return 0;
-
-out:
-	return -1;
 }
 
-/* Look for activity on active sockets */
+/* Receive workqueue function */
 static void process_recv_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, rwork);
@@ -859,15 +1336,14 @@ static void process_recv_sockets(struct work_struct *work)
 	} while (!err);
 }
 
-
+/* Send workqueue function */
 static void process_send_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, swork);
 
 	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
-		connect_to_sock(con);
+		con->connect_action(con);
 	}
-
 	clear_bit(CF_WRITE_PENDING, &con->flags);
 	send_to_sock(con);
 }
@@ -878,8 +1354,8 @@ static void clean_writequeues(void)
 {
 	int nodeid;
 
-	for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
-		struct connection *con = nodeid2con(nodeid, 0);
+	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+		struct connection *con = __nodeid2con(nodeid, 0);
 
 		if (con)
 			clean_one_writequeue(con);
@@ -916,64 +1392,67 @@ static int work_start(void)
 void dlm_lowcomms_stop(void)
 {
 	int i;
+	struct connection *con;
 
 	/* Set all the flags to prevent any
 	   socket activity.
 	*/
-	for (i = 0; i < conn_array_size; i++) {
-		if (connections[i])
-			connections[i]->flags |= 0xFF;
+	down(&connections_lock);
+	for (i = 0; i <= max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (con)
+			con->flags |= 0xFF;
 	}
+	up(&connections_lock);
 
 	work_stop();
+
+	down(&connections_lock);
 	clean_writequeues();
 
-	for (i = 0; i < conn_array_size; i++) {
-		if (connections[i]) {
-			close_connection(connections[i], true);
-			if (connections[i]->othercon)
-				kmem_cache_free(con_cache, connections[i]->othercon);
-			kmem_cache_free(con_cache, connections[i]);
+	for (i = 0; i <= max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (con) {
+			close_connection(con, true);
+			if (con->othercon)
+				kmem_cache_free(con_cache, con->othercon);
+			kmem_cache_free(con_cache, con);
 		}
 	}
-
-	kfree(connections);
-	connections = NULL;
-
+	max_nodeid = 0;
+	up(&connections_lock);
 	kmem_cache_destroy(con_cache);
+	idr_init(&connections_idr);
 }
 
-/* This is quite likely to sleep... */
 int dlm_lowcomms_start(void)
 {
-	int error = 0;
-
-	error = -ENOMEM;
-	connections = kzalloc(sizeof(struct connection *) *
-			      NODE_INCREMENT, GFP_KERNEL);
-	if (!connections)
-		goto out;
-
-	conn_array_size = NODE_INCREMENT;
+	int error = -EINVAL;
+	struct connection *con;
 
-	if (dlm_our_addr(&dlm_local_addr, 0)) {
+	init_local();
+	if (!dlm_local_count) {
+		error = -ENOTCONN;
 		log_print("no local IP address has been set");
-		goto fail_free_conn;
-	}
-	if (!dlm_our_addr(&dlm_local_addr, 1)) {
-		log_print("This dlm comms module does not support multi-homed clustering");
-		goto fail_free_conn;
+		goto out;
 	}
 
+	error = -ENOMEM;
 	con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
 				      __alignof__(struct connection), 0,
 				      NULL, NULL);
 	if (!con_cache)
-		goto fail_free_conn;
+		goto out;
 
+	/* Set some sysctl minima */
+	if (sysctl_rmem_max < NEEDED_RMEM)
+		sysctl_rmem_max = NEEDED_RMEM;
 
 	/* Start listening */
-	error = listen_for_all();
+	if (dlm_config.ci_protocol == 0)
+		error = tcp_listen_for_all();
+	else
+		error = sctp_listen_for_all();
 	if (error)
 		goto fail_unlisten;
 
@@ -984,24 +1463,13 @@ int dlm_lowcomms_start(void)
 	return 0;
 
 fail_unlisten:
-	close_connection(connections[0], false);
-	kmem_cache_free(con_cache, connections[0]);
+	con = nodeid2con(0,0);
+	if (con) {
+		close_connection(con, false);
+		kmem_cache_free(con_cache, con);
+	}
 	kmem_cache_destroy(con_cache);
 
-fail_free_conn:
-	kfree(connections);
-
 out:
 	return error;
 }
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 3870150b83a..b0201ec325a 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@ struct dlm_write_request32 {
 	union  {
 		struct dlm_lock_params32 lock;
 		struct dlm_lspace_params lspace;
+		struct dlm_purge_params purge;
 	} i;
 };
 
@@ -92,6 +93,9 @@ static void compat_input(struct dlm_write_request *kb,
 		kb->i.lspace.flags = kb32->i.lspace.flags;
 		kb->i.lspace.minor = kb32->i.lspace.minor;
 		strcpy(kb->i.lspace.name, kb32->i.lspace.name);
+	} else if (kb->cmd == DLM_USER_PURGE) {
+		kb->i.purge.nodeid = kb32->i.purge.nodeid;
+		kb->i.purge.pid = kb32->i.purge.pid;
 	} else {
 		kb->i.lock.mode = kb32->i.lock.mode;
 		kb->i.lock.namelen = kb32->i.lock.namelen;
@@ -111,8 +115,6 @@ static void compat_input(struct dlm_write_request *kb,
 static void compat_output(struct dlm_lock_result *res,
 			  struct dlm_lock_result32 *res32)
 {
-	res32->length = res->length - (sizeof(struct dlm_lock_result) -
-				       sizeof(struct dlm_lock_result32));
 	res32->user_astaddr = (__u32)(long)res->user_astaddr;
 	res32->user_astparam = (__u32)(long)res->user_astparam;
 	res32->user_lksb = (__u32)(long)res->user_lksb;
@@ -128,35 +130,30 @@ static void compat_output(struct dlm_lock_result *res,
 }
 #endif
 
+/* we could possibly check if the cancel of an orphan has resulted in the lkb
+   being removed and then remove that lkb from the orphans list and free it */
 
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
 	struct dlm_user_proc *proc;
-	int remove_ownqueue = 0;
+	int eol = 0, ast_type;
 
-	/* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
-	   lkb before dealing with it.  We need to check this
-	   flag before taking ls_clear_proc_locks mutex because if
-	   it's set, dlm_clear_proc_locks() holds the mutex. */
-
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
-		/* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
 		return;
-	}
 
 	ls = lkb->lkb_resource->res_ls;
 	mutex_lock(&ls->ls_clear_proc_locks);
 
 	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
 	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
-	   lkb->ua so we can't try to use it. */
+	   lkb->ua so we can't try to use it.  This second check is necessary
+	   for cases where a completion ast is received for an operation that
+	   began before clear_proc_locks did its cancel/unlock. */
 
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
-		/* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
 		goto out;
-	}
 
 	DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
 	ua = (struct dlm_user_args *)lkb->lkb_astparam;
@@ -166,28 +163,42 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 		goto out;
 
 	spin_lock(&proc->asts_spin);
-	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+
+	ast_type = lkb->lkb_ast_type;
+	lkb->lkb_ast_type |= type;
+
+	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
-		lkb->lkb_ast_type |= type;
 		wake_up_interruptible(&proc->wait);
 	}
-
-	/* noqueue requests that fail may need to be removed from the
-	   proc's locks list, there should be a better way of detecting
-	   this situation than checking all these things... */
-
-	if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
-	    ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
-		remove_ownqueue = 1;
-
-	/* unlocks or cancels of waiting requests need to be removed from the
-	   proc's unlocking list, again there must be a better way...  */
-
-	if (ua->lksb.sb_status == -DLM_EUNLOCK ||
+	if (type == AST_COMP && (ast_type & AST_COMP))
+		log_debug(ls, "ast overlap %x status %x %x",
+			  lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
+
+	/* Figure out if this lock is at the end of its life and no longer
+	   available for the application to use.  The lkb still exists until
+	   the final ast is read.  A lock becomes EOL in three situations:
+	     1. a noqueue request fails with EAGAIN
+	     2. an unlock completes with EUNLOCK
+	     3. a cancel of a waiting request completes with ECANCEL
+	   An EOL lock needs to be removed from the process's list of locks.
+	   And we can't allow any new operation on an EOL lock.  This is
+	   not related to the lifetime of the lkb struct which is managed
+	   entirely by refcount. */
+
+	if (type == AST_COMP &&
+	    lkb->lkb_grmode == DLM_LOCK_IV &&
+	    ua->lksb.sb_status == -EAGAIN)
+		eol = 1;
+	else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
 	    (ua->lksb.sb_status == -DLM_ECANCEL &&
 	     lkb->lkb_grmode == DLM_LOCK_IV))
-		remove_ownqueue = 1;
+		eol = 1;
+	if (eol) {
+		lkb->lkb_ast_type &= ~AST_BAST;
+		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
+	}
 
 	/* We want to copy the lvb to userspace when the completion
 	   ast is read if the status is 0, the lock has an lvb and
@@ -204,11 +215,13 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 
 	spin_unlock(&proc->asts_spin);
 
-	if (remove_ownqueue) {
+	if (eol) {
 		spin_lock(&ua->proc->locks_spin);
-		list_del_init(&lkb->lkb_ownqueue);
+		if (!list_empty(&lkb->lkb_ownqueue)) {
+			list_del_init(&lkb->lkb_ownqueue);
+			dlm_put_lkb(lkb);
+		}
 		spin_unlock(&ua->proc->locks_spin);
-		dlm_put_lkb(lkb);
 	}
  out:
 	mutex_unlock(&ls->ls_clear_proc_locks);
@@ -286,47 +299,71 @@ static int device_user_unlock(struct dlm_user_proc *proc,
 	return error;
 }
 
-static int device_create_lockspace(struct dlm_lspace_params *params)
+static int create_misc_device(struct dlm_ls *ls, char *name)
 {
-	dlm_lockspace_t *lockspace;
-	struct dlm_ls *ls;
 	int error, len;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	error = dlm_new_lockspace(params->name, strlen(params->name),
-				  &lockspace, 0, DLM_USER_LVB_LEN);
-	if (error)
-		return error;
-
-	ls = dlm_find_lockspace_local(lockspace);
-	if (!ls)
-		return -ENOENT;
-
 	error = -ENOMEM;
-	len = strlen(params->name) + strlen(name_prefix) + 2;
+	len = strlen(name) + strlen(name_prefix) + 2;
 	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
 	if (!ls->ls_device.name)
 		goto fail;
+
 	snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
-		 params->name);
+		 name);
 	ls->ls_device.fops = &device_fops;
 	ls->ls_device.minor = MISC_DYNAMIC_MINOR;
 
 	error = misc_register(&ls->ls_device);
 	if (error) {
 		kfree(ls->ls_device.name);
-		goto fail;
 	}
+fail:
+	return error;
+}
+
+static int device_user_purge(struct dlm_user_proc *proc,
+			     struct dlm_purge_params *params)
+{
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = dlm_user_purge(ls, proc, params->nodeid, params->pid);
 
-	error = ls->ls_device.minor;
 	dlm_put_lockspace(ls);
 	return error;
+}
+
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error;
 
- fail:
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	error = dlm_new_lockspace(params->name, strlen(params->name),
+				  &lockspace, 0, DLM_USER_LVB_LEN);
+	if (error)
+		return error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = create_misc_device(ls, params->name);
 	dlm_put_lockspace(ls);
-	dlm_release_lockspace(lockspace, 0);
+
+	if (error)
+		dlm_release_lockspace(lockspace, 0);
+	else
+		error = ls->ls_device.minor;
+
 	return error;
 }
 
@@ -343,6 +380,10 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 	if (!ls)
 		return -ENOENT;
 
+	/* Deregister the misc device first, so we don't have
+	 * a device that's not attached to a lockspace. If
+	 * dlm_release_lockspace fails then we can recreate it
+	 */
 	error = misc_deregister(&ls->ls_device);
 	if (error) {
 		dlm_put_lockspace(ls);
@@ -361,6 +402,8 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 
 	dlm_put_lockspace(ls);
 	error = dlm_release_lockspace(lockspace, force);
+	if (error)
+		create_misc_device(ls, ls->ls_name);
  out:
 	return error;
 }
@@ -497,6 +540,14 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 		error = device_remove_lockspace(&kbuf->i.lspace);
 		break;
 
+	case DLM_USER_PURGE:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_sig;
+		}
+		error = device_user_purge(proc, &kbuf->i.purge);
+		break;
+
 	default:
 		log_print("Unknown command passed to DLM device : %d\n",
 			  kbuf->cmd);
diff --git a/fs/dquot.c b/fs/dquot.c
index b16f991662c..0a5febc159f 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1432,7 +1432,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 			mutex_unlock(&dqopt->dqonoff_mutex);
 		}
 	if (sb->s_bdev)
-		invalidate_bdev(sb->s_bdev, 0);
+		invalidate_bdev(sb->s_bdev);
 	return 0;
 }
 
@@ -1468,7 +1468,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	 * we see all the changes from userspace... */
 	write_inode_now(inode, 1);
 	/* And now flush the block cache so that kernel sees the changes */
-	invalidate_bdev(sb->s_bdev, 0);
+	invalidate_bdev(sb->s_bdev);
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
 	if (sb_has_quota_enabled(sb, type)) {
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fc4a3a22464..8cbf3f69ebe 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -583,8 +583,7 @@ inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
 
@@ -793,7 +792,7 @@ static int do_sysfs_registration(void)
 		       "Unable to register ecryptfs sysfs subsystem\n");
 		goto out;
 	}
-	rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+	rc = sysfs_create_file(&ecryptfs_subsys.kobj,
 			       &sysfs_attr_version.attr);
 	if (rc) {
 		printk(KERN_ERR
@@ -801,12 +800,12 @@ static int do_sysfs_registration(void)
 		subsystem_unregister(&ecryptfs_subsys);
 		goto out;
 	}
-	rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+	rc = sysfs_create_file(&ecryptfs_subsys.kobj,
 			       &sysfs_attr_version_str.attr);
 	if (rc) {
 		printk(KERN_ERR
 		       "Unable to create ecryptfs version_str attribute\n");
-		sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+		sysfs_remove_file(&ecryptfs_subsys.kobj,
 				  &sysfs_attr_version.attr);
 		subsystem_unregister(&ecryptfs_subsys);
 		goto out;
@@ -841,7 +840,7 @@ static int __init ecryptfs_init(void)
 		ecryptfs_free_kmem_caches();
 		goto out;
 	}
-	kset_set_kset_s(&ecryptfs_subsys, fs_subsys);
+	kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
 	sysfs_attr_version.attr.owner = THIS_MODULE;
 	sysfs_attr_version_str.attr.owner = THIS_MODULE;
 	rc = do_sysfs_registration();
@@ -862,9 +861,9 @@ out:
 
 static void __exit ecryptfs_exit(void)
 {
-	sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+	sysfs_remove_file(&ecryptfs_subsys.kobj,
 			  &sysfs_attr_version.attr);
-	sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+	sysfs_remove_file(&ecryptfs_subsys.kobj,
 			  &sysfs_attr_version_str.attr);
 	subsystem_unregister(&ecryptfs_subsys);
 	ecryptfs_release_messaging(ecryptfs_transport);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b731b09499c..0770c4b66f5 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -46,7 +46,6 @@ struct kmem_cache *ecryptfs_lower_page_cache;
  */
 static struct page *ecryptfs_get1page(struct file *file, int index)
 {
-	struct page *page;
 	struct dentry *dentry;
 	struct inode *inode;
 	struct address_space *mapping;
@@ -54,14 +53,7 @@ static struct page *ecryptfs_get1page(struct file *file, int index)
 	dentry = file->f_path.dentry;
 	inode = dentry->d_inode;
 	mapping = inode->i_mapping;
-	page = read_cache_page(mapping, index,
-			       (filler_t *)mapping->a_ops->readpage,
-			       (void *)file);
-	if (IS_ERR(page))
-		goto out;
-	wait_on_page_locked(page);
-out:
-	return page;
+	return read_mapping_page(mapping, index, (void *)file);
 }
 
 static
@@ -233,7 +225,6 @@ int ecryptfs_do_readpage(struct file *file, struct page *page,
 		ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
 		goto out;
 	}
-	wait_on_page_locked(lower_page);
 	page_data = kmap_atomic(page, KM_USER0);
 	lower_page_data = kmap_atomic(lower_page, KM_USER1);
 	memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c2235e46edc..ba7a8b9da0c 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -72,8 +72,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index e89bfc8cf95..1d1e7e30d70 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -161,10 +161,7 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n)
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			ext2_check_page(page);
 		if (PageError(page))
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a046a419d8a..685a1c28717 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -160,8 +160,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		rwlock_init(&ei->i_meta_lock);
 #ifdef CONFIG_EXT2_FS_XATTR
 		init_rwsem(&ei->xattr_sem);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4a4fcd6868c..54d3c904125 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -420,7 +420,7 @@ static void ext3_put_super (struct super_block * sb)
 		dump_orphan_list(sb, sbi);
 	J_ASSERT(list_empty(&sbi->s_orphan));
 
-	invalidate_bdev(sb->s_bdev, 0);
+	invalidate_bdev(sb->s_bdev);
 	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
 		/*
 		 * Invalidate the journal device's buffers.  We don't want them
@@ -428,7 +428,7 @@ static void ext3_put_super (struct super_block * sb)
 		 * hotswapped, and it breaks the `ro-after' testing code.
 		 */
 		sync_blockdev(sbi->journal_bdev);
-		invalidate_bdev(sbi->journal_bdev, 0);
+		invalidate_bdev(sbi->journal_bdev);
 		ext3_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
@@ -466,8 +466,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		INIT_LIST_HEAD(&ei->i_orphan);
 #ifdef CONFIG_EXT3_FS_XATTR
 		init_rwsem(&ei->xattr_sem);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61c4718e4a5..71912693235 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -470,7 +470,7 @@ static void ext4_put_super (struct super_block * sb)
 		dump_orphan_list(sb, sbi);
 	J_ASSERT(list_empty(&sbi->s_orphan));
 
-	invalidate_bdev(sb->s_bdev, 0);
+	invalidate_bdev(sb->s_bdev);
 	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
 		/*
 		 * Invalidate the journal device's buffers.  We don't want them
@@ -478,7 +478,7 @@ static void ext4_put_super (struct super_block * sb)
 		 * hotswapped, and it breaks the `ro-after' testing code.
 		 */
 		sync_blockdev(sbi->journal_bdev);
-		invalidate_bdev(sbi->journal_bdev, 0);
+		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
@@ -517,8 +517,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		INIT_LIST_HEAD(&ei->i_orphan);
 #ifdef CONFIG_EXT4DEV_FS_XATTR
 		init_rwsem(&ei->xattr_sem);
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 05c2941c74f..1959143c1d2 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -40,8 +40,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct fat_cache *cache = (struct fat_cache *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		INIT_LIST_HEAD(&cache->cache_list);
 }
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9bfe607c892..65cb54bde48 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -499,8 +499,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		spin_lock_init(&ei->cache_lru_lock);
 		ei->nr_caches = 0;
 		ei->cache_valid_id = FAT_CACHE_VALID + 1;
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index decac62efe5..ed8f0b0dd88 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -74,10 +74,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
 	pp = read_mapping_page(mapping, n, NULL);
 
 	if (!IS_ERR(pp)) {
-		wait_on_page_locked(pp);
 		kmap(pp);
-		if (!PageUptodate(pp))
-			goto fail;
 		/** if (!PageChecked(pp)) **/
 			/** vxfs_check_page(pp); **/
 		if (PageError(pp))
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2fd06927e85..acfad65a6e8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -738,8 +738,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (cmd == F_GETLK) {
 		if (fc->no_lock) {
-			if (!posix_test_lock(file, fl, fl))
-				fl->fl_type = F_UNLCK;
+			posix_test_lock(file, fl);
 			err = 0;
 		} else
 			err = fuse_getlk(file, fl);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 608db81219a..d8003be56e0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -685,8 +685,7 @@ static void fuse_inode_init_once(void *foo, struct kmem_cache *cachep,
 {
 	struct inode * inode = foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(inode);
 }
 
@@ -731,12 +730,12 @@ static int fuse_sysfs_init(void)
 {
 	int err;
 
-	kset_set_kset_s(&fuse_subsys, fs_subsys);
+	kobj_set_kset_s(&fuse_subsys, fs_subsys);
 	err = subsystem_register(&fuse_subsys);
 	if (err)
 		goto out_err;
 
-	kset_set_kset_s(&connections_subsys, fuse_subsys);
+	kobj_set_kset_s(&connections_subsys, fuse_subsys);
 	err = subsystem_register(&connections_subsys);
 	if (err)
 		goto out_fuse_unregister;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 82a1ac7895a..a96fa07b3f3 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1262,9 +1262,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 			      u64 leaf_no)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *bh;
 	struct gfs2_leaf *lf;
-	unsigned entries = 0;
+	unsigned entries = 0, entries2 = 0;
 	unsigned leaves = 0;
 	const struct gfs2_dirent **darr, *dent;
 	struct dirent_gather g;
@@ -1290,7 +1291,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		return 0;
 
 	error = -ENOMEM;
-	larr = vmalloc((leaves + entries) * sizeof(void *));
+	/*
+	 * The extra 99 entries are not normally used, but are a buffer
+	 * zone in case the number of entries in the leaf is corrupt.
+	 * 99 is the maximum number of entries that can fit in a single
+	 * leaf block.
+	 */
+	larr = vmalloc((leaves + entries + 99) * sizeof(void *));
 	if (!larr)
 		goto out;
 	darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1305,10 +1312,20 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
+			entries2 += be16_to_cpu(lf->lf_entries);
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 						gfs2_dirent_gather, NULL, &g);
 			error = PTR_ERR(dent);
-			if (IS_ERR(dent)) {
+			if (IS_ERR(dent))
+				goto out_kfree;
+			if (entries2 != g.offset) {
+				fs_warn(sdp, "Number of entries corrupt in dir "
+						"leaf %llu, entries2 (%u) != "
+						"g.offset (%u)\n",
+					(unsigned long long)bh->b_blocknr,
+					entries2, g.offset);
+					
+				error = -EIO;
 				goto out_kfree;
 			}
 			error = 0;
@@ -1318,6 +1335,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		}
 	} while(lfn);
 
+	BUG_ON(entries2 != entries);
 	error = do_filldir_main(ip, offset, opaque, filldir, darr,
 				entries, copied);
 out_kfree:
@@ -1401,6 +1419,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 		  filldir_t filldir)
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct dirent_gather g;
 	const struct gfs2_dirent **darr, *dent;
 	struct buffer_head *dibh;
@@ -1423,8 +1442,8 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 		return error;
 
 	error = -ENOMEM;
-	darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
-		       GFP_KERNEL);
+	/* 96 is max number of dirents which can be stuffed into an inode */
+	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
 	if (darr) {
 		g.pdent = darr;
 		g.offset = 0;
@@ -1434,6 +1453,15 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 			error = PTR_ERR(dent);
 			goto out;
 		}
+		if (dip->i_di.di_entries != g.offset) {
+			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
+				"ip->i_di.di_entries (%u) != g.offset (%u)\n",
+				(unsigned long long)dip->i_num.no_addr,
+				dip->i_di.di_entries,
+				g.offset);
+			error = -EIO;
+			goto out;
+		}
 		error = do_filldir_main(dip, offset, opaque, filldir, darr,
 					dip->i_di.di_entries, &copied);
 out:
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 12accb08fe0..1815429a297 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -23,6 +23,10 @@
 #include <linux/module.h>
 #include <linux/rwsem.h>
 #include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -40,20 +44,30 @@ struct gfs2_gl_hash_bucket {
         struct hlist_head hb_list;
 };
 
+struct glock_iter {
+	int hash;                     /* hash bucket index         */
+	struct gfs2_sbd *sdp;         /* incore superblock         */
+	struct gfs2_glock *gl;        /* current glock struct      */
+	struct hlist_head *hb_list;   /* current hash bucket ptr   */
+	struct seq_file *seq;         /* sequence file for debugfs */
+	char string[512];             /* scratch space             */
+};
+
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 
 static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct gfs2_glock *gl);
-static int dump_inode(struct gfs2_inode *ip);
-static void gfs2_glock_xmote_th(struct gfs2_holder *gh);
+static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
+static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
 static void gfs2_glock_drop_th(struct gfs2_glock *gl);
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
+static struct dentry *gfs2_root;
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
 #define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
 
 static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+static struct dentry *gfs2_root;
 
 /*
  * Despite what you might think, the numbers below are not arbitrary :-)
@@ -202,7 +216,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 		gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
 		gfs2_assert(sdp, list_empty(&gl->gl_holders));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
-		gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
 		glock_free(gl);
 		rv = 1;
@@ -303,7 +316,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	atomic_set(&gl->gl_ref, 1);
 	gl->gl_state = LM_ST_UNLOCKED;
 	gl->gl_hash = hash;
-	gl->gl_owner = NULL;
+	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
 	gl->gl_ops = glops;
 	gl->gl_req_gh = NULL;
@@ -367,7 +380,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
 	INIT_LIST_HEAD(&gh->gh_list);
 	gh->gh_gl = gl;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
-	gh->gh_owner = current;
+	gh->gh_owner_pid = current->pid;
 	gh->gh_state = state;
 	gh->gh_flags = flags;
 	gh->gh_error = 0;
@@ -389,7 +402,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 {
 	gh->gh_state = state;
 	gh->gh_flags = flags;
-	gh->gh_iflags &= 1 << HIF_ALLOCED;
+	gh->gh_iflags = 0;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
 }
 
@@ -406,54 +419,8 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
 	gh->gh_ip = 0;
 }
 
-/**
- * gfs2_holder_get - get a struct gfs2_holder structure
- * @gl: the glock
- * @state: the state we're requesting
- * @flags: the modifier flags
- * @gfp_flags:
- *
- * Figure out how big an impact this function has.  Either:
- * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
- * 2) Leave it like it is
- *
- * Returns: the holder structure, NULL on ENOMEM
- */
-
-static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
-					   unsigned int state,
-					   int flags, gfp_t gfp_flags)
-{
-	struct gfs2_holder *gh;
-
-	gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
-	if (!gh)
-		return NULL;
-
-	gfs2_holder_init(gl, state, flags, gh);
-	set_bit(HIF_ALLOCED, &gh->gh_iflags);
-	gh->gh_ip = (unsigned long)__builtin_return_address(0);
-	return gh;
-}
-
-/**
- * gfs2_holder_put - get rid of a struct gfs2_holder structure
- * @gh: the holder structure
- *
- */
-
-static void gfs2_holder_put(struct gfs2_holder *gh)
+static void gfs2_holder_wake(struct gfs2_holder *gh)
 {
-	gfs2_holder_uninit(gh);
-	kfree(gh);
-}
-
-static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh)
-{
-	if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) {
-		gfs2_holder_put(gh);
-		return;
-	}
 	clear_bit(HIF_WAIT, &gh->gh_iflags);
 	smp_mb();
 	wake_up_bit(&gh->gh_iflags, HIF_WAIT);
@@ -519,7 +486,7 @@ static int rq_promote(struct gfs2_holder *gh)
 				gfs2_reclaim_glock(sdp);
 			}
 
-			gfs2_glock_xmote_th(gh);
+			gfs2_glock_xmote_th(gh->gh_gl, gh);
 			spin_lock(&gl->gl_spin);
 		}
 		return 1;
@@ -542,7 +509,7 @@ static int rq_promote(struct gfs2_holder *gh)
 	gh->gh_error = 0;
 	set_bit(HIF_HOLDER, &gh->gh_iflags);
 
-	gfs2_holder_dispose_or_wake(gh);
+	gfs2_holder_wake(gh);
 
 	return 0;
 }
@@ -554,32 +521,24 @@ static int rq_promote(struct gfs2_holder *gh)
  * Returns: 1 if the queue is blocked
  */
 
-static int rq_demote(struct gfs2_holder *gh)
+static int rq_demote(struct gfs2_glock *gl)
 {
-	struct gfs2_glock *gl = gh->gh_gl;
-
 	if (!list_empty(&gl->gl_holders))
 		return 1;
 
-	if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
-		list_del_init(&gh->gh_list);
-		gh->gh_error = 0;
-		spin_unlock(&gl->gl_spin);
-		gfs2_holder_dispose_or_wake(gh);
-		spin_lock(&gl->gl_spin);
-	} else {
-		gl->gl_req_gh = gh;
-		set_bit(GLF_LOCK, &gl->gl_flags);
-		spin_unlock(&gl->gl_spin);
-
-		if (gh->gh_state == LM_ST_UNLOCKED ||
-		    gl->gl_state != LM_ST_EXCLUSIVE)
-			gfs2_glock_drop_th(gl);
-		else
-			gfs2_glock_xmote_th(gh);
-
-		spin_lock(&gl->gl_spin);
+	if (gl->gl_state == gl->gl_demote_state ||
+	    gl->gl_state == LM_ST_UNLOCKED) {
+		clear_bit(GLF_DEMOTE, &gl->gl_flags);
+		return 0;
 	}
+	set_bit(GLF_LOCK, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+	if (gl->gl_demote_state == LM_ST_UNLOCKED ||
+	    gl->gl_state != LM_ST_EXCLUSIVE)
+		gfs2_glock_drop_th(gl);
+	else
+		gfs2_glock_xmote_th(gl, NULL);
+	spin_lock(&gl->gl_spin);
 
 	return 0;
 }
@@ -607,16 +566,8 @@ static void run_queue(struct gfs2_glock *gl)
 			else
 				gfs2_assert_warn(gl->gl_sbd, 0);
 
-		} else if (!list_empty(&gl->gl_waiters2) &&
-			   !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
-			gh = list_entry(gl->gl_waiters2.next,
-					struct gfs2_holder, gh_list);
-
-			if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
-				blocked = rq_demote(gh);
-			else
-				gfs2_assert_warn(gl->gl_sbd, 0);
-
+		} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+			blocked = rq_demote(gl);
 		} else if (!list_empty(&gl->gl_waiters3)) {
 			gh = list_entry(gl->gl_waiters3.next,
 					struct gfs2_holder, gh_list);
@@ -654,7 +605,7 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 		list_add_tail(&gh.gh_list, &gl->gl_waiters1);
 	} else {
-		gl->gl_owner = current;
+		gl->gl_owner_pid = current->pid;
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 		clear_bit(HIF_WAIT, &gh.gh_iflags);
 		smp_mb();
@@ -681,7 +632,7 @@ static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 		acquired = 0;
 	} else {
-		gl->gl_owner = current;
+		gl->gl_owner_pid = current->pid;
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 	}
 	spin_unlock(&gl->gl_spin);
@@ -699,7 +650,7 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
 {
 	spin_lock(&gl->gl_spin);
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	gl->gl_owner = NULL;
+	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
 	run_queue(gl);
 	BUG_ON(!spin_is_locked(&gl->gl_spin));
@@ -707,50 +658,24 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
 }
 
 /**
- * handle_callback - add a demote request to a lock's queue
+ * handle_callback - process a demote request
  * @gl: the glock
  * @state: the state the caller wants us to change to
  *
- * Note: This may fail sliently if we are out of memory.
+ * There are only two requests that we are going to see in actual
+ * practise: LM_ST_SHARED and LM_ST_UNLOCKED
  */
 
 static void handle_callback(struct gfs2_glock *gl, unsigned int state)
 {
-	struct gfs2_holder *gh, *new_gh = NULL;
-
-restart:
 	spin_lock(&gl->gl_spin);
-
-	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
-		if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
-		    gl->gl_req_gh != gh) {
-			if (gh->gh_state != state)
-				gh->gh_state = LM_ST_UNLOCKED;
-			goto out;
-		}
-	}
-
-	if (new_gh) {
-		list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
-		new_gh = NULL;
-	} else {
-		spin_unlock(&gl->gl_spin);
-
-		new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_NOFS);
-		if (!new_gh)
-			return;
-		set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
-		set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
-		set_bit(HIF_WAIT, &new_gh->gh_iflags);
-
-		goto restart;
+	if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
+		gl->gl_demote_state = state;
+		gl->gl_demote_time = jiffies;
+	} else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
+		gl->gl_demote_state = state;
 	}
-
-out:
 	spin_unlock(&gl->gl_spin);
-
-	if (new_gh)
-		gfs2_holder_put(new_gh);
 }
 
 /**
@@ -810,56 +735,37 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 
 	/*  Deal with each possible exit condition  */
 
-	if (!gh)
+	if (!gh) {
 		gl->gl_stamp = jiffies;
-	else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		if (ret & LM_OUT_CANCELED)
+			op_done = 0;
+		else
+			clear_bit(GLF_DEMOTE, &gl->gl_flags);
+	} else {
 		spin_lock(&gl->gl_spin);
 		list_del_init(&gh->gh_list);
 		gh->gh_error = -EIO;
-		spin_unlock(&gl->gl_spin);
-	} else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		if (gl->gl_state == gh->gh_state ||
-		    gl->gl_state == LM_ST_UNLOCKED) {
+		if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 
+			goto out;
+		gh->gh_error = GLR_CANCELED;
+		if (ret & LM_OUT_CANCELED) 
+			goto out;
+		if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+			list_add_tail(&gh->gh_list, &gl->gl_holders);
 			gh->gh_error = 0;
-		} else {
-			if (gfs2_assert_warn(sdp, gh->gh_flags &
-					(LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
-				fs_warn(sdp, "ret = 0x%.8X\n", ret);
-			gh->gh_error = GLR_TRYFAILED;
+			set_bit(HIF_HOLDER, &gh->gh_iflags);
+			set_bit(HIF_FIRST, &gh->gh_iflags);
+			op_done = 0;
+			goto out;
 		}
-		spin_unlock(&gl->gl_spin);
-
-		if (ret & LM_OUT_CANCELED)
-			handle_callback(gl, LM_ST_UNLOCKED);
-
-	} else if (ret & LM_OUT_CANCELED) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		gh->gh_error = GLR_CANCELED;
-		spin_unlock(&gl->gl_spin);
-
-	} else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
-		spin_lock(&gl->gl_spin);
-		list_move_tail(&gh->gh_list, &gl->gl_holders);
-		gh->gh_error = 0;
-		set_bit(HIF_HOLDER, &gh->gh_iflags);
-		spin_unlock(&gl->gl_spin);
-
-		set_bit(HIF_FIRST, &gh->gh_iflags);
-
-		op_done = 0;
-
-	} else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
 		gh->gh_error = GLR_TRYFAILED;
-		spin_unlock(&gl->gl_spin);
-
-	} else {
+		if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+			goto out;
+		gh->gh_error = -EINVAL;
 		if (gfs2_assert_withdraw(sdp, 0) == -1)
 			fs_err(sdp, "ret = 0x%.8X\n", ret);
+out:
+		spin_unlock(&gl->gl_spin);
 	}
 
 	if (glops->go_xmote_bh)
@@ -877,7 +783,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_glock_put(gl);
 
 	if (gh)
-		gfs2_holder_dispose_or_wake(gh);
+		gfs2_holder_wake(gh);
 }
 
 /**
@@ -888,12 +794,11 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
  *
  */
 
-void gfs2_glock_xmote_th(struct gfs2_holder *gh)
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
-	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	int flags = gh->gh_flags;
-	unsigned state = gh->gh_state;
+	int flags = gh ? gh->gh_flags : 0;
+	unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
 				 LM_FLAG_NOEXP | LM_FLAG_ANY |
@@ -943,6 +848,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_assert_warn(sdp, !ret);
 
 	state_change(gl, LM_ST_UNLOCKED);
+	clear_bit(GLF_DEMOTE, &gl->gl_flags);
 
 	if (glops->go_inval)
 		glops->go_inval(gl, DIO_METADATA);
@@ -964,7 +870,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_glock_put(gl);
 
 	if (gh)
-		gfs2_holder_dispose_or_wake(gh);
+		gfs2_holder_wake(gh);
 }
 
 /**
@@ -1097,18 +1003,32 @@ static int glock_wait_internal(struct gfs2_holder *gh)
 }
 
 static inline struct gfs2_holder *
-find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+find_holder_by_owner(struct list_head *head, pid_t pid)
 {
 	struct gfs2_holder *gh;
 
 	list_for_each_entry(gh, head, gh_list) {
-		if (gh->gh_owner == owner)
+		if (gh->gh_owner_pid == pid)
 			return gh;
 	}
 
 	return NULL;
 }
 
+static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	if (gi) {
+		vsprintf(gi->string, fmt, args);
+		seq_printf(gi->seq, gi->string);
+	}
+	else
+		vprintk(fmt, args);
+	va_end(args);
+}
+
 /**
  * add_to_queue - Add a holder to the wait queue (but look for recursion)
  * @gh: the holder structure to add
@@ -1120,24 +1040,24 @@ static void add_to_queue(struct gfs2_holder *gh)
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_holder *existing;
 
-	BUG_ON(!gh->gh_owner);
+	BUG_ON(!gh->gh_owner_pid);
 	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
 		BUG();
 
-	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid);
 	if (existing) {
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
-		printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
+		printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 				existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
-		printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
+		printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 				gl->gl_name.ln_type, gl->gl_state);
 		BUG();
 	}
 
-	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid);
 	if (existing) {
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
@@ -1267,9 +1187,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		if (glops->go_unlock)
 			glops->go_unlock(gh);
 
-		gl->gl_stamp = jiffies;
-
 		spin_lock(&gl->gl_spin);
+		gl->gl_stamp = jiffies;
 	}
 
 	clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1841,6 +1760,15 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
  *  Diagnostic routines to help debug distributed deadlock
  */
 
+static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
+                              unsigned long address)
+{
+	char buffer[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(buffer, address);
+	print_dbg(gi, fmt, buffer);
+}
+
 /**
  * dump_holder - print information about a glock holder
  * @str: a string naming the type of holder
@@ -1849,31 +1777,37 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int dump_holder(char *str, struct gfs2_holder *gh)
+static int dump_holder(struct glock_iter *gi, char *str,
+		       struct gfs2_holder *gh)
 {
 	unsigned int x;
-	int error = -ENOBUFS;
-
-	printk(KERN_INFO "  %s\n", str);
-	printk(KERN_INFO "    owner = %ld\n",
-		   (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
-	printk(KERN_INFO "    gh_state = %u\n", gh->gh_state);
-	printk(KERN_INFO "    gh_flags =");
+	struct task_struct *gh_owner;
+
+	print_dbg(gi, "  %s\n", str);
+	if (gh->gh_owner_pid) {
+		print_dbg(gi, "    owner = %ld ", (long)gh->gh_owner_pid);
+		gh_owner = find_task_by_pid(gh->gh_owner_pid);
+		if (gh_owner)
+			print_dbg(gi, "(%s)\n", gh_owner->comm);
+		else
+			print_dbg(gi, "(ended)\n");
+	} else
+		print_dbg(gi, "    owner = -1\n");
+	print_dbg(gi, "    gh_state = %u\n", gh->gh_state);
+	print_dbg(gi, "    gh_flags =");
 	for (x = 0; x < 32; x++)
 		if (gh->gh_flags & (1 << x))
-			printk(" %u", x);
-	printk(" \n");
-	printk(KERN_INFO "    error = %d\n", gh->gh_error);
-	printk(KERN_INFO "    gh_iflags =");
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+	print_dbg(gi, "    error = %d\n", gh->gh_error);
+	print_dbg(gi, "    gh_iflags =");
 	for (x = 0; x < 32; x++)
 		if (test_bit(x, &gh->gh_iflags))
-			printk(" %u", x);
-	printk(" \n");
-	print_symbol(KERN_INFO "    initialized at: %s\n", gh->gh_ip);
-
-	error = 0;
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+        gfs2_print_symbol(gi, "    initialized at: %s\n", gh->gh_ip);
 
-	return error;
+	return 0;
 }
 
 /**
@@ -1883,25 +1817,20 @@ static int dump_holder(char *str, struct gfs2_holder *gh)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int dump_inode(struct gfs2_inode *ip)
+static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
 {
 	unsigned int x;
-	int error = -ENOBUFS;
 
-	printk(KERN_INFO "  Inode:\n");
-	printk(KERN_INFO "    num = %llu %llu\n",
-		    (unsigned long long)ip->i_num.no_formal_ino,
-		    (unsigned long long)ip->i_num.no_addr);
-	printk(KERN_INFO "    type = %u\n", IF2DT(ip->i_inode.i_mode));
-	printk(KERN_INFO "    i_flags =");
+	print_dbg(gi, "  Inode:\n");
+	print_dbg(gi, "    num = %llu/%llu\n",
+		    ip->i_num.no_formal_ino, ip->i_num.no_addr);
+	print_dbg(gi, "    type = %u\n", IF2DT(ip->i_inode.i_mode));
+	print_dbg(gi, "    i_flags =");
 	for (x = 0; x < 32; x++)
 		if (test_bit(x, &ip->i_flags))
-			printk(" %u", x);
-	printk(" \n");
-
-	error = 0;
-
-	return error;
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+	return 0;
 }
 
 /**
@@ -1912,74 +1841,86 @@ static int dump_inode(struct gfs2_inode *ip)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int dump_glock(struct gfs2_glock *gl)
+static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
 {
 	struct gfs2_holder *gh;
 	unsigned int x;
 	int error = -ENOBUFS;
+	struct task_struct *gl_owner;
 
 	spin_lock(&gl->gl_spin);
 
-	printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
-	       (unsigned long long)gl->gl_name.ln_number);
-	printk(KERN_INFO "  gl_flags =");
+	print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+		   (unsigned long long)gl->gl_name.ln_number);
+	print_dbg(gi, "  gl_flags =");
 	for (x = 0; x < 32; x++) {
 		if (test_bit(x, &gl->gl_flags))
-			printk(" %u", x);
+			print_dbg(gi, " %u", x);
 	}
-	printk(" \n");
-	printk(KERN_INFO "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
-	printk(KERN_INFO "  gl_state = %u\n", gl->gl_state);
-	printk(KERN_INFO "  gl_owner = %s\n", gl->gl_owner->comm);
-	print_symbol(KERN_INFO "  gl_ip = %s\n", gl->gl_ip);
-	printk(KERN_INFO "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
-	printk(KERN_INFO "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
-	printk(KERN_INFO "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
-	printk(KERN_INFO "  object = %s\n", (gl->gl_object) ? "yes" : "no");
-	printk(KERN_INFO "  le = %s\n",
+	if (!test_bit(GLF_LOCK, &gl->gl_flags))
+		print_dbg(gi, " (unlocked)");
+	print_dbg(gi, " \n");
+	print_dbg(gi, "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
+	print_dbg(gi, "  gl_state = %u\n", gl->gl_state);
+	if (gl->gl_owner_pid) {
+		gl_owner = find_task_by_pid(gl->gl_owner_pid);
+		if (gl_owner)
+			print_dbg(gi, "  gl_owner = pid %d (%s)\n",
+				  gl->gl_owner_pid, gl_owner->comm);
+		else
+			print_dbg(gi, "  gl_owner = %d (ended)\n",
+				  gl->gl_owner_pid);
+	} else
+		print_dbg(gi, "  gl_owner = -1\n");
+	print_dbg(gi, "  gl_ip = %lu\n", gl->gl_ip);
+	print_dbg(gi, "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+	print_dbg(gi, "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+	print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+	print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
+	print_dbg(gi, "  le = %s\n",
 		   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
-	printk(KERN_INFO "  reclaim = %s\n",
-		    (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+	print_dbg(gi, "  reclaim = %s\n",
+		   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
 	if (gl->gl_aspace)
-		printk(KERN_INFO "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
-		       gl->gl_aspace->i_mapping->nrpages);
+		print_dbg(gi, "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
+			   gl->gl_aspace->i_mapping->nrpages);
 	else
-		printk(KERN_INFO "  aspace = no\n");
-	printk(KERN_INFO "  ail = %d\n", atomic_read(&gl->gl_ail_count));
+		print_dbg(gi, "  aspace = no\n");
+	print_dbg(gi, "  ail = %d\n", atomic_read(&gl->gl_ail_count));
 	if (gl->gl_req_gh) {
-		error = dump_holder("Request", gl->gl_req_gh);
+		error = dump_holder(gi, "Request", gl->gl_req_gh);
 		if (error)
 			goto out;
 	}
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		error = dump_holder("Holder", gh);
+		error = dump_holder(gi, "Holder", gh);
 		if (error)
 			goto out;
 	}
 	list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
-		error = dump_holder("Waiter1", gh);
-		if (error)
-			goto out;
-	}
-	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
-		error = dump_holder("Waiter2", gh);
+		error = dump_holder(gi, "Waiter1", gh);
 		if (error)
 			goto out;
 	}
 	list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
-		error = dump_holder("Waiter3", gh);
+		error = dump_holder(gi, "Waiter3", gh);
 		if (error)
 			goto out;
 	}
+	if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+		print_dbg(gi, "  Demotion req to state %u (%llu uS ago)\n",
+			  gl->gl_demote_state,
+			  (u64)(jiffies - gl->gl_demote_time)*(1000000/HZ));
+	}
 	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
 		if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
-		    list_empty(&gl->gl_holders)) {
-			error = dump_inode(gl->gl_object);
+			list_empty(&gl->gl_holders)) {
+			error = dump_inode(gi, gl->gl_object);
 			if (error)
 				goto out;
 		} else {
 			error = -ENOBUFS;
-			printk(KERN_INFO "  Inode: busy\n");
+			print_dbg(gi, "  Inode: busy\n");
 		}
 	}
 
@@ -2014,7 +1955,7 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
 			if (gl->gl_sbd != sdp)
 				continue;
 
-			error = dump_glock(gl);
+			error = dump_glock(NULL, gl);
 			if (error)
 				break;
 		}
@@ -2043,3 +1984,189 @@ int __init gfs2_glock_init(void)
 	return 0;
 }
 
+static int gfs2_glock_iter_next(struct glock_iter *gi)
+{
+	read_lock(gl_lock_addr(gi->hash));
+	while (1) {
+		if (!gi->hb_list) {  /* If we don't have a hash bucket yet */
+			gi->hb_list = &gl_hash_table[gi->hash].hb_list;
+			if (hlist_empty(gi->hb_list)) {
+				read_unlock(gl_lock_addr(gi->hash));
+				gi->hash++;
+				read_lock(gl_lock_addr(gi->hash));
+				gi->hb_list = NULL;
+				if (gi->hash >= GFS2_GL_HASH_SIZE) {
+					read_unlock(gl_lock_addr(gi->hash));
+					return 1;
+				}
+				else
+					continue;
+			}
+			if (!hlist_empty(gi->hb_list)) {
+				gi->gl = list_entry(gi->hb_list->first,
+						    struct gfs2_glock,
+						    gl_list);
+			}
+		} else {
+			if (gi->gl->gl_list.next == NULL) {
+				read_unlock(gl_lock_addr(gi->hash));
+				gi->hash++;
+				read_lock(gl_lock_addr(gi->hash));
+				gi->hb_list = NULL;
+				continue;
+			}
+			gi->gl = list_entry(gi->gl->gl_list.next,
+					    struct gfs2_glock, gl_list);
+		}
+		if (gi->gl)
+			break;
+	}
+	read_unlock(gl_lock_addr(gi->hash));
+	return 0;
+}
+
+static void gfs2_glock_iter_free(struct glock_iter *gi)
+{
+	kfree(gi);
+}
+
+static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
+{
+	struct glock_iter *gi;
+
+	gi = kmalloc(sizeof (*gi), GFP_KERNEL);
+	if (!gi)
+		return NULL;
+
+	gi->sdp = sdp;
+	gi->hash = 0;
+	gi->gl = NULL;
+	gi->hb_list = NULL;
+	gi->seq = NULL;
+	memset(gi->string, 0, sizeof(gi->string));
+
+	if (gfs2_glock_iter_next(gi)) {
+		gfs2_glock_iter_free(gi);
+		return NULL;
+	}
+
+	return gi;
+}
+
+static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct glock_iter *gi;
+	loff_t n = *pos;
+
+	gi = gfs2_glock_iter_init(file->private);
+	if (!gi)
+		return NULL;
+
+	while (n--) {
+		if (gfs2_glock_iter_next(gi)) {
+			gfs2_glock_iter_free(gi);
+			return NULL;
+		}
+	}
+
+	return gi;
+}
+
+static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+				 loff_t *pos)
+{
+	struct glock_iter *gi = iter_ptr;
+
+	(*pos)++;
+
+	if (gfs2_glock_iter_next(gi)) {
+		gfs2_glock_iter_free(gi);
+		return NULL;
+	}
+
+	return gi;
+}
+
+static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct glock_iter *gi = iter_ptr;
+
+	gi->seq = file;
+	dump_glock(gi, gi->gl);
+
+	return 0;
+}
+
+static struct seq_operations gfs2_glock_seq_ops = {
+	.start = gfs2_glock_seq_start,
+	.next  = gfs2_glock_seq_next,
+	.stop  = gfs2_glock_seq_stop,
+	.show  = gfs2_glock_seq_show,
+};
+
+static int gfs2_debugfs_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &gfs2_glock_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static const struct file_operations gfs2_debug_fops = {
+	.owner   = THIS_MODULE,
+	.open    = gfs2_debugfs_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+{
+	sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
+	if (!sdp->debugfs_dir)
+		return -ENOMEM;
+	sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
+							 S_IFREG | S_IRUGO,
+							 sdp->debugfs_dir, sdp,
+							 &gfs2_debug_fops);
+	if (!sdp->debugfs_dentry_glocks)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
+{
+	if (sdp && sdp->debugfs_dir) {
+		if (sdp->debugfs_dentry_glocks) {
+			debugfs_remove(sdp->debugfs_dentry_glocks);
+			sdp->debugfs_dentry_glocks = NULL;
+		}
+		debugfs_remove(sdp->debugfs_dir);
+		sdp->debugfs_dir = NULL;
+	}
+}
+
+int gfs2_register_debugfs(void)
+{
+	gfs2_root = debugfs_create_dir("gfs2", NULL);
+	return gfs2_root ? 0 : -ENOMEM;
+}
+
+void gfs2_unregister_debugfs(void)
+{
+	debugfs_remove(gfs2_root);
+	gfs2_root = NULL;
+}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index f50e40ceca4..11477ca3a3c 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -38,7 +38,7 @@ static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 	/* Look in glock's list of holders for one with current task as owner */
 	spin_lock(&gl->gl_spin);
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		if (gh->gh_owner == current) {
+		if (gh->gh_owner_pid == current->pid) {
 			locked = 1;
 			break;
 		}
@@ -67,7 +67,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 {
 	int ret;
 	spin_lock(&gl->gl_spin);
-	ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
+	ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
 	spin_unlock(&gl->gl_spin);
 	return ret;
 }
@@ -135,5 +135,9 @@ void gfs2_scand_internal(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
 
 int __init gfs2_glock_init(void);
+int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+int gfs2_register_debugfs(void);
+void gfs2_unregister_debugfs(void);
 
 #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 49f0dbf40d8..d995441373a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -115,11 +115,8 @@ enum {
 	/* Actions */
 	HIF_MUTEX		= 0,
 	HIF_PROMOTE		= 1,
-	HIF_DEMOTE		= 2,
 
 	/* States */
-	HIF_ALLOCED		= 4,
-	HIF_DEALLOC		= 5,
 	HIF_HOLDER		= 6,
 	HIF_FIRST		= 7,
 	HIF_ABORTED		= 9,
@@ -130,7 +127,7 @@ struct gfs2_holder {
 	struct list_head gh_list;
 
 	struct gfs2_glock *gh_gl;
-	struct task_struct *gh_owner;
+	pid_t gh_owner_pid;
 	unsigned int gh_state;
 	unsigned gh_flags;
 
@@ -142,8 +139,8 @@ struct gfs2_holder {
 enum {
 	GLF_LOCK		= 1,
 	GLF_STICKY		= 2,
+	GLF_DEMOTE		= 3,
 	GLF_DIRTY		= 5,
-	GLF_SKIP_WAITERS2	= 6,
 };
 
 struct gfs2_glock {
@@ -156,11 +153,12 @@ struct gfs2_glock {
 
 	unsigned int gl_state;
 	unsigned int gl_hash;
-	struct task_struct *gl_owner;
+	unsigned int gl_demote_state; /* state requested by remote node */
+	unsigned long gl_demote_time; /* time of first demote request */
+	pid_t gl_owner_pid;
 	unsigned long gl_ip;
 	struct list_head gl_holders;
 	struct list_head gl_waiters1;	/* HIF_MUTEX */
-	struct list_head gl_waiters2;	/* HIF_DEMOTE */
 	struct list_head gl_waiters3;	/* HIF_PROMOTE */
 
 	const struct gfs2_glock_operations *gl_ops;
@@ -611,6 +609,8 @@ struct gfs2_sbd {
 
 	unsigned long sd_last_warning;
 	struct vfsmount *sd_gfs2mnt;
+	struct dentry *debugfs_dir;    /* debugfs directory */
+	struct dentry *debugfs_dentry_glocks; /* for debugfs */
 };
 
 #endif /* __INCORE_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index b167addf9fd..c305255bfe8 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -151,7 +151,7 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
 
 /* make_strname - convert GFS lock numbers to a string */
 
-static inline void make_strname(struct lm_lockname *lockname,
+static inline void make_strname(const struct lm_lockname *lockname,
 				struct gdlm_strname *str)
 {
 	sprintf(str->name, "%8x%16llx", lockname->ln_type,
@@ -169,6 +169,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
 		return -ENOMEM;
 
 	lp->lockname = *name;
+	make_strname(name, &lp->strname);
 	lp->ls = ls;
 	lp->cur = DLM_LOCK_IV;
 	lp->lvb = NULL;
@@ -227,7 +228,6 @@ void gdlm_put_lock(void *lock)
 unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 {
 	struct gdlm_ls *ls = lp->ls;
-	struct gdlm_strname str;
 	int error, bast = 1;
 
 	/*
@@ -249,8 +249,6 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 	if (test_bit(LFL_NOBAST, &lp->flags))
 		bast = 0;
 
-	make_strname(&lp->lockname, &str);
-
 	set_bit(LFL_ACTIVE, &lp->flags);
 
 	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
@@ -258,8 +256,8 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 		  lp->cur, lp->req, lp->lkf);
 
 	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
-			 str.name, str.namelen, 0, gdlm_ast, lp,
-			 bast ? gdlm_bast : NULL);
+			 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
+			 lp, bast ? gdlm_bast : NULL);
 
 	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
 		lp->lksb.sb_status = -EAGAIN;
@@ -268,7 +266,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 	}
 
 	if (error) {
-		log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
+		log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  lp->cur, lp->req, lp->lkf, lp->flags);
@@ -296,7 +294,7 @@ static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
 	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
 
 	if (error) {
-		log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
+		log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  lp->cur, lp->req, lp->lkf, lp->flags);
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a87c7bf3c56..d074c6e6f9b 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -36,7 +36,7 @@
 
 #define GDLM_STRNAME_BYTES	24
 #define GDLM_LVB_SIZE		32
-#define GDLM_DROP_COUNT		200000
+#define GDLM_DROP_COUNT		0
 #define GDLM_DROP_PERIOD	60
 #define GDLM_NAME_LEN		128
 
@@ -106,6 +106,7 @@ enum {
 struct gdlm_lock {
 	struct gdlm_ls		*ls;
 	struct lm_lockname	lockname;
+	struct gdlm_strname	strname;
 	char			*lvb;
 	struct dlm_lksb		lksb;
 
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index 1dd4215b83d..f82495e18c2 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -25,6 +25,15 @@ struct plock_op {
 	struct gdlm_plock_info info;
 };
 
+struct plock_xop {
+	struct plock_op xop;
+	void *callback;
+	void *fl;
+	void *file;
+	struct file_lock flc;
+};
+
+
 static inline void set_version(struct gdlm_plock_info *info)
 {
 	info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
@@ -64,12 +73,14 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
 {
 	struct gdlm_ls *ls = lockspace;
 	struct plock_op *op;
+	struct plock_xop *xop;
 	int rv;
 
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
-	if (!op)
+	xop = kzalloc(sizeof(*xop), GFP_KERNEL);
+	if (!xop)
 		return -ENOMEM;
 
+	op = &xop->xop;
 	op->info.optype		= GDLM_PLOCK_OP_LOCK;
 	op->info.pid		= fl->fl_pid;
 	op->info.ex		= (fl->fl_type == F_WRLCK);
@@ -79,9 +90,21 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
 	op->info.owner		= (__u64)(long) fl->fl_owner;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+		xop->callback	= fl->fl_lmops->fl_grant;
+		locks_init_lock(&xop->flc);
+		locks_copy_lock(&xop->flc, fl);
+		xop->fl		= fl;
+		xop->file	= file;
+	} else
+		xop->callback	= NULL;
 
 	send_op(op);
-	wait_event(recv_wq, (op->done != 0));
+
+	if (xop->callback == NULL)
+		wait_event(recv_wq, (op->done != 0));
+	else
+		return -EINPROGRESS;
 
 	spin_lock(&ops_lock);
 	if (!list_empty(&op->list)) {
@@ -99,7 +122,63 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
 				  (unsigned long long)name->ln_number);
 	}
 
-	kfree(op);
+	kfree(xop);
+	return rv;
+}
+
+/* Returns failure iff a succesful lock operation should be canceled */
+static int gdlm_plock_callback(struct plock_op *op)
+{
+	struct file *file;
+	struct file_lock *fl;
+	struct file_lock *flc;
+	int (*notify)(void *, void *, int) = NULL;
+	struct plock_xop *xop = (struct plock_xop *)op;
+	int rv = 0;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		printk(KERN_INFO "plock op on list\n");
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	/* check if the following 2 are still valid or make a copy */
+	file = xop->file;
+	flc = &xop->flc;
+	fl = xop->fl;
+	notify = xop->callback;
+
+	if (op->info.rv) {
+		notify(flc, NULL, op->info.rv);
+		goto out;
+	}
+
+	/* got fs lock; bookkeep locally as well: */
+	flc->fl_flags &= ~FL_SLEEP;
+	if (posix_lock_file(file, flc, NULL)) {
+		/*
+		 * This can only happen in the case of kmalloc() failure.
+		 * The filesystem's own lock is the authoritative lock,
+		 * so a failure to get the lock locally is not a disaster.
+		 * As long as GFS cannot reliably cancel locks (especially
+		 * in a low-memory situation), we're better off ignoring
+		 * this failure than trying to recover.
+		 */
+		log_error("gdlm_plock: vfs lock error file %p fl %p",
+				file, fl);
+	}
+
+	rv = notify(flc, NULL, 0);
+	if (rv) {
+		/* XXX: We need to cancel the fs lock here: */
+		printk("gfs2 lock granted after lock request failed;"
+						" dangling lock!\n");
+		goto out;
+	}
+
+out:
+	kfree(xop);
 	return rv;
 }
 
@@ -138,6 +217,9 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
 
 	rv = op->info.rv;
 
+	if (rv == -ENOENT)
+		rv = 0;
+
 	kfree(op);
 	return rv;
 }
@@ -161,6 +243,7 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
 
+
 	send_op(op);
 	wait_event(recv_wq, (op->done != 0));
 
@@ -173,9 +256,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
 
 	rv = op->info.rv;
 
-	if (rv == 0)
-		fl->fl_type = F_UNLCK;
-	else if (rv > 0) {
+	fl->fl_type = F_UNLCK;
+	if (rv == -ENOENT)
+		rv = 0;
+	else if (rv == 0 && op->info.pid != fl->fl_pid) {
 		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
 		fl->fl_pid = op->info.pid;
 		fl->fl_start = op->info.start;
@@ -243,9 +327,14 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 	}
 	spin_unlock(&ops_lock);
 
-	if (found)
-		wake_up(&recv_wq);
-	else
+	if (found) {
+		struct plock_xop *xop;
+		xop = (struct plock_xop *)op;
+		if (xop->callback)
+			count = gdlm_plock_callback(op);
+		else
+			wake_up(&recv_wq);
+	} else
 		printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
 			(unsigned long long)info.number);
 	return count;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4746b884662..d9fe3ca40e1 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -190,7 +190,6 @@ static struct kobj_type gdlm_ktype = {
 };
 
 static struct kset gdlm_kset = {
-	.subsys = &kernel_subsys,
 	.kobj   = {.name = "lock_dlm",},
 	.ktype  = &gdlm_ktype,
 };
@@ -225,6 +224,7 @@ int gdlm_sysfs_init(void)
 {
 	int error;
 
+	kobj_set_kset_s(&gdlm_kset, kernel_subsys);
 	error = kset_register(&gdlm_kset);
 	if (error)
 		printk("lock_dlm: cannot register kset %d\n", error);
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index acfbc941f31..5cc1dfa7944 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -164,13 +164,7 @@ static void nolock_unhold_lvb(void *lock, char *lvb)
 static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
 			    struct file *file, struct file_lock *fl)
 {
-	struct file_lock tmp;
-	int ret;
-
-	ret = posix_test_lock(file, fl, &tmp);
-	fl->fl_type = F_UNLCK;
-	if (ret)
-		memcpy(fl, &tmp, sizeof(struct file_lock));
+	posix_test_lock(file, fl);
 
 	return 0;
 }
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 16bb4b4561a..f82d84d05d2 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -33,16 +33,17 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 	tr->tr_touched = 1;
 
-	if (!list_empty(&le->le_list))
-		return;
-
 	gl = container_of(le, struct gfs2_glock, gl_le);
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
 		return;
-	gfs2_glock_hold(gl);
-	set_bit(GLF_DIRTY, &gl->gl_flags);
 
 	gfs2_log_lock(sdp);
+	if (!list_empty(&le->le_list)){
+		gfs2_log_unlock(sdp);
+		return;
+	}
+	gfs2_glock_hold(gl);
+	set_bit(GLF_DIRTY, &gl->gl_flags);
 	sdp->sd_log_num_gl++;
 	list_add(&le->le_list, &sdp->sd_log_le_gl);
 	gfs2_log_unlock(sdp);
@@ -415,13 +416,14 @@ static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 	tr->tr_touched = 1;
 
-	if (!list_empty(&le->le_list))
-		return;
-
 	rgd = container_of(le, struct gfs2_rgrpd, rd_le);
-	gfs2_rgrp_bh_hold(rgd);
 
 	gfs2_log_lock(sdp);
+	if (!list_empty(&le->le_list)){
+		gfs2_log_unlock(sdp);
+		return;
+	}
+	gfs2_rgrp_bh_hold(rgd);
 	sdp->sd_log_num_rg++;
 	list_add(&le->le_list, &sdp->sd_log_le_rg);
 	gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 6e8a59809ab..e460487c055 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -27,8 +27,7 @@
 static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct gfs2_inode *ip = foo;
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		inode_init_once(&ip->i_inode);
 		spin_lock_init(&ip->i_spin);
 		init_rwsem(&ip->i_rw_mutex);
@@ -39,13 +38,11 @@ static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned
 static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct gfs2_glock *gl = foo;
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		INIT_HLIST_NODE(&gl->gl_list);
 		spin_lock_init(&gl->gl_spin);
 		INIT_LIST_HEAD(&gl->gl_holders);
 		INIT_LIST_HEAD(&gl->gl_waiters1);
-		INIT_LIST_HEAD(&gl->gl_waiters2);
 		INIT_LIST_HEAD(&gl->gl_waiters3);
 		gl->gl_lvb = NULL;
 		atomic_set(&gl->gl_lvb_count, 0);
@@ -103,6 +100,8 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
+	gfs2_register_debugfs();
+
 	printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
 
 	return 0;
@@ -130,6 +129,7 @@ fail:
 
 static void __exit exit_gfs2_fs(void)
 {
+	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 32caecd2030..4864659555d 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/parser.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -20,6 +21,52 @@
 #include "sys.h"
 #include "util.h"
 
+enum {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_nodebug,
+	Opt_upgrade,
+	Opt_num_glockd,
+	Opt_acl,
+	Opt_noacl,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+	Opt_suiddir,
+	Opt_nosuiddir,
+	Opt_data_writeback,
+	Opt_data_ordered,
+};
+
+static match_table_t tokens = {
+	{Opt_lockproto, "lockproto=%s"},
+	{Opt_locktable, "locktable=%s"},
+	{Opt_hostdata, "hostdata=%s"},
+	{Opt_spectator, "spectator"},
+	{Opt_ignore_local_fs, "ignore_local_fs"},
+	{Opt_localflocks, "localflocks"},
+	{Opt_localcaching, "localcaching"},
+	{Opt_debug, "debug"},
+	{Opt_nodebug, "nodebug"},
+	{Opt_upgrade, "upgrade"},
+	{Opt_num_glockd, "num_glockd=%d"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_quota_off, "quota=off"},
+	{Opt_quota_account, "quota=account"},
+	{Opt_quota_on, "quota=on"},
+	{Opt_suiddir, "suiddir"},
+	{Opt_nosuiddir, "nosuiddir"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_ordered, "data=ordered"}
+};
+
 /**
  * gfs2_mount_args - Parse mount options
  * @sdp:
@@ -54,146 +101,150 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	   process them */
 
 	for (options = data; (o = strsep(&options, ",")); ) {
+		int token, option;
+		substring_t tmp[MAX_OPT_ARGS];
+
 		if (!*o)
 			continue;
 
-		v = strchr(o, '=');
-		if (v)
-			*v++ = 0;
+		token = match_token(o, tokens, tmp);
+		switch (token) {
+		case Opt_lockproto:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for lockproto\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
-		if (!strcmp(o, "lockproto")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_lockproto))
+			if (remount && strcmp(v, args->ar_lockproto)) {
+				kfree(v);
 				goto cant_remount;
+			}
+			
 			strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
 			args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
+			kfree(v);
+			break;
+		case Opt_locktable:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for locktable\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
-		else if (!strcmp(o, "locktable")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_locktable))
+			if (remount && strcmp(v, args->ar_locktable)) {
+				kfree(v);
 				goto cant_remount;
+			}
+
 			strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
-			args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
+			args->ar_locktable[GFS2_LOCKNAME_LEN - 1]  = 0;
+			kfree(v);
+			break;
+		case Opt_hostdata:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for hostdata\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
-		else if (!strcmp(o, "hostdata")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_hostdata))
+			if (remount && strcmp(v, args->ar_hostdata)) {
+				kfree(v);
 				goto cant_remount;
+			}
+
 			strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
 			args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
-
-		else if (!strcmp(o, "spectator")) {
+			kfree(v);
+			break;
+		case Opt_spectator:
 			if (remount && !args->ar_spectator)
 				goto cant_remount;
 			args->ar_spectator = 1;
 			sdp->sd_vfs->s_flags |= MS_RDONLY;
-		}
-
-		else if (!strcmp(o, "ignore_local_fs")) {
+			break;
+		case Opt_ignore_local_fs:
 			if (remount && !args->ar_ignore_local_fs)
 				goto cant_remount;
 			args->ar_ignore_local_fs = 1;
-		}
-
-		else if (!strcmp(o, "localflocks")) {
+			break;
+		case Opt_localflocks:
 			if (remount && !args->ar_localflocks)
 				goto cant_remount;
 			args->ar_localflocks = 1;
-		}
-
-		else if (!strcmp(o, "localcaching")) {
+			break;
+		case Opt_localcaching:
 			if (remount && !args->ar_localcaching)
 				goto cant_remount;
 			args->ar_localcaching = 1;
-		}
-
-		else if (!strcmp(o, "debug"))
+			break;
+		case Opt_debug:
 			args->ar_debug = 1;
-
-		else if (!strcmp(o, "nodebug"))
+			break;
+		case Opt_nodebug:
 			args->ar_debug = 0;
-
-		else if (!strcmp(o, "upgrade")) {
+			break;
+		case Opt_upgrade:
 			if (remount && !args->ar_upgrade)
 				goto cant_remount;
 			args->ar_upgrade = 1;
-		}
+			break;
+		case Opt_num_glockd:
+			if ((error = match_int(&tmp[0], &option))) {
+				fs_info(sdp, "problem getting num_glockd\n");
+				goto out_error;
+			}
 
-		else if (!strcmp(o, "num_glockd")) {
-			unsigned int x;
-			if (!v)
-				goto need_value;
-			sscanf(v, "%u", &x);
-			if (remount && x != args->ar_num_glockd)
+			if (remount && option != args->ar_num_glockd)
 				goto cant_remount;
-			if (!x || x > GFS2_GLOCKD_MAX) {
+			if (!option || option > GFS2_GLOCKD_MAX) {
 				fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
-				        GFS2_GLOCKD_MAX, x);
+				        GFS2_GLOCKD_MAX, option);
 				error = -EINVAL;
-				break;
+				goto out_error;
 			}
-			args->ar_num_glockd = x;
-		}
-
-		else if (!strcmp(o, "acl")) {
+			args->ar_num_glockd = option;
+			break;
+		case Opt_acl:
 			args->ar_posix_acl = 1;
 			sdp->sd_vfs->s_flags |= MS_POSIXACL;
-		}
-
-		else if (!strcmp(o, "noacl")) {
+			break;
+		case Opt_noacl:
 			args->ar_posix_acl = 0;
 			sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
-		}
-
-		else if (!strcmp(o, "quota")) {
-			if (!v)
-				goto need_value;
-			if (!strcmp(v, "off"))
-				args->ar_quota = GFS2_QUOTA_OFF;
-			else if (!strcmp(v, "account"))
-				args->ar_quota = GFS2_QUOTA_ACCOUNT;
-			else if (!strcmp(v, "on"))
-				args->ar_quota = GFS2_QUOTA_ON;
-			else {
-				fs_info(sdp, "invalid value for quota\n");
-				error = -EINVAL;
-				break;
-			}
-		}
-
-		else if (!strcmp(o, "suiddir"))
+			break;
+		case Opt_quota_off:
+			args->ar_quota = GFS2_QUOTA_OFF;
+			break;
+		case Opt_quota_account:
+			args->ar_quota = GFS2_QUOTA_ACCOUNT;
+			break;
+		case Opt_quota_on:
+			args->ar_quota = GFS2_QUOTA_ON;
+			break;
+		case Opt_suiddir:
 			args->ar_suiddir = 1;
-
-		else if (!strcmp(o, "nosuiddir"))
+			break;
+		case Opt_nosuiddir:
 			args->ar_suiddir = 0;
-
-		else if (!strcmp(o, "data")) {
-			if (!v)
-				goto need_value;
-			if (!strcmp(v, "writeback"))
-				args->ar_data = GFS2_DATA_WRITEBACK;
-			else if (!strcmp(v, "ordered"))
-				args->ar_data = GFS2_DATA_ORDERED;
-			else {
-				fs_info(sdp, "invalid value for data\n");
-				error = -EINVAL;
-				break;
-			}
-		}
-
-		else {
+			break;
+		case Opt_data_writeback:
+			args->ar_data = GFS2_DATA_WRITEBACK;
+			break;
+		case Opt_data_ordered:
+			args->ar_data = GFS2_DATA_ORDERED;
+			break;
+		default:
 			fs_info(sdp, "unknown option: %s\n", o);
 			error = -EINVAL;
-			break;
+			goto out_error;
 		}
 	}
 
+out_error:
 	if (error)
 		fs_info(sdp, "invalid mount option(s)\n");
 
@@ -202,10 +253,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 
 	return error;
 
-need_value:
-	fs_info(sdp, "need value for option %s\n", o);
-	return -EINVAL;
-
 cant_remount:
 	fs_info(sdp, "can't remount with option %s\n", o);
 	return -EINVAL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index b3b7e847535..30c15622174 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -197,7 +197,19 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	void *kaddr;
 	int error;
 
-	BUG_ON(page->index);
+	/*
+	 * Due to the order of unstuffing files and ->nopage(), we can be
+	 * asked for a zero page in the case of a stuffed file being extended,
+	 * so we need to supply one here. It doesn't happen often.
+	 */
+	if (unlikely(page->index)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr, 0, PAGE_CACHE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		return 0;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
@@ -208,9 +220,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	       ip->i_di.di_size);
 	memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
 	kunmap_atomic(kaddr, KM_USER0);
-
+	flush_dcache_page(page);
 	brelse(dibh);
-
 	SetPageUptodate(page);
 
 	return 0;
@@ -507,7 +518,9 @@ static int gfs2_commit_write(struct file *file, struct page *page,
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
 	}
+	unlock_page(page);
 	gfs2_glock_dq_m(1, &ip->i_gh);
+	lock_page(page);
 	gfs2_holder_uninit(&ip->i_gh);
 	return 0;
 
@@ -520,7 +533,9 @@ fail_endtrans:
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
 	}
+	unlock_page(page);
 	gfs2_glock_dq_m(1, &ip->i_gh);
+	lock_page(page);
 	gfs2_holder_uninit(&ip->i_gh);
 fail_nounlock:
 	ClearPageUptodate(page);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index b50180e2277..329c4dcdecd 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -513,18 +513,18 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (sdp->sd_args.ar_localflocks) {
 		if (IS_GETLK(cmd)) {
-			struct file_lock tmp;
-			int ret;
-			ret = posix_test_lock(file, fl, &tmp);
-			fl->fl_type = F_UNLCK;
-			if (ret)
-				memcpy(fl, &tmp, sizeof(struct file_lock));
+			posix_test_lock(file, fl);
 			return 0;
 		} else {
 			return posix_lock_file_wait(file, fl);
 		}
 	}
 
+	if (cmd == F_CANCELLK) {
+		/* Hack: */
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
 	if (IS_GETLK(cmd))
 		return gfs2_lm_plock_get(sdp, &name, file, fl);
 	else if (fl->fl_type == F_UNLCK)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ee54cb66708..2c5f8e7def0 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -690,6 +690,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	if (error)
 		goto fail;
 
+	gfs2_create_debugfs_file(sdp);
+
 	error = gfs2_sys_fs_add(sdp);
 	if (error)
 		goto fail;
@@ -754,6 +756,7 @@ fail_lm:
 fail_sys:
 	gfs2_sys_fs_del(sdp);
 fail:
+	gfs2_delete_debugfs_file(sdp);
 	kfree(sdp);
 	sb->s_fs_info = NULL;
 	return error;
@@ -896,6 +899,7 @@ error:
 
 static void gfs2_kill_sb(struct super_block *sb)
 {
+	gfs2_delete_debugfs_file(sb->s_fs_info);
 	kill_block_super(sb);
 }
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index b89999d3a76..485ce3d4992 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -284,6 +284,31 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 }
 
 /**
+ * gfs2_drop_inode - Drop an inode (test for remote unlink)
+ * @inode: The inode to drop
+ *
+ * If we've received a callback on an iopen lock then its because a
+ * remote node tried to deallocate the inode but failed due to this node
+ * still having the inode open. Here we mark the link count zero
+ * since we know that it must have reached zero if the GLF_DEMOTE flag
+ * is set on the iopen glock. If we didn't do a disk read since the
+ * remote node removed the final link then we might otherwise miss
+ * this event. This check ensures that this node will deallocate the
+ * inode's blocks, or alternatively pass the baton on to another
+ * node for later deallocation.
+ */
+static void gfs2_drop_inode(struct inode *inode)
+{
+	if (inode->i_private && inode->i_nlink) {
+		struct gfs2_inode *ip = GFS2_I(inode);
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+			clear_nlink(inode);
+	}
+	generic_drop_inode(inode);
+}
+
+/**
  * gfs2_clear_inode - Deallocate an inode when VFS is done with it
  * @inode: The VFS inode
  *
@@ -441,7 +466,7 @@ out_unlock:
 out_uninit:
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
-	if (error)
+	if (error && error != GLR_TRYFAILED)
 		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
 out:
 	truncate_inode_pages(&inode->i_data, 0);
@@ -481,6 +506,7 @@ const struct super_operations gfs2_super_ops = {
 	.statfs			= gfs2_statfs,
 	.remount_fs		= gfs2_remount_fs,
 	.clear_inode		= gfs2_clear_inode,
+	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 };
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8d9c08b5c4b..1727f5012ef 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -27,6 +27,7 @@
 #include "trans.h"
 #include "ops_file.h"
 #include "util.h"
+#include "log.h"
 
 #define BFITNOENT ((u32)~0)
 
@@ -697,8 +698,6 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
  * @al: the struct gfs2_alloc structure describing the reservation
  *
  * If there's room for the requested blocks to be allocated from the RG:
- *   Sets the $al_reserved_data field in @al.
- *   Sets the $al_reserved_meta field in @al.
  *   Sets the $al_rgd field in @al.
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
@@ -709,6 +708,9 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	int ret = 0;
 
+	if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
+		return 0;
+
 	spin_lock(&sdp->sd_rindex_spin);
 	if (rgd->rd_free_clone >= al->al_requested) {
 		al->al_rgd = rgd;
@@ -941,9 +943,13 @@ static int get_local_rgrp(struct gfs2_inode *ip)
 			rgd = gfs2_rgrpd_get_first(sdp);
 
 		if (rgd == begin) {
-			if (++loops >= 2 || !skipped)
+			if (++loops >= 3)
 				return -ENOSPC;
+			if (!skipped)
+				loops++;
 			flags = 0;
+			if (loops == 2)
+				gfs2_log_flush(sdp, NULL);
 		}
 	}
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d01f9f0fda2..c26c21b53c1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -222,7 +222,6 @@ static struct kobj_type gfs2_ktype = {
 };
 
 static struct kset gfs2_kset = {
-	.subsys = &fs_subsys,
 	.kobj   = {.name = "gfs2"},
 	.ktype  = &gfs2_ktype,
 };
@@ -554,6 +553,7 @@ int gfs2_sys_init(void)
 {
 	gfs2_sys_margs = NULL;
 	spin_lock_init(&gfs2_sys_margs_lock);
+	kobj_set_kset_s(&gfs2_kset, fs_subsys);
 	return kset_register(&gfs2_kset);
 }
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 623f509f1d4..4f1888f16cf 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -434,7 +434,7 @@ static void hfs_init_once(void *p, struct kmem_cache *cachep, unsigned long flag
 {
 	struct hfs_inode_info *i = p;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&i->vfs_inode);
 }
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1a97f929344..37afbec8a76 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -470,7 +470,7 @@ static void hfsplus_init_once(void *p, struct kmem_cache *cachep, unsigned long
 {
 	struct hfsplus_inode_info *i = p;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&i->vfs_inode);
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index e0174e33852..1b95f39fbc3 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -176,8 +176,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		mutex_init(&ei->i_mutex);
 		mutex_init(&ei->i_parent_mutex);
 		inode_init_once(&ei->vfs_inode);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c718a3d413..98959b87cdf 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/backing-dev.h>
 #include <linux/hugetlb.h>
 #include <linux/pagevec.h>
+#include <linux/mman.h>
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
@@ -98,10 +99,7 @@ out:
  * Called under down_write(mmap_sem).
  */
 
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags);
-#else
+#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 static unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -115,6 +113,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
+	if (flags & MAP_FIXED) {
+		if (prepare_hugepage_range(addr, len, pgoff))
+			return -EINVAL;
+		return addr;
+	}
+
 	if (addr) {
 		addr = ALIGN(addr, HPAGE_SIZE);
 		vma = find_vma(mm, addr);
@@ -453,7 +457,7 @@ static int hugetlbfs_symlink(struct inode *dir,
  */
 static int hugetlbfs_set_page_dirty(struct page *page)
 {
-	struct page *head = (struct page *)page_private(page);
+	struct page *head = compound_head(page);
 
 	SetPageDirty(head);
 	return 0;
@@ -552,8 +556,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
 
@@ -744,6 +747,9 @@ struct file *hugetlb_zero_setup(size_t size)
 	char buf[16];
 	static atomic_t counter;
 
+	if (!hugetlbfs_vfsmount)
+		return ERR_PTR(-ENOENT);
+
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
diff --git a/fs/inode.c b/fs/inode.c
index 5abb097ab1b..b4296bf6273 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -213,8 +213,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct inode * inode = (struct inode *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(inode);
 }
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 64a96cdfe3a..e99f7ff4ecb 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -77,8 +77,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
 {
 	struct iso_inode_info *ei = foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index e51164a8a8d..45368f8bbe7 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -47,8 +47,7 @@ static void jffs2_i_init_once(void * foo, struct kmem_cache * cachep, unsigned l
 {
 	struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		init_MUTEX(&ei->sem);
 		inode_init_once(&ei->vfs_inode);
 	}
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 58deae00750..6b3acb0b578 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -184,8 +184,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct metapage *mp = (struct metapage *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		mp->lid = 0;
 		mp->lsn = 0;
 		mp->flag = 0;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 52d73d54a93..ea9dc3e65dc 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -752,8 +752,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
 {
 	struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		memset(jfs_ip, 0, sizeof(struct jfs_inode_info));
 		INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
 		init_rwsem(&jfs_ip->rdwrlock);
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index eb243edf893..2102e2d0134 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -225,16 +225,13 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 #define SM_monres_sz	2
 #define SM_unmonres_sz	1
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 static struct rpc_procinfo	nsm_procedures[] = {
 [SM_MON] = {
 		.p_proc		= SM_MON,
 		.p_encode	= (kxdrproc_t) xdr_encode_mon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat_res,
-		.p_bufsiz	= MAX(SM_mon_sz, SM_monres_sz) << 2,
+		.p_arglen	= SM_mon_sz,
+		.p_replen	= SM_monres_sz,
 		.p_statidx	= SM_MON,
 		.p_name		= "MONITOR",
 	},
@@ -242,7 +239,8 @@ static struct rpc_procinfo	nsm_procedures[] = {
 		.p_proc		= SM_UNMON,
 		.p_encode	= (kxdrproc_t) xdr_encode_unmon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat,
-		.p_bufsiz	= MAX(SM_mon_id_sz, SM_unmonres_sz) << 2,
+		.p_arglen	= SM_mon_id_sz,
+		.p_replen	= SM_unmonres_sz,
 		.p_statidx	= SM_UNMON,
 		.p_name		= "UNMONITOR",
 	},
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 47a66aa5d55..bf27b6c6cb6 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -99,7 +99,9 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
-	resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock);
+	resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: TEST4          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
@@ -143,6 +145,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now try to lock the file */
 	resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
 					argp->block, &argp->cookie);
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf51f849e76..b3efa4536cc 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -173,7 +173,7 @@ found:
  */
 static inline struct nlm_block *
 nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
-				struct nlm_lock *lock, struct nlm_cookie *cookie)
+		struct nlm_lock *lock, struct nlm_cookie *cookie)
 {
 	struct nlm_block	*block;
 	struct nlm_host		*host;
@@ -210,6 +210,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	block->b_daemon = rqstp->rq_server;
 	block->b_host   = host;
 	block->b_file   = file;
+	block->b_fl = NULL;
 	file->f_count++;
 
 	/* Add to file's list of blocks */
@@ -261,6 +262,7 @@ static void nlmsvc_free_block(struct kref *kref)
 	nlmsvc_freegrantargs(block->b_call);
 	nlm_release_call(block->b_call);
 	nlm_release_file(block->b_file);
+	kfree(block->b_fl);
 	kfree(block);
 }
 
@@ -331,6 +333,31 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
 }
 
 /*
+ * Deferred lock request handling for non-blocking lock
+ */
+static u32
+nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
+{
+	u32 status = nlm_lck_denied_nolocks;
+
+	block->b_flags |= B_QUEUED;
+
+	nlmsvc_insert_block(block, NLM_TIMEOUT);
+
+	block->b_cache_req = &rqstp->rq_chandle;
+	if (rqstp->rq_chandle.defer) {
+		block->b_deferred_req =
+			rqstp->rq_chandle.defer(block->b_cache_req);
+		if (block->b_deferred_req != NULL)
+			status = nlm_drop_reply;
+	}
+	dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n",
+		block, block->b_flags, status);
+
+	return status;
+}
+
+/*
  * Attempt to establish a lock, and if it can't be granted, block it
  * if required.
  */
@@ -338,7 +365,7 @@ __be32
 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
 {
-	struct nlm_block	*block, *newblock = NULL;
+	struct nlm_block	*block = NULL;
 	int			error;
 	__be32			ret;
 
@@ -351,29 +378,58 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				wait);
 
 
-	lock->fl.fl_flags &= ~FL_SLEEP;
-again:
 	/* Lock file against concurrent access */
 	mutex_lock(&file->f_mutex);
-	/* Get existing block (in case client is busy-waiting) */
+	/* Get existing block (in case client is busy-waiting)
+	 * or create new block
+	 */
 	block = nlmsvc_lookup_block(file, lock);
 	if (block == NULL) {
-		if (newblock != NULL)
-			lock = &newblock->b_call->a_args.lock;
-	} else
+		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		ret = nlm_lck_denied_nolocks;
+		if (block == NULL)
+			goto out;
 		lock = &block->b_call->a_args.lock;
+	} else
+		lock->fl.fl_flags &= ~FL_SLEEP;
 
-	error = posix_lock_file(file->f_file, &lock->fl);
-	lock->fl.fl_flags &= ~FL_SLEEP;
+	if (block->b_flags & B_QUEUED) {
+		dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n",
+							block, block->b_flags);
+		if (block->b_granted) {
+			nlmsvc_unlink_block(block);
+			ret = nlm_granted;
+			goto out;
+		}
+		if (block->b_flags & B_TIMED_OUT) {
+			nlmsvc_unlink_block(block);
+			ret = nlm_lck_denied;
+			goto out;
+		}
+		ret = nlm_drop_reply;
+		goto out;
+	}
 
-	dprintk("lockd: posix_lock_file returned %d\n", error);
+	if (!wait)
+		lock->fl.fl_flags &= ~FL_SLEEP;
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+	lock->fl.fl_flags &= ~FL_SLEEP;
 
+	dprintk("lockd: vfs_lock_file returned %d\n", error);
 	switch(error) {
 		case 0:
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
+			ret = nlm_lck_denied;
 			break;
+		case -EINPROGRESS:
+			if (wait)
+				break;
+			/* Filesystem lock operation is in progress
+			   Add it to the queue waiting for callback */
+			ret = nlmsvc_defer_lock_rqst(rqstp, block);
+			goto out;
 		case -EDEADLK:
 			ret = nlm_deadlock;
 			goto out;
@@ -387,26 +443,11 @@ again:
 		goto out;
 
 	ret = nlm_lck_blocked;
-	if (block != NULL)
-		goto out;
-
-	/* If we don't have a block, create and initialize it. Then
-	 * retry because we may have slept in kmalloc. */
-	/* We have to release f_mutex as nlmsvc_create_block may try to
-	 * to claim it while doing host garbage collection */
-	if (newblock == NULL) {
-		mutex_unlock(&file->f_mutex);
-		dprintk("lockd: blocking on this lock (allocating).\n");
-		if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie)))
-			return nlm_lck_denied_nolocks;
-		goto again;
-	}
 
 	/* Append to list of blocked */
-	nlmsvc_insert_block(newblock, NLM_NEVER);
+	nlmsvc_insert_block(block, NLM_NEVER);
 out:
 	mutex_unlock(&file->f_mutex);
-	nlmsvc_release_block(newblock);
 	nlmsvc_release_block(block);
 	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
 	return ret;
@@ -416,9 +457,14 @@ out:
  * Test for presence of a conflicting lock.
  */
 __be32
-nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
-				       struct nlm_lock *conflock)
+nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+		struct nlm_lock *lock, struct nlm_lock *conflock,
+		struct nlm_cookie *cookie)
 {
+	struct nlm_block 	*block = NULL;
+	int			error;
+	__be32			ret;
+
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				file->f_file->f_path.dentry->d_inode->i_sb->s_id,
 				file->f_file->f_path.dentry->d_inode->i_ino,
@@ -426,19 +472,70 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
-	if (posix_test_lock(file->f_file, &lock->fl, &conflock->fl)) {
-		dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
-				conflock->fl.fl_type,
-				(long long)conflock->fl.fl_start,
-				(long long)conflock->fl.fl_end);
-		conflock->caller = "somehost";	/* FIXME */
-		conflock->len = strlen(conflock->caller);
-		conflock->oh.len = 0;		/* don't return OH info */
-		conflock->svid = conflock->fl.fl_pid;
-		return nlm_lck_denied;
+	/* Get existing block (in case client is busy-waiting) */
+	block = nlmsvc_lookup_block(file, lock);
+
+	if (block == NULL) {
+		struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+
+		if (conf == NULL)
+			return nlm_granted;
+		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		if (block == NULL) {
+			kfree(conf);
+			return nlm_granted;
+		}
+		block->b_fl = conf;
+	}
+	if (block->b_flags & B_QUEUED) {
+		dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n",
+			block, block->b_flags, block->b_fl);
+		if (block->b_flags & B_TIMED_OUT) {
+			nlmsvc_unlink_block(block);
+			return nlm_lck_denied;
+		}
+		if (block->b_flags & B_GOT_CALLBACK) {
+			if (block->b_fl != NULL
+					&& block->b_fl->fl_type != F_UNLCK) {
+				lock->fl = *block->b_fl;
+				goto conf_lock;
+			}
+			else {
+				nlmsvc_unlink_block(block);
+				return nlm_granted;
+			}
+		}
+		return nlm_drop_reply;
 	}
 
-	return nlm_granted;
+	error = vfs_test_lock(file->f_file, &lock->fl);
+	if (error == -EINPROGRESS)
+		return nlmsvc_defer_lock_rqst(rqstp, block);
+	if (error) {
+		ret = nlm_lck_denied_nolocks;
+		goto out;
+	}
+	if (lock->fl.fl_type == F_UNLCK) {
+		ret = nlm_granted;
+		goto out;
+	}
+
+conf_lock:
+	dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
+		lock->fl.fl_type, (long long)lock->fl.fl_start,
+		(long long)lock->fl.fl_end);
+	conflock->caller = "somehost";	/* FIXME */
+	conflock->len = strlen(conflock->caller);
+	conflock->oh.len = 0;		/* don't return OH info */
+	conflock->svid = lock->fl.fl_pid;
+	conflock->fl.fl_type = lock->fl.fl_type;
+	conflock->fl.fl_start = lock->fl.fl_start;
+	conflock->fl.fl_end = lock->fl.fl_end;
+	ret = nlm_lck_denied;
+out:
+	if (block)
+		nlmsvc_release_block(block);
+	return ret;
 }
 
 /*
@@ -464,7 +561,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
 	nlmsvc_cancel_blocked(file, lock);
 
 	lock->fl.fl_type = F_UNLCK;
-	error = posix_lock_file(file->f_file, &lock->fl);
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
 
 	return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
 }
@@ -493,6 +590,8 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 	block = nlmsvc_lookup_block(file, lock);
 	mutex_unlock(&file->f_mutex);
 	if (block != NULL) {
+		vfs_cancel_lock(block->b_file->f_file,
+				&block->b_call->a_args.lock.fl);
 		status = nlmsvc_unlink_block(block);
 		nlmsvc_release_block(block);
 	}
@@ -500,6 +599,63 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 }
 
 /*
+ * This is a callback from the filesystem for VFS file lock requests.
+ * It will be used if fl_grant is defined and the filesystem can not
+ * respond to the request immediately.
+ * For GETLK request it will copy the reply to the nlm_block.
+ * For SETLK or SETLKW request it will get the local posix lock.
+ * In all cases it will move the block to the head of nlm_blocked q where
+ * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the
+ * deferred rpc for GETLK and SETLK.
+ */
+static void
+nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf,
+			     int result)
+{
+	block->b_flags |= B_GOT_CALLBACK;
+	if (result == 0)
+		block->b_granted = 1;
+	else
+		block->b_flags |= B_TIMED_OUT;
+	if (conf) {
+		if (block->b_fl)
+			locks_copy_lock(block->b_fl, conf);
+	}
+}
+
+static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
+					int result)
+{
+	struct nlm_block *block;
+	int rc = -ENOENT;
+
+	lock_kernel();
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
+			dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
+							block, block->b_flags);
+			if (block->b_flags & B_QUEUED) {
+				if (block->b_flags & B_TIMED_OUT) {
+					rc = -ENOLCK;
+					break;
+				}
+				nlmsvc_update_deferred_block(block, conf, result);
+			} else if (result == 0)
+				block->b_granted = 1;
+
+			nlmsvc_insert_block(block, 0);
+			svc_wake_up(block->b_daemon);
+			rc = 0;
+			break;
+		}
+	}
+	unlock_kernel();
+	if (rc == -ENOENT)
+		printk(KERN_WARNING "lockd: grant for unknown block\n");
+	return rc;
+}
+
+/*
  * Unblock a blocked lock request. This is a callback invoked from the
  * VFS layer when a lock on which we blocked is removed.
  *
@@ -531,6 +687,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 struct lock_manager_operations nlmsvc_lock_operations = {
 	.fl_compare_owner = nlmsvc_same_owner,
 	.fl_notify = nlmsvc_notify_blocked,
+	.fl_grant = nlmsvc_grant_deferred,
 };
 
 /*
@@ -553,6 +710,8 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 
 	dprintk("lockd: grant blocked lock %p\n", block);
 
+	kref_get(&block->b_count);
+
 	/* Unlink block request from list */
 	nlmsvc_unlink_block(block);
 
@@ -566,20 +725,23 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 
 	/* Try the lock operation again */
 	lock->fl.fl_flags |= FL_SLEEP;
-	error = posix_lock_file(file->f_file, &lock->fl);
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
 	lock->fl.fl_flags &= ~FL_SLEEP;
 
 	switch (error) {
 	case 0:
 		break;
 	case -EAGAIN:
-		dprintk("lockd: lock still blocked\n");
+	case -EINPROGRESS:
+		dprintk("lockd: lock still blocked error %d\n", error);
 		nlmsvc_insert_block(block, NLM_NEVER);
+		nlmsvc_release_block(block);
 		return;
 	default:
 		printk(KERN_WARNING "lockd: unexpected error %d in %s!\n",
 				-error, __FUNCTION__);
 		nlmsvc_insert_block(block, 10 * HZ);
+		nlmsvc_release_block(block);
 		return;
 	}
 
@@ -592,7 +754,6 @@ callback:
 	nlmsvc_insert_block(block, 30 * HZ);
 
 	/* Call the client */
-	kref_get(&block->b_count);
 	nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, &nlmsvc_grant_ops);
 }
 
@@ -665,6 +826,23 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
 	nlmsvc_release_block(block);
 }
 
+/* Helper function to handle retry of a deferred block.
+ * If it is a blocking lock, call grant_blocked.
+ * For a non-blocking lock or test lock, revisit the request.
+ */
+static void
+retry_deferred_block(struct nlm_block *block)
+{
+	if (!(block->b_flags & B_GOT_CALLBACK))
+		block->b_flags |= B_TIMED_OUT;
+	nlmsvc_insert_block(block, NLM_TIMEOUT);
+	dprintk("revisit block %p flags %d\n",	block, block->b_flags);
+	if (block->b_deferred_req) {
+		block->b_deferred_req->revisit(block->b_deferred_req, 0);
+		block->b_deferred_req = NULL;
+	}
+}
+
 /*
  * Retry all blocked locks that have been notified. This is where lockd
  * picks up locks that can be granted, or grant notifications that must
@@ -688,9 +866,12 @@ nlmsvc_retry_blocked(void)
 
 		dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
 			block, block->b_when);
-		kref_get(&block->b_count);
-		nlmsvc_grant_blocked(block);
-		nlmsvc_release_block(block);
+		if (block->b_flags & B_QUEUED) {
+			dprintk("nlmsvc_retry_blocked delete block (%p, granted=%d, flags=%d)\n",
+				block, block->b_granted, block->b_flags);
+			retry_deferred_block(block);
+		} else
+			nlmsvc_grant_blocked(block);
 	}
 
 	return timeout;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 31cb4842573..9cd5c8b3759 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -33,6 +33,7 @@ cast_to_nlm(__be32 status, u32 vers)
 		case nlm_lck_denied_nolocks:
 		case nlm_lck_blocked:
 		case nlm_lck_denied_grace_period:
+		case nlm_drop_reply:
 			break;
 		case nlm4_deadlock:
 			status = nlm_lck_denied;
@@ -127,7 +128,9 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
-	resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock));
+	resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: TEST          status %d vers %d\n",
 		ntohl(resp->status), rqstp->rq_vers);
@@ -172,6 +175,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now try to lock the file */
 	resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
 					       argp->block, &argp->cookie));
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index c0df00c74ce..84ebba33b98 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -182,7 +182,7 @@ again:
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
-			if (posix_lock_file(file->f_file, &lock) < 0) {
+			if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) {
 				printk("lockd: unlock failure in %s:%d\n",
 						__FILE__, __LINE__);
 				return 1;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 34dae5d7073..9702956d206 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -510,17 +510,20 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	return 0;
 }
 
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+
 /*
  * Buffer requirements for NLM
  */
 #define NLM_void_sz		0
 #define NLM_cookie_sz		1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM_caller_sz		1+XDR_QUADLEN(sizeof(utsname()->nodename))
-#define NLM_netobj_sz		1+XDR_QUADLEN(XDR_MAX_NETOBJ)
-/* #define NLM_owner_sz		1+XDR_QUADLEN(NLM_MAXOWNER) */
+#define NLM_caller_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
+#define NLM_owner_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
 #define NLM_fhandle_sz		1+XDR_QUADLEN(NFS2_FHSIZE)
-#define NLM_lock_sz		3+NLM_caller_sz+NLM_netobj_sz+NLM_fhandle_sz
-#define NLM_holder_sz		4+NLM_netobj_sz
+#define NLM_lock_sz		3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
+#define NLM_holder_sz		4+NLM_owner_sz
 
 #define NLM_testargs_sz		NLM_cookie_sz+1+NLM_lock_sz
 #define NLM_lockargs_sz		NLM_cookie_sz+4+NLM_lock_sz
@@ -531,10 +534,6 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 #define NLM_res_sz		NLM_cookie_sz+1
 #define NLM_norep_sz		0
 
-#ifndef MAX
-# define MAX(a, b)		(((a) > (b))? (a) : (b))
-#endif
-
 /*
  * For NLM, a void procedure really returns nothing
  */
@@ -545,7 +544,8 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	.p_proc      = NLMPROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nlmclt_encode_##argtype,		\
 	.p_decode    = (kxdrproc_t) nlmclt_decode_##restype,		\
-	.p_bufsiz    = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2,	\
+	.p_arglen    = NLM_##argtype##_sz,				\
+	.p_replen    = NLM_##restype##_sz,				\
 	.p_statidx   = NLMPROC_##proc,					\
 	.p_name      = #proc,						\
 	}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index a7824055121..ce1efdbe1b3 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -516,17 +516,24 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	return 0;
 }
 
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+
+#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
+#  error "NLM host name cannot be larger than NLM's maximum string length!"
+#endif
+
 /*
  * Buffer requirements for NLM
  */
 #define NLM4_void_sz		0
 #define NLM4_cookie_sz		1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM4_caller_sz		1+XDR_QUADLEN(NLM_MAXSTRLEN)
-#define NLM4_netobj_sz		1+XDR_QUADLEN(XDR_MAX_NETOBJ)
-/* #define NLM4_owner_sz		1+XDR_QUADLEN(NLM4_MAXOWNER) */
+#define NLM4_caller_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
+#define NLM4_owner_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
 #define NLM4_fhandle_sz		1+XDR_QUADLEN(NFS3_FHSIZE)
-#define NLM4_lock_sz		5+NLM4_caller_sz+NLM4_netobj_sz+NLM4_fhandle_sz
-#define NLM4_holder_sz		6+NLM4_netobj_sz
+#define NLM4_lock_sz		5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
+#define NLM4_holder_sz		6+NLM4_owner_sz
 
 #define NLM4_testargs_sz	NLM4_cookie_sz+1+NLM4_lock_sz
 #define NLM4_lockargs_sz	NLM4_cookie_sz+4+NLM4_lock_sz
@@ -537,10 +544,6 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 #define NLM4_res_sz		NLM4_cookie_sz+1
 #define NLM4_norep_sz		0
 
-#ifndef MAX
-# define MAX(a,b)		(((a) > (b))? (a) : (b))
-#endif
-
 /*
  * For NLM, a void procedure really returns nothing
  */
@@ -551,7 +554,8 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	.p_proc      = NLMPROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nlm4clt_encode_##argtype,		\
 	.p_decode    = (kxdrproc_t) nlm4clt_decode_##restype,		\
-	.p_bufsiz    = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2,	\
+	.p_arglen    = NLM4_##argtype##_sz,				\
+	.p_replen    = NLM4_##restype##_sz,				\
 	.p_statidx   = NLMPROC_##proc,					\
 	.p_name      = #proc,						\
 	}
diff --git a/fs/locks.c b/fs/locks.c
index 52a81005dab..671a034dc99 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -203,8 +203,7 @@ static void init_once(void *foo, struct kmem_cache *cache, unsigned long flags)
 {
 	struct file_lock *lock = (struct file_lock *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) !=
-					SLAB_CTOR_CONSTRUCTOR)
+	if (!(flags & SLAB_CTOR_CONSTRUCTOR))
 		return;
 
 	locks_init_lock(lock);
@@ -666,11 +665,11 @@ static int locks_block_on_timeout(struct file_lock *blocker, struct file_lock *w
 }
 
 int
-posix_test_lock(struct file *filp, struct file_lock *fl,
-		struct file_lock *conflock)
+posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
 
+	fl->fl_type = F_UNLCK;
 	lock_kernel();
 	for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
@@ -679,7 +678,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl,
 			break;
 	}
 	if (cfl) {
-		__locks_copy_lock(conflock, cfl);
+		__locks_copy_lock(fl, cfl);
 		unlock_kernel();
 		return 1;
 	}
@@ -801,7 +800,7 @@ out:
 	return error;
 }
 
-static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
+static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
 {
 	struct file_lock *fl;
 	struct file_lock *new_fl = NULL;
@@ -1007,6 +1006,7 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
  * posix_lock_file - Apply a POSIX-style lock to a file
  * @filp: The file to apply the lock to
  * @fl: The lock to be applied
+ * @conflock: Place to return a copy of the conflicting lock, if found.
  *
  * Add a POSIX style lock to a file.
  * We merge adjacent & overlapping locks whenever possible.
@@ -1016,26 +1016,12 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
  * whether or not a lock was successfully freed by testing the return
  * value for -ENOENT.
  */
-int posix_lock_file(struct file *filp, struct file_lock *fl)
-{
-	return __posix_lock_file_conf(filp->f_path.dentry->d_inode, fl, NULL);
-}
-EXPORT_SYMBOL(posix_lock_file);
-
-/**
- * posix_lock_file_conf - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
- * @fl: The lock to be applied
- * @conflock: Place to return a copy of the conflicting lock, if found.
- *
- * Except for the conflock parameter, acts just like posix_lock_file.
- */
-int posix_lock_file_conf(struct file *filp, struct file_lock *fl,
+int posix_lock_file(struct file *filp, struct file_lock *fl,
 			struct file_lock *conflock)
 {
-	return __posix_lock_file_conf(filp->f_path.dentry->d_inode, fl, conflock);
+	return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock);
 }
-EXPORT_SYMBOL(posix_lock_file_conf);
+EXPORT_SYMBOL(posix_lock_file);
 
 /**
  * posix_lock_file_wait - Apply a POSIX-style lock to a file
@@ -1051,7 +1037,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	int error;
 	might_sleep ();
 	for (;;) {
-		error = posix_lock_file(filp, fl);
+		error = posix_lock_file(filp, fl, NULL);
 		if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP))
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1123,7 +1109,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 	fl.fl_end = offset + count - 1;
 
 	for (;;) {
-		error = __posix_lock_file_conf(inode, &fl, NULL);
+		error = __posix_lock_file(inode, &fl, NULL);
 		if (error != -EAGAIN)
 			break;
 		if (!(fl.fl_flags & FL_SLEEP))
@@ -1611,12 +1597,62 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
 	return error;
 }
 
+/**
+ * vfs_test_lock - test file byte range lock
+ * @filp: The file to test lock for
+ * @fl: The lock to test
+ * @conf: Place to return a copy of the conflicting lock, if found
+ *
+ * Returns -ERRNO on failure.  Indicates presence of conflicting lock by
+ * setting conf->fl_type to something other than F_UNLCK.
+ */
+int vfs_test_lock(struct file *filp, struct file_lock *fl)
+{
+	if (filp->f_op && filp->f_op->lock)
+		return filp->f_op->lock(filp, F_GETLK, fl);
+	posix_test_lock(filp, fl);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfs_test_lock);
+
+static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
+{
+	flock->l_pid = fl->fl_pid;
+#if BITS_PER_LONG == 32
+	/*
+	 * Make sure we can represent the posix lock via
+	 * legacy 32bit flock.
+	 */
+	if (fl->fl_start > OFFT_OFFSET_MAX)
+		return -EOVERFLOW;
+	if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX)
+		return -EOVERFLOW;
+#endif
+	flock->l_start = fl->fl_start;
+	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
+		fl->fl_end - fl->fl_start + 1;
+	flock->l_whence = 0;
+	return 0;
+}
+
+#if BITS_PER_LONG == 32
+static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
+{
+	flock->l_pid = fl->fl_pid;
+	flock->l_start = fl->fl_start;
+	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
+		fl->fl_end - fl->fl_start + 1;
+	flock->l_whence = 0;
+	flock->l_type = fl->fl_type;
+}
+#endif
+
 /* Report the first existing lock that would conflict with l.
  * This implements the F_GETLK command of fcntl().
  */
 int fcntl_getlk(struct file *filp, struct flock __user *l)
 {
-	struct file_lock *fl, cfl, file_lock;
+	struct file_lock file_lock;
 	struct flock flock;
 	int error;
 
@@ -1631,38 +1667,15 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock) {
-		error = filp->f_op->lock(filp, F_GETLK, &file_lock);
-		if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private)
-			file_lock.fl_ops->fl_release_private(&file_lock);
-		if (error < 0)
-			goto out;
-		else
-		  fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
-	} else {
-		fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL);
-	}
+	error = vfs_test_lock(filp, &file_lock);
+	if (error)
+		goto out;
  
-	flock.l_type = F_UNLCK;
-	if (fl != NULL) {
-		flock.l_pid = fl->fl_pid;
-#if BITS_PER_LONG == 32
-		/*
-		 * Make sure we can represent the posix lock via
-		 * legacy 32bit flock.
-		 */
-		error = -EOVERFLOW;
-		if (fl->fl_start > OFFT_OFFSET_MAX)
-			goto out;
-		if ((fl->fl_end != OFFSET_MAX)
-		    && (fl->fl_end > OFFT_OFFSET_MAX))
+	flock.l_type = file_lock.fl_type;
+	if (file_lock.fl_type != F_UNLCK) {
+		error = posix_lock_to_flock(&flock, &file_lock);
+		if (error)
 			goto out;
-#endif
-		flock.l_start = fl->fl_start;
-		flock.l_len = fl->fl_end == OFFSET_MAX ? 0 :
-			fl->fl_end - fl->fl_start + 1;
-		flock.l_whence = 0;
-		flock.l_type = fl->fl_type;
 	}
 	error = -EFAULT;
 	if (!copy_to_user(l, &flock, sizeof(flock)))
@@ -1671,6 +1684,48 @@ out:
 	return error;
 }
 
+/**
+ * vfs_lock_file - file byte range lock
+ * @filp: The file to apply the lock to
+ * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
+ * @fl: The lock to be applied
+ * @conf: Place to return a copy of the conflicting lock, if found.
+ *
+ * A caller that doesn't care about the conflicting lock may pass NULL
+ * as the final argument.
+ *
+ * If the filesystem defines a private ->lock() method, then @conf will
+ * be left unchanged; so a caller that cares should initialize it to
+ * some acceptable default.
+ *
+ * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
+ * locks, the ->lock() interface may return asynchronously, before the lock has
+ * been granted or denied by the underlying filesystem, if (and only if)
+ * fl_grant is set. Callers expecting ->lock() to return asynchronously
+ * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
+ * the request is for a blocking lock. When ->lock() does return asynchronously,
+ * it must return -EINPROGRESS, and call ->fl_grant() when the lock
+ * request completes.
+ * If the request is for non-blocking lock the file system should return
+ * -EINPROGRESS then try to get the lock and call the callback routine with
+ * the result. If the request timed out the callback routine will return a
+ * nonzero return code and the file system should release the lock. The file
+ * system is also responsible to keep a corresponding posix lock when it
+ * grants a lock so the VFS can find out which locks are locally held and do
+ * the correct lock cleanup when required.
+ * The underlying filesystem must not drop the kernel lock or call
+ * ->fl_grant() before returning to the caller with a -EINPROGRESS
+ * return code.
+ */
+int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
+{
+	if (filp->f_op && filp->f_op->lock)
+		return filp->f_op->lock(filp, cmd, fl);
+	else
+		return posix_lock_file(filp, fl, conf);
+}
+EXPORT_SYMBOL_GPL(vfs_lock_file);
+
 /* Apply the lock described by l to an open file descriptor.
  * This implements both the F_SETLK and F_SETLKW commands of fcntl().
  */
@@ -1733,21 +1788,17 @@ again:
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock != NULL)
-		error = filp->f_op->lock(filp, cmd, file_lock);
-	else {
-		for (;;) {
-			error = posix_lock_file(filp, file_lock);
-			if ((error != -EAGAIN) || (cmd == F_SETLK))
-				break;
-			error = wait_event_interruptible(file_lock->fl_wait,
-					!file_lock->fl_next);
-			if (!error)
-				continue;
-
-			locks_delete_block(file_lock);
+	for (;;) {
+		error = vfs_lock_file(filp, cmd, file_lock, NULL);
+		if (error != -EAGAIN || cmd == F_SETLK)
 			break;
-		}
+		error = wait_event_interruptible(file_lock->fl_wait,
+				!file_lock->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(file_lock);
+		break;
 	}
 
 	/*
@@ -1770,7 +1821,7 @@ out:
  */
 int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
 {
-	struct file_lock *fl, cfl, file_lock;
+	struct file_lock file_lock;
 	struct flock64 flock;
 	int error;
 
@@ -1785,27 +1836,14 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock) {
-		error = filp->f_op->lock(filp, F_GETLK, &file_lock);
-		if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private)
-			file_lock.fl_ops->fl_release_private(&file_lock);
-		if (error < 0)
-			goto out;
-		else
-		  fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
-	} else {
-		fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL);
-	}
- 
-	flock.l_type = F_UNLCK;
-	if (fl != NULL) {
-		flock.l_pid = fl->fl_pid;
-		flock.l_start = fl->fl_start;
-		flock.l_len = fl->fl_end == OFFSET_MAX ? 0 :
-			fl->fl_end - fl->fl_start + 1;
-		flock.l_whence = 0;
-		flock.l_type = fl->fl_type;
-	}
+	error = vfs_test_lock(filp, &file_lock);
+	if (error)
+		goto out;
+
+	flock.l_type = file_lock.fl_type;
+	if (file_lock.fl_type != F_UNLCK)
+		posix_lock_to_flock64(&flock, &file_lock);
+
 	error = -EFAULT;
 	if (!copy_to_user(l, &flock, sizeof(flock)))
 		error = 0;
@@ -1876,21 +1914,17 @@ again:
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock != NULL)
-		error = filp->f_op->lock(filp, cmd, file_lock);
-	else {
-		for (;;) {
-			error = posix_lock_file(filp, file_lock);
-			if ((error != -EAGAIN) || (cmd == F_SETLK64))
-				break;
-			error = wait_event_interruptible(file_lock->fl_wait,
-					!file_lock->fl_next);
-			if (!error)
-				continue;
-
-			locks_delete_block(file_lock);
+	for (;;) {
+		error = vfs_lock_file(filp, cmd, file_lock, NULL);
+		if (error != -EAGAIN || cmd == F_SETLK64)
 			break;
-		}
+		error = wait_event_interruptible(file_lock->fl_wait,
+				!file_lock->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(file_lock);
+		break;
 	}
 
 	/*
@@ -1935,10 +1969,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 	lock.fl_ops = NULL;
 	lock.fl_lmops = NULL;
 
-	if (filp->f_op && filp->f_op->lock != NULL)
-		filp->f_op->lock(filp, F_SETLK, &lock);
-	else
-		posix_lock_file(filp, &lock);
+	vfs_lock_file(filp, F_SETLK, &lock, NULL);
 
 	if (lock.fl_ops && lock.fl_ops->fl_release_private)
 		lock.fl_ops->fl_release_private(&lock);
@@ -2015,6 +2046,22 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 
 EXPORT_SYMBOL(posix_unblock_lock);
 
+/**
+ * vfs_cancel_lock - file byte range unblock lock
+ * @filp: The file to apply the unblock to
+ * @fl: The lock to be unblocked
+ *
+ * Used by lock managers to cancel blocked requests
+ */
+int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
+{
+	if (filp->f_op && filp->f_op->lock)
+		return filp->f_op->lock(filp, F_CANCELLK, fl);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(vfs_cancel_lock);
+
 static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx)
 {
 	struct inode *inode = NULL;
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index cb4cb571fdd..e207cbe7095 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -65,7 +65,6 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
 		if (!PageUptodate(page))
 			goto fail;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 92e383af370..2f4d43a2a31 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -73,8 +73,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/namei.c b/fs/namei.c
index 880052cadbc..94b2f60aec2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2671,19 +2671,9 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 	struct address_space *mapping = dentry->d_inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-		goto sync_fail;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page))
-		goto async_fail;
+		return (char*)page;
 	*ppage = page;
 	return kmap(page);
-
-async_fail:
-	page_cache_release(page);
-	return ERR_PTR(-EIO);
-
-sync_fail:
-	return (char*)page;
 }
 
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 7285c94956c..c29f00ad495 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -60,8 +60,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		mutex_init(&ei->open_mutex);
 		inode_init_once(&ei->vfs_inode);
 	}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2190e6c2792..5bd03b97002 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -618,7 +618,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
 	if (clp->cl_nfsversion == 3) {
 		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
 			server->namelen = NFS3_MAXNAMLEN;
-		server->caps |= NFS_CAP_READDIRPLUS;
+		if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+			server->caps |= NFS_CAP_READDIRPLUS;
 	} else {
 		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
 			server->namelen = NFS2_MAXNAMLEN;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cd3469720cb..625d8e5fb39 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -154,6 +154,8 @@ typedef struct {
 	decode_dirent_t	decode;
 	int		plus;
 	int		error;
+	unsigned long	timestamp;
+	int		timestamp_valid;
 } nfs_readdir_descriptor_t;
 
 /* Now we cache directories properly, by stuffing the dirent
@@ -195,6 +197,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 		}
 		goto error;
 	}
+	desc->timestamp = timestamp;
+	desc->timestamp_valid = 1;
 	SetPageUptodate(page);
 	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
@@ -225,6 +229,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 	desc->ptr = p;
+	if (desc->timestamp_valid)
+		desc->entry->fattr->time_start = desc->timestamp;
+	else
+		desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
 	return 0;
 }
 
@@ -316,14 +324,16 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
 			__FUNCTION__, desc->page_index,
 			(long long) *desc->dir_cookie);
 
+	/* If we find the page in the page_cache, we cannot be sure
+	 * how fresh the data is, so we will ignore readdir_plus attributes.
+	 */
+	desc->timestamp_valid = 0;
 	page = read_cache_page(inode->i_mapping, desc->page_index,
 			       (filler_t *)nfs_readdir_filler, desc);
 	if (IS_ERR(page)) {
 		status = PTR_ERR(page);
 		goto out;
 	}
-	if (!PageUptodate(page))
-		goto read_error;
 
 	/* NOTE: Someone else may have changed the READDIRPLUS flag */
 	desc->page = page;
@@ -337,9 +347,6 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
  out:
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status);
 	return status;
- read_error:
-	page_cache_release(page);
-	return -EIO;
 }
 
 /*
@@ -468,6 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct rpc_cred	*cred = nfs_file_cred(file);
 	struct page	*page = NULL;
 	int		status;
+	unsigned long	timestamp;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)*desc->dir_cookie);
@@ -477,6 +485,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		status = -ENOMEM;
 		goto out;
 	}
+	timestamp = jiffies;
 	desc->error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie,
 						page,
 						NFS_SERVER(inode)->dtsize,
@@ -487,6 +496,8 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
+		desc->timestamp = timestamp;
+		desc->timestamp_valid = 1;
 		if ((status = dir_decode(desc)) == 0)
 			desc->entry->prev_cookie = *desc->dir_cookie;
 	} else
@@ -849,6 +860,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
 	nfs_inode_return_delegation(inode);
+	if (S_ISDIR(inode->i_mode))
+		/* drop any readdir cache as it could easily be old */
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
 		lock_kernel();
 		drop_nlink(inode);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2877744cb60..889de60f8a8 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -54,6 +54,7 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 
+#include "internal.h"
 #include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
@@ -271,7 +272,7 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 		bytes = min(rsize,count);
 
 		result = -ENOMEM;
-		data = nfs_readdata_alloc(pgbase + bytes);
+		data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes));
 		if (unlikely(!data))
 			break;
 
@@ -602,7 +603,7 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 		bytes = min(wsize,count);
 
 		result = -ENOMEM;
-		data = nfs_writedata_alloc(pgbase + bytes);
+		data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes));
 		if (unlikely(!data))
 			break;
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8e66b5a2d49..5eaee6dd040 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -391,17 +391,12 @@ out_swapfile:
 
 static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct file_lock cfl;
 	struct inode *inode = filp->f_mapping->host;
 	int status = 0;
 
 	lock_kernel();
 	/* Try local locking first */
-	if (posix_test_lock(filp, fl, &cfl)) {
-		fl->fl_start = cfl.fl_start;
-		fl->fl_end = cfl.fl_end;
-		fl->fl_type = cfl.fl_type;
-		fl->fl_pid = cfl.fl_pid;
+	if (posix_test_lock(filp, fl)) {
 		goto out;
 	}
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 44aa9b72657..1e9a915d1fe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1167,8 +1167,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		inode_init_once(&nfsi->vfs_inode);
 		spin_lock_init(&nfsi->req_lock);
 		INIT_LIST_HEAD(&nfsi->dirty);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 6610f2b0207..ad2b40db1e6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -231,3 +231,15 @@ unsigned int nfs_page_length(struct page *page)
 	}
 	return 0;
 }
+
+/*
+ * Determine the number of pages in an array of length 'len' and
+ * with a base offset of 'base'
+ */
+static inline
+unsigned int nfs_page_array_len(unsigned int base, size_t len)
+{
+	return ((unsigned long)len + (unsigned long)base +
+		PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index f75fe72b416..ca5a266a314 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -133,13 +133,15 @@ xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
 
 #define MNT_dirpath_sz		(1 + 256)
 #define MNT_fhstatus_sz		(1 + 8)
+#define MNT_fhstatus3_sz	(1 + 16)
 
 static struct rpc_procinfo	mnt_procedures[] = {
 [MNTPROC_MNT] = {
 	  .p_proc		= MNTPROC_MNT,
 	  .p_encode		= (kxdrproc_t) xdr_encode_dirpath,	
 	  .p_decode		= (kxdrproc_t) xdr_decode_fhstatus,
-	  .p_bufsiz		= MNT_dirpath_sz << 2,
+	  .p_arglen		= MNT_dirpath_sz,
+	  .p_replen		= MNT_fhstatus_sz,
 	  .p_statidx		= MNTPROC_MNT,
 	  .p_name		= "MOUNT",
 	},
@@ -150,7 +152,8 @@ static struct rpc_procinfo mnt3_procedures[] = {
 	  .p_proc		= MOUNTPROC3_MNT,
 	  .p_encode		= (kxdrproc_t) xdr_encode_dirpath,
 	  .p_decode		= (kxdrproc_t) xdr_decode_fhstatus3,
-	  .p_bufsiz		= MNT_dirpath_sz << 2,
+	  .p_arglen		= MNT_dirpath_sz,
+	  .p_replen		= MNT_fhstatus3_sz,
 	  .p_statidx		= MOUNTPROC3_MNT,
 	  .p_name		= "MOUNT",
 	},
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 3be4e72a022..abd9f8b4894 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -687,16 +687,13 @@ nfs_stat_to_errno(int stat)
 	return nfs_errtbl[i].errno;
 }
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, argtype, restype, timer)				\
 [NFSPROC_##proc] = {							\
 	.p_proc	    =  NFSPROC_##proc,					\
 	.p_encode   =  (kxdrproc_t) nfs_xdr_##argtype,			\
 	.p_decode   =  (kxdrproc_t) nfs_xdr_##restype,			\
-	.p_bufsiz   =  MAX(NFS_##argtype##_sz,NFS_##restype##_sz) << 2,	\
+	.p_arglen   =  NFS_##argtype##_sz,				\
+	.p_replen   =  NFS_##restype##_sz,				\
 	.p_timer    =  timer,						\
 	.p_statidx  =  NFSPROC_##proc,					\
 	.p_name     =  #proc,						\
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 0ace092d126..b51df8eb9f0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1102,16 +1102,13 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 }
 #endif  /* CONFIG_NFS_V3_ACL */
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, argtype, restype, timer)				\
 [NFS3PROC_##proc] = {							\
 	.p_proc      = NFS3PROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nfs3_xdr_##argtype,			\
 	.p_decode    = (kxdrproc_t) nfs3_xdr_##restype,			\
-	.p_bufsiz    = MAX(NFS3_##argtype##_sz,NFS3_##restype##_sz) << 2,	\
+	.p_arglen    = NFS3_##argtype##_sz,				\
+	.p_replen    = NFS3_##restype##_sz,				\
 	.p_timer     = timer,						\
 	.p_statidx   = NFS3PROC_##proc,					\
 	.p_name      = #proc,						\
@@ -1153,7 +1150,8 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
 		.p_proc = ACLPROC3_GETACL,
 		.p_encode = (kxdrproc_t) nfs3_xdr_getaclargs,
 		.p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
-		.p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2,
+		.p_arglen = ACL3_getaclargs_sz,
+		.p_replen = ACL3_getaclres_sz,
 		.p_timer = 1,
 		.p_name = "GETACL",
 	},
@@ -1161,7 +1159,8 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
 		.p_proc = ACLPROC3_SETACL,
 		.p_encode = (kxdrproc_t) nfs3_xdr_setaclargs,
 		.p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
-		.p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2,
+		.p_arglen = ACL3_setaclargs_sz,
+		.p_replen = ACL3_setaclres_sz,
 		.p_timer = 0,
 		.p_name = "SETACL",
 	},
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f52cf5c33c6..d6a30e96578 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2647,8 +2647,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 	nfs_inode_return_delegation(inode);
 	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
-	if (ret == 0)
-		nfs4_write_cached_acl(inode, buf, buflen);
+	nfs_zap_caches(inode);
 	return ret;
 }
 
@@ -3018,6 +3017,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 		case -NFS4ERR_DENIED:
 			status = 0;
 	}
+	request->fl_ops->fl_release_private(request);
 out:
 	up_read(&clp->cl_sem);
 	return status;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f02d522fd78..b8c28f2380a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4546,16 +4546,13 @@ nfs4_stat_to_errno(int stat)
 	return stat;
 }
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, argtype, restype)				\
 [NFSPROC4_CLNT_##proc] = {					\
 	.p_proc   = NFSPROC4_COMPOUND,				\
 	.p_encode = (kxdrproc_t) nfs4_xdr_##argtype,		\
 	.p_decode = (kxdrproc_t) nfs4_xdr_##restype,		\
-	.p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,	\
+	.p_arglen = NFS4_##argtype##_sz,			\
+	.p_replen = NFS4_##restype##_sz,			\
 	.p_statidx = NFSPROC4_CLNT_##proc,			\
 	.p_name   = #proc,					\
     }
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 75f819dc025..49d1008ce1d 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -428,7 +428,7 @@ static int __init root_nfs_getport(int program, int version, int proto)
 	printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n",
 		program, version, NIPQUAD(servaddr));
 	set_sockaddr(&sin, servaddr, 0);
-	return rpc_getport_external(&sin, program, version, proto);
+	return rpcb_getport_external(&sin, program, version, proto);
 }
 
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ca4b1d4ff42..388950118f5 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -17,7 +17,8 @@
 #include <linux/nfs_page.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
-#include <linux/writeback.h>
+
+#include "internal.h"
 
 #define NFS_PARANOIA 1
 
@@ -50,9 +51,7 @@ nfs_page_free(struct nfs_page *p)
  * @count: number of bytes to read/write
  *
  * The page must be locked by the caller. This makes sure we never
- * create two different requests for the same page, and avoids
- * a possible deadlock when we reach the hard limit on the number
- * of dirty pages.
+ * create two different requests for the same page.
  * User should ensure it is safe to sleep in this function.
  */
 struct nfs_page *
@@ -63,16 +62,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_page		*req;
 
-	/* Deal with hard limits.  */
 	for (;;) {
 		/* try to allocate the request struct */
 		req = nfs_page_alloc();
 		if (req != NULL)
 			break;
 
-		/* Try to free up at least one request in order to stay
-		 * below the hard limit
-		 */
 		if (signalled() && (server->flags & NFS_MOUNT_INTR))
 			return ERR_PTR(-ERESTARTSYS);
 		yield();
@@ -223,124 +218,151 @@ out:
 }
 
 /**
- * nfs_coalesce_requests - Split coalesced requests out from a list.
- * @head: source list
- * @dst: destination list
- * @nmax: maximum number of requests to coalesce
- *
- * Moves a maximum of 'nmax' elements from one list to another.
- * The elements are checked to ensure that they form a contiguous set
- * of pages, and that the RPC credentials are the same.
+ * nfs_pageio_init - initialise a page io descriptor
+ * @desc: pointer to descriptor
+ * @inode: pointer to inode
+ * @doio: pointer to io function
+ * @bsize: io block size
+ * @io_flags: extra parameters for the io function
  */
-int
-nfs_coalesce_requests(struct list_head *head, struct list_head *dst,
-		      unsigned int nmax)
+void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+		     struct inode *inode,
+		     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+		     size_t bsize,
+		     int io_flags)
 {
-	struct nfs_page		*req = NULL;
-	unsigned int		npages = 0;
-
-	while (!list_empty(head)) {
-		struct nfs_page	*prev = req;
-
-		req = nfs_list_entry(head->next);
-		if (prev) {
-			if (req->wb_context->cred != prev->wb_context->cred)
-				break;
-			if (req->wb_context->lockowner != prev->wb_context->lockowner)
-				break;
-			if (req->wb_context->state != prev->wb_context->state)
-				break;
-			if (req->wb_index != (prev->wb_index + 1))
-				break;
-
-			if (req->wb_pgbase != 0)
-				break;
-		}
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, dst);
-		npages++;
-		if (req->wb_pgbase + req->wb_bytes != PAGE_CACHE_SIZE)
-			break;
-		if (npages >= nmax)
-			break;
-	}
-	return npages;
+	INIT_LIST_HEAD(&desc->pg_list);
+	desc->pg_bytes_written = 0;
+	desc->pg_count = 0;
+	desc->pg_bsize = bsize;
+	desc->pg_base = 0;
+	desc->pg_inode = inode;
+	desc->pg_doio = doio;
+	desc->pg_ioflags = io_flags;
+	desc->pg_error = 0;
 }
 
-#define NFS_SCAN_MAXENTRIES 16
 /**
- * nfs_scan_dirty - Scan the radix tree for dirty requests
- * @mapping: pointer to address space
- * @wbc: writeback_control structure
- * @dst: Destination list
+ * nfs_can_coalesce_requests - test two requests for compatibility
+ * @prev: pointer to nfs_page
+ * @req: pointer to nfs_page
  *
- * Moves elements from one of the inode request lists.
- * If the number of requests is set to 0, the entire address_space
- * starting at index idx_start, is scanned.
- * The requests are *not* checked to ensure that they form a contiguous set.
- * You must be holding the inode's req_lock when calling this function
+ * The nfs_page structures 'prev' and 'req' are compared to ensure that the
+ * page data area they describe is contiguous, and that their RPC
+ * credentials, NFSv4 open state, and lockowners are the same.
+ *
+ * Return 'true' if this is the case, else return 'false'.
  */
-long nfs_scan_dirty(struct address_space *mapping,
-			struct writeback_control *wbc,
-			struct list_head *dst)
+static int nfs_can_coalesce_requests(struct nfs_page *prev,
+				     struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(mapping->host);
-	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
-	struct nfs_page *req;
-	pgoff_t idx_start, idx_end;
-	long res = 0;
-	int found, i;
-
-	if (nfsi->ndirty == 0)
+	if (req->wb_context->cred != prev->wb_context->cred)
 		return 0;
-	if (wbc->range_cyclic) {
-		idx_start = 0;
-		idx_end = ULONG_MAX;
-	} else if (wbc->range_end == 0) {
-		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		idx_end = ULONG_MAX;
-	} else {
-		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
-	}
+	if (req->wb_context->lockowner != prev->wb_context->lockowner)
+		return 0;
+	if (req->wb_context->state != prev->wb_context->state)
+		return 0;
+	if (req->wb_index != (prev->wb_index + 1))
+		return 0;
+	if (req->wb_pgbase != 0)
+		return 0;
+	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+		return 0;
+	return 1;
+}
 
-	for (;;) {
-		unsigned int toscan = NFS_SCAN_MAXENTRIES;
+/**
+ * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'.
+ */
+static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+				     struct nfs_page *req)
+{
+	size_t newlen = req->wb_bytes;
 
-		found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
-				(void **)&pgvec[0], idx_start, toscan,
-				NFS_PAGE_TAG_DIRTY);
+	if (desc->pg_count != 0) {
+		struct nfs_page *prev;
 
-		/* Did we make progress? */
-		if (found <= 0)
-			break;
+		/*
+		 * FIXME: ideally we should be able to coalesce all requests
+		 * that are not block boundary aligned, but currently this
+		 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
+		 * since nfs_flush_multi and nfs_pagein_multi assume you
+		 * can have only one struct nfs_page.
+		 */
+		if (desc->pg_bsize < PAGE_SIZE)
+			return 0;
+		newlen += desc->pg_count;
+		if (newlen > desc->pg_bsize)
+			return 0;
+		prev = nfs_list_entry(desc->pg_list.prev);
+		if (!nfs_can_coalesce_requests(prev, req))
+			return 0;
+	} else
+		desc->pg_base = req->wb_pgbase;
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &desc->pg_list);
+	desc->pg_count = newlen;
+	return 1;
+}
 
-		for (i = 0; i < found; i++) {
-			req = pgvec[i];
-			if (!wbc->range_cyclic && req->wb_index > idx_end)
-				goto out;
+/*
+ * Helper for nfs_pageio_add_request and nfs_pageio_complete
+ */
+static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
+{
+	if (!list_empty(&desc->pg_list)) {
+		int error = desc->pg_doio(desc->pg_inode,
+					  &desc->pg_list,
+					  nfs_page_array_len(desc->pg_base,
+							     desc->pg_count),
+					  desc->pg_count,
+					  desc->pg_ioflags);
+		if (error < 0)
+			desc->pg_error = error;
+		else
+			desc->pg_bytes_written += desc->pg_count;
+	}
+	if (list_empty(&desc->pg_list)) {
+		desc->pg_count = 0;
+		desc->pg_base = 0;
+	}
+}
 
-			/* Try to lock request and mark it for writeback */
-			if (!nfs_set_page_writeback_locked(req))
-				goto next;
-			radix_tree_tag_clear(&nfsi->nfs_page_tree,
-					req->wb_index, NFS_PAGE_TAG_DIRTY);
-			nfsi->ndirty--;
-			nfs_list_remove_request(req);
-			nfs_list_add_request(req, dst);
-			res++;
-			if (res == LONG_MAX)
-				goto out;
-next:
-			idx_start = req->wb_index + 1;
-		}
+/**
+ * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'.
+ */
+int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+			   struct nfs_page *req)
+{
+	while (!nfs_pageio_do_add_request(desc, req)) {
+		nfs_pageio_doio(desc);
+		if (desc->pg_error < 0)
+			return 0;
 	}
-out:
-	WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
-	return res;
+	return 1;
 }
 
 /**
+ * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ */
+void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
+{
+	nfs_pageio_doio(desc);
+}
+
+#define NFS_SCAN_MAXENTRIES 16
+/**
  * nfs_scan_list - Scan a list for matching requests
  * @nfsi: NFS inode
  * @head: One of the NFS inode request lists
@@ -355,12 +377,12 @@ out:
  * You must be holding the inode's req_lock when calling this function
  */
 int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
-		struct list_head *dst, unsigned long idx_start,
+		struct list_head *dst, pgoff_t idx_start,
 		unsigned int npages)
 {
 	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
 	struct nfs_page *req;
-	unsigned long idx_end;
+	pgoff_t idx_end;
 	int found, i;
 	int res;
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6ab4d5a9edf..9a55807b2a7 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -27,7 +27,8 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
-static int nfs_pagein_one(struct list_head *, struct inode *);
+static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
+static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
 static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
@@ -36,9 +37,8 @@ static mempool_t *nfs_rdata_mempool;
 
 #define MIN_POOL_READ	(32)
 
-struct nfs_read_data *nfs_readdata_alloc(size_t len)
+struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
 
 	if (p) {
@@ -133,7 +133,10 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
 
 	nfs_list_add_request(new, &one_request);
-	nfs_pagein_one(&one_request, inode);
+	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
+		nfs_pagein_multi(inode, &one_request, 1, len, 0);
+	else
+		nfs_pagein_one(inode, &one_request, 1, len, 0);
 	return 0;
 }
 
@@ -230,7 +233,7 @@ static void nfs_execute_read(struct nfs_read_data *data)
  * won't see the new data until our attribute cache is updated.  This is more
  * or less conventional NFS client behavior.
  */
-static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
+static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
 {
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
@@ -242,11 +245,11 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
 
 	nfs_list_remove_request(req);
 
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		size_t len = min(nbytes,rsize);
 
-		data = nfs_readdata_alloc(len);
+		data = nfs_readdata_alloc(1);
 		if (!data)
 			goto out_bad;
 		INIT_LIST_HEAD(&data->pages);
@@ -258,23 +261,19 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
 
 	ClearPageError(page);
 	offset = 0;
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		data = list_entry(list.next, struct nfs_read_data, pages);
 		list_del_init(&data->pages);
 
 		data->pagevec[0] = page;
 
-		if (nbytes > rsize) {
-			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
-					rsize, offset);
-			offset += rsize;
-			nbytes -= rsize;
-		} else {
-			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
-					nbytes, offset);
-			nbytes = 0;
-		}
+		if (nbytes < rsize)
+			rsize = nbytes;
+		nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
+				  rsize, offset);
+		offset += rsize;
+		nbytes -= rsize;
 		nfs_execute_read(data);
 	} while (nbytes != 0);
 
@@ -291,30 +290,24 @@ out_bad:
 	return -ENOMEM;
 }
 
-static int nfs_pagein_one(struct list_head *head, struct inode *inode)
+static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_read_data	*data;
-	unsigned int		count;
 
-	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
-		return nfs_pagein_multi(head, inode);
-
-	data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
+	data = nfs_readdata_alloc(npages);
 	if (!data)
 		goto out_bad;
 
 	INIT_LIST_HEAD(&data->pages);
 	pages = data->pagevec;
-	count = 0;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
 		ClearPageError(req->wb_page);
 		*pages++ = req->wb_page;
-		count += req->wb_bytes;
 	}
 	req = nfs_list_entry(data->pages.next);
 
@@ -327,28 +320,6 @@ out_bad:
 	return -ENOMEM;
 }
 
-static int
-nfs_pagein_list(struct list_head *head, int rpages)
-{
-	LIST_HEAD(one_request);
-	struct nfs_page		*req;
-	int			error = 0;
-	unsigned int		pages = 0;
-
-	while (!list_empty(head)) {
-		pages += nfs_coalesce_requests(head, &one_request, rpages);
-		req = nfs_list_entry(one_request.next);
-		error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
-		if (error < 0)
-			break;
-	}
-	if (error >= 0)
-		return pages;
-
-	nfs_async_read_error(head);
-	return error;
-}
-
 /*
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
@@ -538,7 +509,7 @@ out_error:
 }
 
 struct nfs_readdesc {
-	struct list_head *head;
+	struct nfs_pageio_descriptor *pgio;
 	struct nfs_open_context *ctx;
 };
 
@@ -562,19 +533,21 @@ readpage_async_filler(void *data, struct page *page)
 	}
 	if (len < PAGE_CACHE_SIZE)
 		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
-	nfs_list_add_request(new, desc->head);
+	nfs_pageio_add_request(desc->pgio, new);
 	return 0;
 }
 
 int nfs_readpages(struct file *filp, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	LIST_HEAD(head);
+	struct nfs_pageio_descriptor pgio;
 	struct nfs_readdesc desc = {
-		.head		= &head,
+		.pgio = &pgio,
 	};
 	struct inode *inode = mapping->host;
 	struct nfs_server *server = NFS_SERVER(inode);
+	size_t rsize = server->rsize;
+	unsigned long npages;
 	int ret = -ESTALE;
 
 	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
@@ -593,13 +566,16 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	} else
 		desc.ctx = get_nfs_open_context((struct nfs_open_context *)
 				filp->private_data);
+	if (rsize < PAGE_CACHE_SIZE)
+		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
+	else
+		nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);
+
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-	if (!list_empty(&head)) {
-		int err = nfs_pagein_list(&head, server->rpages);
-		if (!ret)
-			nfs_add_stats(inode, NFSIOS_READPAGES, err);
-			ret = err;
-	}
+
+	nfs_pageio_complete(&pgio);
+	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 	put_nfs_open_context(desc.ctx);
 out:
 	return ret;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1eae44b9a1..ca20d3cc260 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -204,9 +204,9 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	lock_kernel();
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
-	buf->f_type = NFS_SUPER_MAGIC;
 	if (error < 0)
 		goto out_err;
+	buf->f_type = NFS_SUPER_MAGIC;
 
 	/*
 	 * Current versions of glibc do not correctly handle the
@@ -233,15 +233,14 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_ffree = res.afiles;
 
 	buf->f_namelen = server->namelen;
- out:
+
 	unlock_kernel();
 	return 0;
 
  out_err:
 	dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
-	buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
-	goto out;
-
+	unlock_kernel();
+	return error;
 }
 
 /*
@@ -291,6 +290,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		{ NFS_MOUNT_NOAC, ",noac", "" },
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
 		{ NFS_MOUNT_NOACL, ",noacl", "" },
+		{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
 		{ 0, NULL, NULL }
 	};
 	const struct proc_nfs_info *nfs_infop;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index f4a0548b9ce..bc2821331c2 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -61,15 +61,9 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		err = page;
 		goto read_failed;
 	}
-	if (!PageUptodate(page)) {
-		err = ERR_PTR(-EIO);
-		goto getlink_read_error;
-	}
 	nd_set_link(nd, kmap(page));
 	return page;
 
-getlink_read_error:
-	page_cache_release(page);
 read_failed:
 	nd_set_link(nd, err);
 	return NULL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 79755894174..5d44b8bd107 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -38,7 +38,8 @@
 static struct nfs_page * nfs_update_request(struct nfs_open_context*,
 					    struct page *,
 					    unsigned int, unsigned int);
-static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
+static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
+				  struct inode *inode, int ioflags);
 static const struct rpc_call_ops nfs_write_partial_ops;
 static const struct rpc_call_ops nfs_write_full_ops;
 static const struct rpc_call_ops nfs_commit_ops;
@@ -71,9 +72,8 @@ void nfs_commit_free(struct nfs_write_data *wdata)
 	call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free);
 }
 
-struct nfs_write_data *nfs_writedata_alloc(size_t len)
+struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
-	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
 
 	if (p) {
@@ -139,7 +139,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
 {
 	struct inode *inode = page->mapping->host;
 	loff_t end, i_size = i_size_read(inode);
-	unsigned long end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
 
 	if (i_size > 0 && page->index < end_index)
 		return;
@@ -201,7 +201,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 static int wb_priority(struct writeback_control *wbc)
 {
 	if (wbc->for_reclaim)
-		return FLUSH_HIGHPRI;
+		return FLUSH_HIGHPRI | FLUSH_STABLE;
 	if (wbc->for_kupdate)
 		return FLUSH_LOWPRI;
 	return 0;
@@ -251,7 +251,8 @@ static void nfs_end_page_writeback(struct page *page)
  * was not tagged.
  * May also return an error if the user signalled nfs_wait_on_request().
  */
-static int nfs_page_mark_flush(struct page *page)
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+				struct page *page)
 {
 	struct nfs_page *req;
 	struct nfs_inode *nfsi = NFS_I(page->mapping->host);
@@ -273,6 +274,8 @@ static int nfs_page_mark_flush(struct page *page)
 		 *	 request as dirty (in which case we don't care).
 		 */
 		spin_unlock(req_lock);
+		/* Prevent deadlock! */
+		nfs_pageio_complete(pgio);
 		ret = nfs_wait_on_request(req);
 		nfs_release_request(req);
 		if (ret != 0)
@@ -283,21 +286,18 @@ static int nfs_page_mark_flush(struct page *page)
 		/* This request is marked for commit */
 		spin_unlock(req_lock);
 		nfs_unlock_request(req);
+		nfs_pageio_complete(pgio);
 		return 1;
 	}
-	if (nfs_set_page_writeback(page) == 0) {
-		nfs_list_remove_request(req);
-		/* add the request to the inode's dirty list. */
-		radix_tree_tag_set(&nfsi->nfs_page_tree,
-				req->wb_index, NFS_PAGE_TAG_DIRTY);
-		nfs_list_add_request(req, &nfsi->dirty);
-		nfsi->ndirty++;
-		spin_unlock(req_lock);
-		__mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES);
-	} else
+	if (nfs_set_page_writeback(page) != 0) {
 		spin_unlock(req_lock);
+		BUG();
+	}
+	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
+			NFS_PAGE_TAG_WRITEBACK);
 	ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
-	nfs_unlock_request(req);
+	spin_unlock(req_lock);
+	nfs_pageio_add_request(pgio, req);
 	return ret;
 }
 
@@ -306,6 +306,7 @@ static int nfs_page_mark_flush(struct page *page)
  */
 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
 {
+	struct nfs_pageio_descriptor mypgio, *pgio;
 	struct nfs_open_context *ctx;
 	struct inode *inode = page->mapping->host;
 	unsigned offset;
@@ -314,7 +315,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
-	err = nfs_page_mark_flush(page);
+	if (wbc->for_writepages)
+		pgio = wbc->fs_private;
+	else {
+		nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
+		pgio = &mypgio;
+	}
+
+	err = nfs_page_async_flush(pgio, page);
 	if (err <= 0)
 		goto out;
 	err = 0;
@@ -331,12 +339,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	put_nfs_open_context(ctx);
 	if (err != 0)
 		goto out;
-	err = nfs_page_mark_flush(page);
+	err = nfs_page_async_flush(pgio, page);
 	if (err > 0)
 		err = 0;
 out:
 	if (!wbc->for_writepages)
-		nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc));
+		nfs_pageio_complete(pgio);
 	return err;
 }
 
@@ -352,20 +360,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
+	struct nfs_pageio_descriptor pgio;
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
+	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
+	wbc->fs_private = &pgio;
 	err = generic_writepages(mapping, wbc);
+	nfs_pageio_complete(&pgio);
 	if (err)
 		return err;
-	err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
-	if (err < 0)
-		goto out;
-	nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
-	err = 0;
-out:
-	return err;
+	if (pgio.pg_error)
+		return pgio.pg_error;
+	return 0;
 }
 
 /*
@@ -503,11 +511,11 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
  *
  * Interruptible by signals only if mounted with intr flag.
  */
-static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_start, unsigned int npages)
+static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *req;
-	unsigned long		idx_end, next;
+	pgoff_t idx_end, next;
 	unsigned int		res = 0;
 	int			error;
 
@@ -536,18 +544,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st
 	return res;
 }
 
-static void nfs_cancel_dirty_list(struct list_head *head)
-{
-	struct nfs_page *req;
-	while(!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_end_page_writeback(req->wb_page);
-		nfs_inode_remove_request(req);
-		nfs_clear_page_writeback(req);
-	}
-}
-
 static void nfs_cancel_commit_list(struct list_head *head)
 {
 	struct nfs_page *req;
@@ -574,7 +570,7 @@ static void nfs_cancel_commit_list(struct list_head *head)
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int res = 0;
@@ -588,40 +584,12 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
 	return res;
 }
 #else
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	return 0;
 }
 #endif
 
-static int nfs_wait_on_write_congestion(struct address_space *mapping)
-{
-	struct inode *inode = mapping->host;
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	int ret = 0;
-
-	might_sleep();
-
-	if (!bdi_write_congested(bdi))
-		return 0;
-
-	nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
-
-	do {
-		struct rpc_clnt *clnt = NFS_CLIENT(inode);
-		sigset_t oldset;
-
-		rpc_clnt_sigmask(clnt, &oldset);
-		ret = congestion_wait_interruptible(WRITE, HZ/10);
-		rpc_clnt_sigunmask(clnt, &oldset);
-		if (ret == -ERESTARTSYS)
-			break;
-		ret = 0;
-	} while (bdi_write_congested(bdi));
-
-	return ret;
-}
-
 /*
  * Try to update any existing write request, or create one if there is none.
  * In order to match, the request's credentials must match those of
@@ -636,12 +604,10 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 	struct inode *inode = mapping->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page		*req, *new = NULL;
-	unsigned long		rqend, end;
+	pgoff_t		rqend, end;
 
 	end = offset + bytes;
 
-	if (nfs_wait_on_write_congestion(mapping))
-		return ERR_PTR(-ERESTARTSYS);
 	for (;;) {
 		/* Loop over all inode entries and see if we find
 		 * A request for the page we wish to update
@@ -865,7 +831,7 @@ static void nfs_execute_write(struct nfs_write_data *data)
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
  */
-static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
+static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
 {
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
@@ -877,11 +843,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
 
 	nfs_list_remove_request(req);
 
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		size_t len = min(nbytes, wsize);
 
-		data = nfs_writedata_alloc(len);
+		data = nfs_writedata_alloc(1);
 		if (!data)
 			goto out_bad;
 		list_add(&data->pages, &list);
@@ -892,23 +858,19 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
 
 	ClearPageError(page);
 	offset = 0;
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		data = list_entry(list.next, struct nfs_write_data, pages);
 		list_del_init(&data->pages);
 
 		data->pagevec[0] = page;
 
-		if (nbytes > wsize) {
-			nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
-					wsize, offset, how);
-			offset += wsize;
-			nbytes -= wsize;
-		} else {
-			nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
-					nbytes, offset, how);
-			nbytes = 0;
-		}
+		if (nbytes < wsize)
+			wsize = nbytes;
+		nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
+				   wsize, offset, how);
+		offset += wsize;
+		nbytes -= wsize;
 		nfs_execute_write(data);
 	} while (nbytes != 0);
 
@@ -934,26 +896,23 @@ out_bad:
  * This is the case if nfs_updatepage detects a conflicting request
  * that has been written but not committed.
  */
-static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
+static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_write_data	*data;
-	unsigned int		count;
 
-	data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize);
+	data = nfs_writedata_alloc(npages);
 	if (!data)
 		goto out_bad;
 
 	pages = data->pagevec;
-	count = 0;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
 		ClearPageError(req->wb_page);
 		*pages++ = req->wb_page;
-		count += req->wb_bytes;
 	}
 	req = nfs_list_entry(data->pages.next);
 
@@ -973,40 +932,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
 	return -ENOMEM;
 }
 
-static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how)
+static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+				  struct inode *inode, int ioflags)
 {
-	LIST_HEAD(one_request);
-	int (*flush_one)(struct inode *, struct list_head *, int);
-	struct nfs_page	*req;
-	int wpages = NFS_SERVER(inode)->wpages;
 	int wsize = NFS_SERVER(inode)->wsize;
-	int error;
 
-	flush_one = nfs_flush_one;
 	if (wsize < PAGE_CACHE_SIZE)
-		flush_one = nfs_flush_multi;
-	/* For single writes, FLUSH_STABLE is more efficient */
-	if (npages <= wpages && npages == NFS_I(inode)->npages
-			&& nfs_list_entry(head->next)->wb_bytes <= wsize)
-		how |= FLUSH_STABLE;
-
-	do {
-		nfs_coalesce_requests(head, &one_request, wpages);
-		req = nfs_list_entry(one_request.next);
-		error = flush_one(inode, &one_request, how);
-		if (error < 0)
-			goto out_err;
-	} while (!list_empty(head));
-	return 0;
-out_err:
-	while (!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_redirty_request(req);
-		nfs_end_page_writeback(req->wb_page);
-		nfs_clear_page_writeback(req);
-	}
-	return error;
+		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
+	else
+		nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
 }
 
 /*
@@ -1330,31 +1264,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
 	.rpc_call_done = nfs_commit_done,
 	.rpc_release = nfs_commit_release,
 };
-#else
-static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
-{
-	return 0;
-}
-#endif
-
-static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
-{
-	struct nfs_inode *nfsi = NFS_I(mapping->host);
-	LIST_HEAD(head);
-	long res;
-
-	spin_lock(&nfsi->req_lock);
-	res = nfs_scan_dirty(mapping, wbc, &head);
-	spin_unlock(&nfsi->req_lock);
-	if (res) {
-		int error = nfs_flush_list(mapping->host, &head, res, how);
-		if (error < 0)
-			return error;
-	}
-	return res;
-}
 
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 int nfs_commit_inode(struct inode *inode, int how)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -1371,13 +1281,18 @@ int nfs_commit_inode(struct inode *inode, int how)
 	}
 	return res;
 }
+#else
+static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+{
+	return 0;
+}
 #endif
 
 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
 {
 	struct inode *inode = mapping->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	unsigned long idx_start, idx_end;
+	pgoff_t idx_start, idx_end;
 	unsigned int npages = 0;
 	LIST_HEAD(head);
 	int nocommit = how & FLUSH_NOCOMMIT;
@@ -1390,41 +1305,24 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
 		idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		if (idx_end > idx_start) {
-			unsigned long l_npages = 1 + idx_end - idx_start;
+			pgoff_t l_npages = 1 + idx_end - idx_start;
 			npages = l_npages;
 			if (sizeof(npages) != sizeof(l_npages) &&
-					(unsigned long)npages != l_npages)
+					(pgoff_t)npages != l_npages)
 				npages = 0;
 		}
 	}
 	how &= ~FLUSH_NOCOMMIT;
 	spin_lock(&nfsi->req_lock);
 	do {
-		wbc->pages_skipped = 0;
 		ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
 		if (ret != 0)
 			continue;
-		pages = nfs_scan_dirty(mapping, wbc, &head);
-		if (pages != 0) {
-			spin_unlock(&nfsi->req_lock);
-			if (how & FLUSH_INVALIDATE) {
-				nfs_cancel_dirty_list(&head);
-				ret = pages;
-			} else
-				ret = nfs_flush_list(inode, &head, pages, how);
-			spin_lock(&nfsi->req_lock);
-			continue;
-		}
-		if (wbc->pages_skipped != 0)
-			continue;
 		if (nocommit)
 			break;
 		pages = nfs_scan_commit(inode, &head, idx_start, npages);
-		if (pages == 0) {
-			if (wbc->pages_skipped != 0)
-				continue;
+		if (pages == 0)
 			break;
-		}
 		if (how & FLUSH_INVALIDATE) {
 			spin_unlock(&nfsi->req_lock);
 			nfs_cancel_commit_list(&head);
@@ -1456,7 +1354,7 @@ int nfs_wb_all(struct inode *inode)
 	};
 	int ret;
 
-	ret = generic_writepages(mapping, &wbc);
+	ret = nfs_writepages(mapping, &wbc);
 	if (ret < 0)
 		goto out;
 	ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
@@ -1479,11 +1377,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo
 	};
 	int ret;
 
-	if (!(how & FLUSH_NOWRITEPAGE)) {
-		ret = generic_writepages(mapping, &wbc);
-		if (ret < 0)
-			goto out;
-	}
+	ret = nfs_writepages(mapping, &wbc);
+	if (ret < 0)
+		goto out;
 	ret = nfs_sync_mapping_wait(mapping, &wbc, how);
 	if (ret >= 0)
 		return 0;
@@ -1506,7 +1402,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
 	int ret;
 
 	BUG_ON(!PageLocked(page));
-	if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) {
+	if (clear_page_dirty_for_io(page)) {
 		ret = nfs_writepage_locked(page, &wbc);
 		if (ret < 0)
 			goto out;
@@ -1531,10 +1427,18 @@ int nfs_wb_page(struct inode *inode, struct page* page)
 
 int nfs_set_page_dirty(struct page *page)
 {
-	spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
+	struct address_space *mapping = page->mapping;
+	struct inode *inode;
+	spinlock_t *req_lock;
 	struct nfs_page *req;
 	int ret;
 
+	if (!mapping)
+		goto out_raced;
+	inode = mapping->host;
+	if (!inode)
+		goto out_raced;
+	req_lock = &NFS_I(inode)->req_lock;
 	spin_lock(req_lock);
 	req = nfs_page_find_request_locked(page);
 	if (req != NULL) {
@@ -1547,6 +1451,8 @@ int nfs_set_page_dirty(struct page *page)
 	ret = __set_page_dirty_nobuffers(page);
 	spin_unlock(req_lock);
 	return ret;
+out_raced:
+	return !TestSetPageDirty(page);
 }
 
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index fb14d68eaca..32ffea033c7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -315,16 +315,13 @@ out:
 /*
  * RPC procedure tables
  */
-#ifndef MAX
-# define MAX(a, b)      (((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, call, argtype, restype)                              \
 [NFSPROC4_CLNT_##proc] = {                                      	\
         .p_proc   = NFSPROC4_CB_##call,					\
         .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,                    \
         .p_decode = (kxdrproc_t) nfs4_xdr_##restype,                    \
-        .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,  \
+        .p_arglen = NFS4_##argtype##_sz,                                \
+        .p_replen = NFS4_##restype##_sz,                                \
         .p_statidx = NFSPROC4_CB_##call,				\
 	.p_name   = #proc,                                              \
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index af360705e55..678f3be88ac 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -50,6 +50,7 @@
 #include <linux/nfsd/xdr4.h>
 #include <linux/namei.h>
 #include <linux/mutex.h>
+#include <linux/lockd/bind.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -2657,6 +2658,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file_lock conflock;
 	__be32 status = 0;
 	unsigned int strhashval;
+	unsigned int cmd;
 	int err;
 
 	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -2739,10 +2741,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
 			file_lock.fl_type = F_RDLCK;
+			cmd = F_SETLK;
 		break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
 			file_lock.fl_type = F_WRLCK;
+			cmd = F_SETLK;
 		break;
 		default:
 			status = nfserr_inval;
@@ -2769,9 +2773,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	/* XXX?: Just to divert the locks_release_private at the start of
 	 * locks_copy_lock: */
-	conflock.fl_ops = NULL;
-	conflock.fl_lmops = NULL;
-	err = posix_lock_file_conf(filp, &file_lock, &conflock);
+	locks_init_lock(&conflock);
+	err = vfs_lock_file(filp, cmd, &file_lock, &conflock);
 	switch (-err) {
 	case 0: /* success! */
 		update_stateid(&lock_stp->st_stateid);
@@ -2788,7 +2791,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_deadlock;
 		break;
 	default:        
-		dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",err);
+		dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
 		status = nfserr_resource;
 		break;
 	}
@@ -2813,7 +2816,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct inode *inode;
 	struct file file;
 	struct file_lock file_lock;
-	struct file_lock conflock;
+	int error;
 	__be32 status;
 
 	if (nfs4_in_grace())
@@ -2869,18 +2872,23 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_transform_lock_offset(&file_lock);
 
-	/* posix_test_lock uses the struct file _only_ to resolve the inode.
+	/* vfs_test_lock uses the struct file _only_ to resolve the inode.
 	 * since LOCKT doesn't require an OPEN, and therefore a struct
-	 * file may not exist, pass posix_test_lock a struct file with
+	 * file may not exist, pass vfs_test_lock a struct file with
 	 * only the dentry:inode set.
 	 */
 	memset(&file, 0, sizeof (struct file));
 	file.f_path.dentry = cstate->current_fh.fh_dentry;
 
 	status = nfs_ok;
-	if (posix_test_lock(&file, &file_lock, &conflock)) {
+	error = vfs_test_lock(&file, &file_lock);
+	if (error) {
+		status = nfserrno(error);
+		goto out;
+	}
+	if (file_lock.fl_type != F_UNLCK) {
 		status = nfserr_denied;
-		nfs4_set_lock_denied(&conflock, &lockt->lt_denied);
+		nfs4_set_lock_denied(&file_lock, &lockt->lt_denied);
 	}
 out:
 	nfs4_unlock_state();
@@ -2933,9 +2941,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/*
 	*  Try to unlock the file in the VFS.
 	*/
-	err = posix_lock_file(filp, &file_lock);
+	err = vfs_lock_file(filp, F_SETLK, &file_lock, NULL);
 	if (err) {
-		dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n");
+		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
 	/*
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 9393f4b1e29..caecc58f529 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -89,9 +89,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
 	struct page *page = read_mapping_page(mapping, index, NULL);
 
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (PageUptodate(page) && !PageError(page))
+		if (!PageError(page))
 			return page;
 		ntfs_unmap_page(page);
 		return ERR_PTR(-EIO);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 7659cc19299..1c08fefe487 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2532,14 +2532,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read first partial "
-					"page (sync error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page))) {
-			ntfs_error(vol->sb, "Failed to read first partial page "
-					"(async error, index 0x%lx).", idx);
-			page_cache_release(page);
+					"page (error, index 0x%lx).", idx);
 			return PTR_ERR(page);
 		}
 		/*
@@ -2602,14 +2595,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(sync error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page))) {
-			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(async error, index 0x%lx).", idx);
-			page_cache_release(page);
+					"(error, index 0x%lx).", idx);
 			return PTR_ERR(page);
 		}
 		kaddr = kmap_atomic(page, KM_USER0);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d69c4595ccd..dbbac559310 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -236,8 +236,7 @@ do_non_resident_extend:
 			err = PTR_ERR(page);
 			goto init_err_out;
 		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page) || PageError(page))) {
+		if (unlikely(PageError(page))) {
 			page_cache_release(page);
 			err = -EIO;
 			goto init_err_out;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1594c90b716..21d834e5ed7 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2471,7 +2471,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 	s64 nr_free = vol->nr_clusters;
 	u32 *kaddr;
 	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
-	filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
 	struct page *page;
 	pgoff_t index, max_index;
 
@@ -2494,24 +2493,14 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
 		 */
-		page = read_cache_page(mapping, index, (filler_t*)readpage,
-				NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		/* Ignore pages which errored synchronously. */
 		if (IS_ERR(page)) {
-			ntfs_debug("Sync read_cache_page() error. Skipping "
+			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		wait_on_page_locked(page);
-		/* Ignore pages which errored asynchronously. */
-		if (!PageUptodate(page)) {
-			ntfs_debug("Async read_cache_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			page_cache_release(page);
-			nr_free -= PAGE_CACHE_SIZE * 8;
-			continue;
-		}
 		kaddr = (u32*)kmap_atomic(page, KM_USER0);
 		/*
 		 * For each 4 bytes, subtract the number of set bits. If this
@@ -2562,7 +2551,6 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 {
 	u32 *kaddr;
 	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
-	filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
 	struct page *page;
 	pgoff_t index;
 
@@ -2576,21 +2564,11 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
 		 */
-		page = read_cache_page(mapping, index, (filler_t*)readpage,
-				NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		/* Ignore pages which errored synchronously. */
 		if (IS_ERR(page)) {
-			ntfs_debug("Sync read_cache_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			nr_free -= PAGE_CACHE_SIZE * 8;
-			continue;
-		}
-		wait_on_page_locked(page);
-		/* Ignore pages which errored asynchronously. */
-		if (!PageUptodate(page)) {
-			ntfs_debug("Async read_cache_page() error. Skipping "
+			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
-			page_cache_release(page);
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
@@ -3107,8 +3085,7 @@ static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep,
 {
 	ntfs_inode *ni = (ntfs_inode *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-			SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(VFS_I(ni));
 }
 
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a0c8667caa7..19712a7d145 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2869,7 +2869,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 	tl = &tl_copy->id2.i_dealloc;
 	num_recs = le16_to_cpu(tl->tl_used);
 	mlog(0, "cleanup %u records from %llu\n", num_recs,
-	     (unsigned long long)tl_copy->i_blkno);
+	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
 
 	mutex_lock(&tl_inode->i_mutex);
 	for(i = 0; i < num_recs; i++) {
@@ -3801,8 +3801,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
-	     "%llu\n", fe->i_clusters, new_i_clusters,
-	     (unsigned long long)fe->i_size);
+	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+	     (unsigned long long)le64_to_cpu(fe->i_size));
 
 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
 	if (!(*tc)) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56963e6c46c..8e7cafb5fc6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -78,7 +78,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)fe->i_blkno, 7, fe->i_signature);
+		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+		     fe->i_signature);
 		goto bail;
 	}
 
@@ -939,9 +940,9 @@ out:
  * Returns a negative error code or the number of bytes copied into
  * the page.
  */
-int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
-			  u64 *p_blkno, struct page *page,
-			  struct ocfs2_write_ctxt *wc, int new)
+static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+				 u64 *p_blkno, struct page *page,
+				 struct ocfs2_write_ctxt *wc, int new)
 {
 	int ret, copied = 0;
 	unsigned int from = 0, to = 0;
@@ -1086,7 +1087,7 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 	for(i = 0; i < numpages; i++) {
 		index = start + i;
 
-		cpages[i] = grab_cache_page(mapping, index);
+		cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!cpages[i]) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eba282da500..979113479c6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -438,7 +438,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
 								   hb_block));
 
 	mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
-	     (long long)cpu_to_le64(generation),
+	     (long long)generation,
 	     le32_to_cpu(hb_block->hb_cksum));
 }
 
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 636593bf4d1..2e975c0a35e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -147,7 +147,7 @@ static struct kset mlog_kset = {
 	.kobj   = {.name = "logmask", .ktype = &mlog_ktype},
 };
 
-int mlog_sys_init(struct subsystem *o2cb_subsys)
+int mlog_sys_init(struct kset *o2cb_subsys)
 {
 	int i = 0;
 
@@ -157,7 +157,7 @@ int mlog_sys_init(struct subsystem *o2cb_subsys)
 	}
 	mlog_attr_ptrs[i] = NULL;
 
-	mlog_kset.subsys = o2cb_subsys;
+	kobj_set_kset_s(&mlog_kset, o2cb_subsys);
 	return kset_register(&mlog_kset);
 }
 
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index a42628ba9dd..75cd877f6d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -278,7 +278,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
-int mlog_sys_init(struct subsystem *o2cb_subsys);
+int mlog_sys_init(struct kset *o2cb_subsys);
 void mlog_sys_shutdown(void);
 
 #endif /* O2CLUSTER_MASKLOG_H */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 1d9f6acafa2..64f6f378fd0 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -42,7 +42,6 @@ struct o2cb_attribute {
 #define O2CB_ATTR(_name, _mode, _show, _store)	\
 struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
 
-#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
 #define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
 
 static ssize_t o2cb_interface_revision_show(char *buf)
@@ -79,7 +78,7 @@ static ssize_t
 o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
 {
 	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-	struct subsystem *sbs = to_o2cb_subsys(kobj);
+	struct kset *sbs = to_kset(kobj);
 
 	BUG_ON(sbs != &o2cb_subsys);
 
@@ -93,7 +92,7 @@ o2cb_store(struct kobject * kobj, struct attribute * attr,
 	     const char * buffer, size_t count)
 {
 	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-	struct subsystem *sbs = to_o2cb_subsys(kobj);
+	struct kset *sbs = to_kset(kobj);
 
 	BUG_ON(sbs != &o2cb_subsys);
 
@@ -112,7 +111,7 @@ int o2cb_sys_init(void)
 {
 	int ret;
 
-	o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
+	o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
 	ret = subsystem_register(&o2cb_subsys);
 	if (ret)
 		return ret;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 69caf3e12fe..0b229a9c795 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1496,7 +1496,7 @@ static void o2net_start_connect(struct work_struct *work)
 	sock->sk->sk_allocation = GFP_ATOMIC;
 
 	myaddr.sin_family = AF_INET;
-	myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address;
+	myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
 	myaddr.sin_port = (__force u16)htons(0); /* any port */
 
 	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
@@ -1521,8 +1521,8 @@ static void o2net_start_connect(struct work_struct *work)
 	spin_unlock(&nn->nn_lock);
 
 	remoteaddr.sin_family = AF_INET;
-	remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
-	remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
+	remoteaddr.sin_addr.s_addr = node->nd_ipv4_address;
+	remoteaddr.sin_port = node->nd_ipv4_port;
 
 	ret = sc->sc_sock->ops->connect(sc->sc_sock,
 					(struct sockaddr *)&remoteaddr,
@@ -1810,8 +1810,8 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
 	int ret;
 	struct sockaddr_in sin = {
 		.sin_family = PF_INET,
-		.sin_addr = { .s_addr = (__force u32)addr },
-		.sin_port = (__force u16)port,
+		.sin_addr = { .s_addr = addr },
+		.sin_port = port,
 	};
 
 	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 67e6866a2a4..c441ef1f2ba 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -403,7 +403,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct buffer_head **new_de_bh)
 {
 	int status = 0;
-	int credits, num_free_extents;
+	int credits, num_free_extents, drop_alloc_sem = 0;
 	loff_t dir_i_size;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
 	struct ocfs2_alloc_context *data_ac = NULL;
@@ -452,6 +452,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
 	}
 
+	down_write(&OCFS2_I(dir)->ip_alloc_sem);
+	drop_alloc_sem = 1;
+
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -497,6 +500,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	*new_de_bh = new_bh;
 	get_bh(*new_de_bh);
 bail:
+	if (drop_alloc_sem)
+		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 241cad342a4..2fd8bded38f 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -312,8 +312,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	    past->type != DLM_BAST) {
 		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
 		     "name=%.*s\n", past->type, 
-		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_node(cookie),
+		     dlm_get_lock_cookie_seq(cookie),
 		     locklen, name);
 		ret = DLM_IVLOCKID;
 		goto leave;
@@ -324,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 		mlog(0, "got %sast for unknown lockres! "
 		     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
 		     past->type == DLM_AST ? "" : "b",
-		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_node(cookie),
+		     dlm_get_lock_cookie_seq(cookie),
 		     locklen, name, locklen);
 		ret = DLM_IVLOCKID;
 		goto leave;
@@ -370,8 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
 	     "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
-	     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-	     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+	     dlm_get_lock_cookie_node(cookie),
+	     dlm_get_lock_cookie_seq(cookie),
 	     locklen, name, locklen);
 
 	ret = DLM_NORMAL;
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index de952eba29a..d4e46d067ed 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -263,8 +263,7 @@ static void dlmfs_init_once(void *foo,
 	struct dlmfs_inode_private *ip =
 		(struct dlmfs_inode_private *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		ip->ip_dlm = NULL;
 		ip->ip_parent = NULL;
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index c1807a42c49..671c4ed58ee 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1769,7 +1769,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			/* lock is always created locally first, and
 			 * destroyed locally last.  it must be on the list */
 			if (!lock) {
-				u64 c = ml->cookie;
+				__be64 c = ml->cookie;
 				mlog(ML_ERROR, "could not find local lock "
 					       "with cookie %u:%llu!\n",
 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
@@ -1878,7 +1878,7 @@ skip_lvb:
 		spin_lock(&res->spinlock);
 		list_for_each_entry(lock, queue, list) {
 			if (lock->ml.cookie == ml->cookie) {
-				u64 c = lock->ml.cookie;
+				__be64 c = lock->ml.cookie;
 				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
 				     "exists on this lockres!\n", dlm->name,
 				     res->lockname.len, res->lockname.name,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2b264c6ba03..cebd089f895 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -76,7 +76,7 @@ repeat:
 		goto repeat;
 	}
 	remove_wait_queue(&res->wq, &wait);
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 }
 
 int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 27e43b0c0ea..024777abc8e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -104,6 +104,35 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
 				     struct ocfs2_lock_res *lockres);
 
+
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+/* This aids in debugging situations where a bad LVB might be involved. */
+static void ocfs2_dump_meta_lvb_info(u64 level,
+				     const char *function,
+				     unsigned int line,
+				     struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
+	     lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
+	     be32_to_cpu(lvb->lvb_igeneration));
+	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
+	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
+	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
+	     be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
+	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
+	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
+	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
+	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
+	     be32_to_cpu(lvb->lvb_iattr));
+}
+
+
 /*
  * OCFS2 Lock Resource Operations
  *
@@ -3078,28 +3107,3 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 
 	mlog_exit_void();
 }
-
-/* This aids in debugging situations where a bad LVB might be involved. */
-void ocfs2_dump_meta_lvb_info(u64 level,
-			      const char *function,
-			      unsigned int line,
-			      struct ocfs2_lock_res *lockres)
-{
-	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
-
-	mlog(level, "LVB information for %s (called from %s:%u):\n",
-	     lockres->l_name, function, line);
-	mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
-	     lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
-	     be32_to_cpu(lvb->lvb_igeneration));
-	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
-	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
-	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
-	     be16_to_cpu(lvb->lvb_imode));
-	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
-	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
-	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
-	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
-	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
-	     be32_to_cpu(lvb->lvb_iattr));
-}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 59cb566e798..492bad32a8c 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -119,11 +119,4 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 
-/* aids in debugging and tracking lvbs */
-void ocfs2_dump_meta_lvb_info(u64 level,
-			      const char *function,
-			      unsigned int line,
-			      struct ocfs2_lock_res *lockres);
-#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
-
 #endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 56e1fefc120..bc48177bd18 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -140,7 +140,7 @@ bail:
 	return parent;
 }
 
-static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 			   int connectable)
 {
 	struct inode *inode = dentry->d_inode;
@@ -148,6 +148,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
 	int type = 1;
 	u64 blkno;
 	u32 generation;
+	__le32 *fh = (__force __le32 *) fh_in;
 
 	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
 		   dentry->d_name.len, dentry->d_name.name,
@@ -199,7 +200,7 @@ bail:
 	return type;
 }
 
-static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, u32 *fh_in,
 				      int fh_len, int fileid_type,
 				      int (*acceptable)(void *context,
 						        struct dentry *de),
@@ -207,6 +208,7 @@ static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
 {
 	struct ocfs2_inode_handle handle, parent;
 	struct dentry *ret = NULL;
+	__le32 *fh = (__force __le32 *) fh_in;
 
 	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
 		   sb, fh, fh_len, fileid_type, acceptable, context);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 520a2a6d767..9395b4fa547 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -207,10 +207,10 @@ out:
 	return ret;
 }
 
-int ocfs2_set_inode_size(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *fe_bh,
-			 u64 new_i_size)
+static int ocfs2_set_inode_size(handle_t *handle,
+				struct inode *inode,
+				struct buffer_head *fe_bh,
+				u64 new_i_size)
 {
 	int status;
 
@@ -713,7 +713,8 @@ restarted_transaction:
 	}
 
 	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
-	     fe->i_clusters, (unsigned long long)fe->i_size);
+	     le32_to_cpu(fe->i_clusters),
+	     (unsigned long long)le64_to_cpu(fe->i_size));
 	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
@@ -1853,6 +1854,9 @@ const struct file_operations ocfs2_fops = {
 	.aio_read	= ocfs2_file_aio_read,
 	.aio_write	= ocfs2_file_aio_write,
 	.ioctl		= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
 };
@@ -1862,4 +1866,7 @@ const struct file_operations ocfs2_dops = {
 	.readdir	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
 	.ioctl		= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
 };
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 2c4460fced5..a4dd1fa1822 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -56,11 +56,6 @@ int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 int ocfs2_permission(struct inode *inode, int mask,
 		     struct nameidata *nd);
 
-int ocfs2_set_inode_size(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *fe_bh,
-			 u64 new_i_size);
-
 int ocfs2_should_update_atime(struct inode *inode,
 			      struct vfsmount *vfsmnt);
 int ocfs2_update_inode_atime(struct inode *inode,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 21a605079c6..bc844bfe607 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,6 +89,25 @@ void ocfs2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
+/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
+{
+	unsigned int flags = oi->vfs_inode.i_flags;
+
+	oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL|
+			OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		oi->ip_attr |= OCFS2_SYNC_FL;
+	if (flags & S_APPEND)
+		oi->ip_attr |= OCFS2_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		oi->ip_attr |= OCFS2_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		oi->ip_attr |= OCFS2_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		oi->ip_attr |= OCFS2_DIRSYNC_FL;
+}
+
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
 {
 	struct inode *inode = NULL;
@@ -196,7 +215,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	int status = -EINVAL;
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
-		   (unsigned long long)fe->i_size);
+		   (unsigned long long)le64_to_cpu(fe->i_size));
 
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
@@ -248,7 +267,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		mlog(ML_ERROR,
 		     "ip_blkno %llu != i_blkno %llu!\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (unsigned long long)fe->i_blkno);
+		     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 
@@ -301,7 +320,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		 * the generation argument to
 		 * ocfs2_inode_lock_res_init() will have to change.
 		 */
-		BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL));
+		BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
 
 		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
 					  OCFS2_LOCK_TYPE_META, 0, inode);
@@ -437,7 +456,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)fe->i_blkno, 7, fe->i_signature);
+		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+		     fe->i_signature);
 		goto bail;
 	}
 
@@ -812,8 +832,8 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 		     "Inode %llu (on-disk %llu) not orphaned! "
 		     "Disk flags  0x%x, inode flags 0x%x\n",
 		     (unsigned long long)oi->ip_blkno,
-		     (unsigned long long)di->i_blkno, di->i_flags,
-		     oi->ip_flags);
+		     (unsigned long long)le64_to_cpu(di->i_blkno),
+		     le32_to_cpu(di->i_flags), oi->ip_flags);
 		goto bail;
 	}
 
@@ -1106,8 +1126,10 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		return NULL;
 	}
 
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
 					     NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 	if (tmperr < 0) {
 		mlog_errno(tmperr);
 		goto fail;
@@ -1197,6 +1219,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	ocfs2_get_inode_flags(OCFS2_I(inode));
 	fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 03ae075869e..a41d0817121 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -141,6 +141,7 @@ int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
 
 void ocfs2_set_inode_flags(struct inode *inode);
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
 
 static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 4768be5f308..f3ad21ad9ae 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -31,6 +31,7 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 		mlog_errno(status);
 		return status;
 	}
+	ocfs2_get_inode_flags(OCFS2_I(inode));
 	*flags = OCFS2_I(inode)->ip_attr;
 	ocfs2_meta_unlock(inode, 0);
 
@@ -134,3 +135,26 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 	}
 }
 
+#ifdef CONFIG_COMPAT
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int ret;
+
+	switch (cmd) {
+	case OCFS2_IOC32_GETFLAGS:
+		cmd = OCFS2_IOC_GETFLAGS;
+		break;
+	case OCFS2_IOC32_SETFLAGS:
+		cmd = OCFS2_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	lock_kernel();
+	ret = ocfs2_ioctl(inode, file, cmd, arg);
+	unlock_kernel();
+	return ret;
+}
+#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4a7c82931db..4d6c4f430d0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -12,5 +12,6 @@
 
 int ocfs2_ioctl(struct inode * inode, struct file * filp,
 	unsigned int cmd, unsigned long arg);
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
 #endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 5a8a90d1c78..dc118808172 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -435,7 +435,8 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 		 * handle the errors in a specific manner, so no need
 		 * to call ocfs2_error() here. */
 		mlog(ML_ERROR, "Journal dinode %llu  has invalid "
-		     "signature: %.*s", (unsigned long long)fe->i_blkno, 7,
+		     "signature: %.*s",
+		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
 		     fe->i_signature);
 		status = -EIO;
 		goto out;
@@ -742,7 +743,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		la_dinode = item->lri_la_dinode;
 		if (la_dinode) {
 			mlog(0, "Clean up local alloc %llu\n",
-			     (unsigned long long)la_dinode->i_blkno);
+			     (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
 
 			ret = ocfs2_complete_local_alloc_recovery(osb,
 								  la_dinode);
@@ -755,7 +756,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		tl_dinode = item->lri_tl_dinode;
 		if (tl_dinode) {
 			mlog(0, "Clean up truncate log %llu\n",
-			     (unsigned long long)tl_dinode->i_blkno);
+			     (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
 
 			ret = ocfs2_complete_truncate_log_recovery(osb,
 								   tl_dinode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2bcf353fd7c..36289e6295c 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -578,8 +578,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
 		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
 		     "i_blkno=%llu, i_ino=%lu\n",
-		     (unsigned long long) (*new_fe_bh)->b_blocknr,
-		     (unsigned long long)fe->i_blkno, inode->i_ino);
+		     (unsigned long long)(*new_fe_bh)->b_blocknr,
+		     (unsigned long long)le64_to_cpu(fe->i_blkno),
+		     inode->i_ino);
 		BUG();
 	}
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 82cc92dcf8a..a860633e833 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -363,9 +363,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 	typeof(__di) ____di = (__di);					\
 	ocfs2_error((__sb), 						\
 		"Dinode # %llu has bad signature %.*s",			\
-		(unsigned long long)(____di)->i_blkno, 7,		\
+		(unsigned long long)le64_to_cpu((____di)->i_blkno), 7, 	\
 		(____di)->i_signature);					\
-} while (0);
+} while (0)
 
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
@@ -374,9 +374,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 	typeof(__eb) ____eb = (__eb);					\
 	ocfs2_error((__sb), 						\
 		"Extent Block # %llu has bad signature %.*s",		\
-		(unsigned long long)(____eb)->h_blkno, 7,		\
+		(unsigned long long)le64_to_cpu((____eb)->h_blkno), 7,	\
 		(____eb)->h_signature);					\
-} while (0);
+} while (0)
 
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
@@ -385,9 +385,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 	typeof(__gd) ____gd = (__gd);					\
 		ocfs2_error((__sb),					\
 		"Group Descriptor # %llu has bad signature %.*s",	\
-		(unsigned long long)(____gd)->bg_blkno, 7,		\
+		(unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
 		(____gd)->bg_signature);				\
-} while (0);
+} while (0)
 
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 71306479c68..f0d9eb08547 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -166,6 +166,8 @@
  */
 #define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
 #define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
+#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
+#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
 
 /*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 0da655ae5d6..e3437626d18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -849,9 +849,9 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	}
 
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
-	     (unsigned long long)fe->i_blkno, chain,
-	     (unsigned long long)bg->bg_blkno,
-	     (unsigned long long)prev_bg->bg_blkno);
+	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+	     (unsigned long long)le64_to_cpu(bg->bg_blkno),
+	     (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1162,7 +1162,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	}
 
 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
-	     tmp_bits, (unsigned long long)bg->bg_blkno);
+	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
 
 	*num_bits = tmp_bits;
 
@@ -1227,7 +1227,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	}
 
 	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
-	     (unsigned long long)fe->i_blkno);
+	     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5c9e8243691..7c5e3f5d663 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -937,8 +937,7 @@ static void ocfs2_inode_init_once(void *data,
 {
 	struct ocfs2_inode_info *oi = data;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		oi->ip_flags = 0;
 		oi->ip_open_count = 0;
 		spin_lock_init(&oi->ip_lock);
@@ -1538,7 +1537,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 		} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
 			mlog(ML_ERROR, "bad block number on superblock: "
 			     "found %llu, should be %llu\n",
-			     (unsigned long long)di->i_blkno,
+			     (unsigned long long)le64_to_cpu(di->i_blkno),
 			     (unsigned long long)bh->b_blocknr);
 		} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
 			    le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 40dc1a51f4a..7134007ba22 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -67,16 +67,9 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page))
-		goto async_fail;
 	*ppage = page;
 	return kmap(page);
 
-async_fail:
-	page_cache_release(page);
-	return ERR_PTR(-EIO);
-
 sync_fail:
 	return (char*)page;
 }
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index bde1c164417..731a90e9f0c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -419,8 +419,7 @@ static void op_inode_init_once(void *data, struct kmem_cache * cachep, unsigned
 {
 	struct op_inode_info *oi = (struct op_inode_info *) data;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&oi->vfs_inode);
 }
 
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 1bc9f372c7d..e3491328596 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -271,7 +271,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 		extern void xd_set_geometry(struct block_device *,
 			unsigned char, unsigned char, unsigned int);
 		xd_set_geometry(bdev, dr->secspertrack, heads, 1);
-		invalidate_bdev(bdev, 1);
+		invalidate_bh_lrus();
 		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
 	}
 #endif
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8a7d0035ad7..6b9dae3f0e6 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute * default_attrs[] = {
 	NULL,
 };
 
-extern struct subsystem block_subsys;
+extern struct kset block_subsys;
 
 static void part_release(struct kobject *kobj)
 {
@@ -388,7 +388,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	kobject_add(&p->kobj);
 	if (!disk->part_uevent_suppress)
 		kobject_uevent(&p->kobj, KOBJ_ADD);
-	sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem");
+	sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		static struct attribute addpartattr = {
 			.name = "whole_disk",
@@ -444,7 +444,7 @@ static int disk_sysfs_symlinks(struct gendisk *disk)
 			goto err_out_dev_link;
 	}
 
-	err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj,
+	err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
 				"subsystem");
 	if (err)
 		goto err_out_disk_name_lnk;
@@ -569,9 +569,6 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
 				 NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (PageError(page))
 			goto fail;
 		p->v = page;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989af5e55d1..ec158dd02b3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -715,6 +715,40 @@ static const struct file_operations proc_oom_adjust_operations = {
 	.write		= oom_adjust_write,
 };
 
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF], *end;
+	struct mm_struct *mm;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	if (!simple_strtol(buffer, &end, 0))
+		return -EINVAL;
+	if (*end == '\n')
+		end++;
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	mm = get_task_mm(task);
+	if (mm) {
+		clear_refs_smap(mm);
+		mmput(mm);
+	}
+	put_task_struct(task);
+	if (end - buffer == 0)
+		return -EIO;
+	return end - buffer;
+}
+
+static struct file_operations proc_clear_refs_operations = {
+	.write		= clear_refs_write,
+};
+
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -1851,6 +1885,7 @@ static struct pid_entry tgid_base_stuff[] = {
 	REG("mounts",     S_IRUGO, mounts),
 	REG("mountstats", S_IRUSR, mountstats),
 #ifdef CONFIG_MMU
+	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",      S_IRUGO, smaps),
 #endif
 #ifdef CONFIG_SECURITY
@@ -2132,6 +2167,7 @@ static struct pid_entry tid_base_stuff[] = {
 	LNK("exe",       exe),
 	REG("mounts",    S_IRUGO, mounts),
 #ifdef CONFIG_MMU
+	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",     S_IRUGO, smaps),
 #endif
 #ifdef CONFIG_SECURITY
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c372eb151a3..22b1158389a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -109,8 +109,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e2c4c0a5c90..75ec6523d29 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -398,8 +398,6 @@ static const struct file_operations proc_modules_operations = {
 #endif
 
 #ifdef CONFIG_SLAB
-extern struct seq_operations slabinfo_op;
-extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
 static int slabinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &slabinfo_op);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7445980c802..4008c060f7e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -120,6 +120,14 @@ struct mem_size_stats
 	unsigned long shared_dirty;
 	unsigned long private_clean;
 	unsigned long private_dirty;
+	unsigned long referenced;
+};
+
+struct pmd_walker {
+	struct vm_area_struct *vma;
+	void *private;
+	void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
+		       unsigned long, void *);
 };
 
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
@@ -181,18 +189,20 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 
 	if (mss)
 		seq_printf(m,
-			   "Size:          %8lu kB\n"
-			   "Rss:           %8lu kB\n"
-			   "Shared_Clean:  %8lu kB\n"
-			   "Shared_Dirty:  %8lu kB\n"
-			   "Private_Clean: %8lu kB\n"
-			   "Private_Dirty: %8lu kB\n",
+			   "Size:           %8lu kB\n"
+			   "Rss:            %8lu kB\n"
+			   "Shared_Clean:   %8lu kB\n"
+			   "Shared_Dirty:   %8lu kB\n"
+			   "Private_Clean:  %8lu kB\n"
+			   "Private_Dirty:  %8lu kB\n"
+			   "Referenced:     %8lu kB\n",
 			   (vma->vm_end - vma->vm_start) >> 10,
 			   mss->resident >> 10,
 			   mss->shared_clean  >> 10,
 			   mss->shared_dirty  >> 10,
 			   mss->private_clean >> 10,
-			   mss->private_dirty >> 10);
+			   mss->private_dirty >> 10,
+			   mss->referenced >> 10);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
@@ -205,15 +215,16 @@ static int show_map(struct seq_file *m, void *v)
 }
 
 static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+			    unsigned long addr, unsigned long end,
+			    void *private)
 {
+	struct mem_size_stats *mss = private;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
 		if (!pte_present(ptent))
 			continue;
@@ -224,6 +235,9 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 
+		/* Accumulate the size in pages that have been accessed. */
+		if (pte_young(ptent) || PageReferenced(page))
+			mss->referenced += PAGE_SIZE;
 		if (page_mapcount(page) >= 2) {
 			if (pte_dirty(ptent))
 				mss->shared_dirty += PAGE_SIZE;
@@ -235,57 +249,99 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			else
 				mss->private_clean += PAGE_SIZE;
 		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 }
 
-static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				 unsigned long addr, unsigned long end,
+				 void *private)
+{
+	pte_t *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/* Clear accessed and referenced bits. */
+		ptep_test_and_clear_young(vma, addr, pte);
+		ClearPageReferenced(page);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+}
+
+static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
+				  unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
 
-	pmd = pmd_offset(pud, addr);
-	do {
+	for (pmd = pmd_offset(pud, addr); addr != end;
+	     pmd++, addr = next) {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		smaps_pte_range(vma, pmd, addr, next, mss);
-	} while (pmd++, addr = next, addr != end);
+		walker->action(walker->vma, pmd, addr, next, walker->private);
+	}
 }
 
-static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
+				  unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
 
-	pud = pud_offset(pgd, addr);
-	do {
+	for (pud = pud_offset(pgd, addr); addr != end;
+	     pud++, addr = next) {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		smaps_pmd_range(vma, pud, addr, next, mss);
-	} while (pud++, addr = next, addr != end);
+		walk_pmd_range(walker, pud, addr, next);
+	}
 }
 
-static inline void smaps_pgd_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+/*
+ * walk_page_range - walk the page tables of a VMA with a callback
+ * @vma - VMA to walk
+ * @action - callback invoked for every bottom-level (PTE) page table
+ * @private - private data passed to the callback function
+ *
+ * Recursively walk the page table for the memory area in a VMA, calling
+ * a callback for every bottom-level (PTE) page table.
+ */
+static inline void walk_page_range(struct vm_area_struct *vma,
+				   void (*action)(struct vm_area_struct *,
+						  pmd_t *, unsigned long,
+						  unsigned long, void *),
+				   void *private)
 {
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	struct pmd_walker walker = {
+		.vma		= vma,
+		.private	= private,
+		.action		= action,
+	};
 	pgd_t *pgd;
 	unsigned long next;
 
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
+	for (pgd = pgd_offset(vma->vm_mm, addr); addr != end;
+	     pgd++, addr = next) {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		smaps_pud_range(vma, pgd, addr, next, mss);
-	} while (pgd++, addr = next, addr != end);
+		walk_pud_range(&walker, pgd, addr, next);
+	}
 }
 
 static int show_smap(struct seq_file *m, void *v)
@@ -295,10 +351,22 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
+		walk_page_range(vma, smaps_pte_range, &mss);
 	return show_map_internal(m, v, &mss);
 }
 
+void clear_refs_smap(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
+			walk_page_range(vma, clear_refs_pte_range, NULL);
+	flush_tlb_mm(mm);
+	up_read(&mm->mmap_sem);
+}
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index d96050728c4..523e1098ae8 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -514,7 +514,7 @@ static int __init parse_crash_elf64_headers(void)
 	/* Do some basic Verification. */
 	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
 		(ehdr.e_type != ET_CORE) ||
-		!elf_check_arch(&ehdr) ||
+		!vmcore_elf_check_arch(&ehdr) ||
 		ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
 		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
 		ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 83bc8e7824c..75fc8498f2e 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -536,8 +536,7 @@ static void init_once(void *foo, struct kmem_cache * cachep,
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f13a7f164dc..7054aaef049 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -511,8 +511,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		INIT_LIST_HEAD(&ei->i_prealloc_list);
 		inode_init_once(&ei->vfs_inode);
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 2cac56210e2..bf6e5821453 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -410,11 +410,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
-
 		if (PageError(page))
 			goto fail;
 	}
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index fd601014813..80428519027 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -570,8 +570,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct romfs_inode_info *ei = (struct romfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 5faba4f1c9a..424a3ddf86d 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -69,9 +69,8 @@ static void smb_destroy_inode(struct inode *inode)
 static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
-	unsigned long flagmask = SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR;
 
-	if ((flags & flagmask) == SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 8ea2a51ce88..d3b9f5f07db 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -59,7 +59,7 @@ read(struct file * file, char __user * userbuf, size_t count, loff_t * off)
 	if (copy_to_user(userbuf, buffer, count))
 		return -EFAULT;
 
-	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
+	pr_debug("offs = %lld, *off = %lld, count = %zd\n", offs, *off, count);
 
 	*off = offs + count;
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index db0413a411d..0e637adc2b8 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,8 +13,7 @@
 
 #include "sysfs.h"
 
-#define to_subsys(k) container_of(k,struct subsystem,kset.kobj)
-#define to_sattr(a) container_of(a,struct subsys_attribute,attr)
+#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
 
 /*
  * Subsystem file operations.
@@ -24,12 +23,12 @@
 static ssize_t 
 subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
 {
-	struct subsystem * s = to_subsys(kobj);
+	struct kset *kset = to_kset(kobj);
 	struct subsys_attribute * sattr = to_sattr(attr);
 	ssize_t ret = -EIO;
 
 	if (sattr->show)
-		ret = sattr->show(s,page);
+		ret = sattr->show(kset, page);
 	return ret;
 }
 
@@ -37,12 +36,12 @@ static ssize_t
 subsys_attr_store(struct kobject * kobj, struct attribute * attr, 
 		  const char * page, size_t count)
 {
-	struct subsystem * s = to_subsys(kobj);
+	struct kset *kset = to_kset(kobj);
 	struct subsys_attribute * sattr = to_sattr(attr);
 	ssize_t ret = -EIO;
 
 	if (sattr->store)
-		ret = sattr->store(s,page,count);
+		ret = sattr->store(kset, page, count);
 	return ret;
 }
 
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index ebf7007fa16..e566b387fcf 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -54,17 +54,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
-	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
+	if (!IS_ERR(page))
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
-	}
 	return page;
-
-fail:
-	dir_put_page(page);
-	return ERR_PTR(-EIO);
 }
 
 static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9311cac186f..3152d741560 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -322,8 +322,7 @@ static void init_once(void *p, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-			SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&si->vfs_inode);
 }
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 8672b88f7ff..023b304fdd9 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -134,9 +134,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-	{
+	if (flags & SLAB_CTOR_CONSTRUCTOR) {
 		ei->i_ext.i_data = NULL;
 		inode_init_once(&ei->vfs_inode);
 	}
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 4890ddf1518..4fb8b2e077e 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -180,13 +180,9 @@ fail:
 static struct page *ufs_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			ufs_check_page(page);
 		if (PageError(page))
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b5a6461ec66..be7c48c5f20 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1237,8 +1237,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(&ei->vfs_inode);
 }
  
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 17437574f79..84357f1ff0e 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -251,13 +251,11 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
 
 	page = find_lock_page(mapping, index);
 	if (!page) {
-		page = read_cache_page(mapping, index,
-				       (filler_t*)mapping->a_ops->readpage,
-				       NULL);
+		page = read_mapping_page(mapping, index, NULL);
 
 		if (IS_ERR(page)) {
 			printk(KERN_ERR "ufs_change_blocknr: "
-			       "read_cache_page error: ino %lu, index: %lu\n",
+			       "read_mapping_page error: ino %lu, index: %lu\n",
 			       mapping->host->i_ino, index);
 			goto out;
 		}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2f2c40db562..14e2cbe5a8d 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -360,8 +360,7 @@ xfs_fs_inode_init_once(
 	kmem_zone_t		*zonep,
 	unsigned long		flags)
 {
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-		      SLAB_CTOR_CONSTRUCTOR)
+	if (flags & SLAB_CTOR_CONSTRUCTOR)
 		inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
 }