From 1f5ce9e93aa96a867f195ed45f6f77935175f12e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:16 -0400
Subject: VFS: Unexport do_kern_mount() and clean up simple_pin_fs()

Replace all module uses with the new vfs_kern_mount() interface, and fix up
simple_pin_fs().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index 4c5e68e4e9a..8184342440f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2261,7 +2261,7 @@ static int __init init_tmpfs(void)
 #ifdef CONFIG_TMPFS
 	devfs_mk_dir("shm");
 #endif
-	shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER,
+	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
 				tmpfs_fs_type.name, NULL);
 	if (IS_ERR(shm_mnt)) {
 		error = PTR_ERR(shm_mnt);
-- 
cgit v1.2.3-70-g09d2


From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:02:57 -0700
Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount

Extend the get_sb() filesystem operation to take an extra argument that
permits the VFS to pass in the target vfsmount that defines the mountpoint.

The filesystem is then required to manually set the superblock and root dentry
pointers.  For most filesystems, this should be done with simple_set_mnt()
which will set the superblock pointer and then set the root dentry to the
superblock's s_root (as per the old default behaviour).

The get_sb() op now returns an integer as there's now no need to return the
superblock pointer.

This patch permits a superblock to be implicitly shared amongst several mount
points, such as can be done with NFS to avoid potential inode aliasing.  In
such a case, simple_set_mnt() would not be called, and instead the mnt_root
and mnt_sb would be set directly.

The patch also makes the following changes:

 (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount
     pointer argument and return an integer, so most filesystems have to change
     very little.

 (*) If one of the convenience function is not used, then get_sb() should
     normally call simple_set_mnt() to instantiate the vfsmount. This will
     always return 0, and so can be tail-called from get_sb().

 (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the
     dcache upon superblock destruction rather than shrink_dcache_anon().

     This is required because the superblock may now have multiple trees that
     aren't actually bound to s_root, but that still need to be cleaned up. The
     currently called functions assume that the whole tree is rooted at s_root,
     and that anonymous dentries are not the roots of trees which results in
     dentries being left unculled.

     However, with the way NFS superblock sharing are currently set to be
     implemented, these assumptions are violated: the root of the filesystem is
     simply a dummy dentry and inode (the real inode for '/' may well be
     inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries
     with child trees.

     [*] Anonymous until discovered from another tree.

 (*) The documentation has been adjusted, including the additional bit of
     changing ext2_* into foo_* in the documentation.

[akpm@osdl.org: convert ipath_fs, do other stuff]
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/Locking         |   7 +-
 Documentation/filesystems/porting         |   7 +-
 Documentation/filesystems/vfs.txt         |   4 +-
 arch/ia64/kernel/perfmon.c                |   7 +-
 arch/powerpc/platforms/cell/spufs/inode.c |   6 +-
 drivers/infiniband/core/uverbs_main.c     |   7 +-
 drivers/infiniband/hw/ipath/ipath_fs.c    |  13 ++--
 drivers/isdn/capi/capifs.c                |   6 +-
 drivers/misc/ibmasm/ibmasmfs.c            |   7 +-
 drivers/oprofile/oprofilefs.c             |   6 +-
 drivers/usb/core/inode.c                  |   6 +-
 drivers/usb/gadget/inode.c                |   6 +-
 fs/9p/vfs_super.c                         |  21 +++---
 fs/adfs/super.c                           |   7 +-
 fs/affs/super.c                           |   7 +-
 fs/afs/super.c                            |  24 ++++---
 fs/autofs/init.c                          |   6 +-
 fs/autofs4/init.c                         |   6 +-
 fs/befs/linuxvfs.c                        |   7 +-
 fs/bfs/inode.c                            |   6 +-
 fs/binfmt_misc.c                          |   6 +-
 fs/block_dev.c                            |   6 +-
 fs/cifs/cifsfs.c                          |  10 +--
 fs/coda/inode.c                           |   6 +-
 fs/configfs/mount.c                       |   6 +-
 fs/cramfs/inode.c                         |   7 +-
 fs/dcache.c                               |  40 -----------
 fs/debugfs/inode.c                        |   8 +--
 fs/devfs/base.c                           |   8 +--
 fs/devpts/inode.c                         |   6 +-
 fs/efs/super.c                            |   6 +-
 fs/eventpoll.c                            |  13 ++--
 fs/ext2/super.c                           |   6 +-
 fs/ext3/super.c                           |   6 +-
 fs/freevxfs/vxfs_super.c                  |   7 +-
 fs/fuse/inode.c                           |   8 +--
 fs/hfs/super.c                            |   7 +-
 fs/hfsplus/super.c                        |   8 ++-
 fs/hostfs/hostfs_kern.c                   |   8 +--
 fs/hpfs/super.c                           |   7 +-
 fs/hppfs/hppfs_kern.c                     |   8 +--
 fs/hugetlbfs/inode.c                      |   6 +-
 fs/inotify_user.c                         |   6 +-
 fs/isofs/inode.c                          |   7 +-
 fs/jffs/inode-v23.c                       |   7 +-
 fs/jffs2/super.c                          |  49 ++++++++------
 fs/jfs/super.c                            |   7 +-
 fs/libfs.c                                |  12 ++--
 fs/minix/inode.c                          |   7 +-
 fs/msdos/namei.c                          |   9 +--
 fs/namespace.c                            |   9 +++
 fs/ncpfs/inode.c                          |   6 +-
 fs/nfs/inode.c                            |  96 +++++++++++++++-----------
 fs/nfsd/nfsctl.c                          |   6 +-
 fs/ntfs/super.c                           |   7 +-
 fs/ocfs2/dlm/dlmfs.c                      |   6 +-
 fs/ocfs2/super.c                          |  12 ++--
 fs/openpromfs/inode.c                     |   6 +-
 fs/pipe.c                                 |   9 ++-
 fs/proc/root.c                            |   6 +-
 fs/qnx4/inode.c                           |   7 +-
 fs/ramfs/inode.c                          |  13 ++--
 fs/reiserfs/super.c                       |   9 +--
 fs/romfs/inode.c                          |   7 +-
 fs/smbfs/inode.c                          |   6 +-
 fs/super.c                                | 109 +++++++++++++++++-------------
 fs/sysfs/mount.c                          |   6 +-
 fs/sysv/super.c                           |  13 ++--
 fs/udf/super.c                            |   6 +-
 fs/ufs/super.c                            |   6 +-
 fs/vfat/namei.c                           |   9 +--
 fs/xfs/linux-2.6/xfs_super.c              |   8 ++-
 include/linux/dcache.h                    |   1 -
 include/linux/fs.h                        |  25 ++++---
 include/linux/ramfs.h                     |   4 +-
 ipc/mqueue.c                              |   8 +--
 kernel/cpuset.c                           |   8 +--
 kernel/futex.c                            |   8 +--
 mm/shmem.c                                |   6 +-
 net/socket.c                              |   7 +-
 net/sunrpc/rpc_pipe.c                     |   6 +-
 security/inode.c                          |   8 +--
 security/selinux/selinuxfs.c              |   7 +-
 83 files changed, 482 insertions(+), 431 deletions(-)

(limited to 'mm')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 1045da582b9..3abf08f1b14 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -142,15 +142,16 @@ see also dquot_operations section.
 
 --------------------------- file_system_type ---------------------------
 prototypes:
-	struct super_block *(*get_sb) (struct file_system_type *, int,
-			const char *, void *);
+	struct int (*get_sb) (struct file_system_type *, int,
+			const char *, void *, struct vfsmount *);
 	void (*kill_sb) (struct super_block *);
 locking rules:
 		may block	BKL
 get_sb		yes		yes
 kill_sb		yes		yes
 
-->get_sb() returns error or a locked superblock (exclusive on ->s_umount).
+->get_sb() returns error or 0 with locked superblock attached to the vfsmount
+(exclusive on ->s_umount).
 ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
 unlocks and drops the reference.
 
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 2f388460cbe..5531694059a 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -50,10 +50,11 @@ Turn your foo_read_super() into a function that would return 0 in case of
 success and negative number in case of error (-EINVAL unless you have more
 informative error value to report).  Call it foo_fill_super().  Now declare
 
-struct super_block foo_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+int foo_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super,
+			   mnt);
 }
 
 (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3a2e5520c1e..dd7d0dcedc8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -113,8 +113,8 @@ members are defined:
 struct file_system_type {
 	const char *name;
 	int fs_flags;
-        struct super_block *(*get_sb) (struct file_system_type *, int,
-                                       const char *, void *);
+        struct int (*get_sb) (struct file_system_type *, int,
+                              const char *, void *, struct vfsmount *);
         void (*kill_sb) (struct super_block *);
         struct module *owner;
         struct file_system_type * next;
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 077f21216b6..2359e2809f5 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -595,10 +595,11 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 }
 
 
-static struct super_block *
-pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
+static int
+pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data,
+	     struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
+	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt);
 }
 
 static struct file_system_type pfm_fs_type = {
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 1987697b23a..7b4572805db 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -436,11 +436,11 @@ spufs_fill_super(struct super_block *sb, void *data, int silent)
 	return spufs_create_root(sb, data);
 }
 
-static struct super_block *
+static int
 spufs_get_sb(struct file_system_type *fstype, int flags,
-		const char *name, void *data)
+		const char *name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fstype, flags, data, spufs_fill_super);
+	return get_sb_single(fstype, flags, data, spufs_fill_super, mnt);
 }
 
 static struct file_system_type spufs_type = {
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 5ec2d49e9bb..e57d3c50f75 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -821,11 +821,12 @@ static void ib_uverbs_remove_one(struct ib_device *device)
 	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
 }
 
-static struct super_block *uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
-					       const char *dev_name, void *data)
+static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
 {
 	return get_sb_pseudo(fs_type, "infinibandevent:", NULL,
-			     INFINIBANDEVENTFS_MAGIC);
+			     INFINIBANDEVENTFS_MAGIC, mnt);
 }
 
 static struct file_system_type uverbs_event_fs = {
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index e274120567e..63de3046aff 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -542,13 +542,14 @@ bail:
 	return ret;
 }
 
-static struct super_block *ipathfs_get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int ipathfs_get_sb(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	ipath_super = get_sb_single(fs_type, flags, data,
-				    ipathfs_fill_super);
-	return ipath_super;
+	int ret = get_sb_single(fs_type, flags, data,
+				    ipathfs_fill_super, mnt);
+	if (ret >= 0)
+		ipath_super = mnt->mnt_sb;
+	return ret;
 }
 
 static void ipathfs_kill_super(struct super_block *s)
diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c
index 0a37aded4b5..9ea6bd0ddc3 100644
--- a/drivers/isdn/capi/capifs.c
+++ b/drivers/isdn/capi/capifs.c
@@ -121,10 +121,10 @@ fail:
 	return -ENOMEM;
 }
 
-static struct super_block *capifs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int capifs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, capifs_fill_super);
+	return get_sb_single(fs_type, flags, data, capifs_fill_super, mnt);
 }
 
 static struct file_system_type capifs_fs_type = {
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 26a230b6ff8..4a35caff5d0 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -90,10 +90,11 @@ static void ibmasmfs_create_files (struct super_block *sb, struct dentry *root);
 static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent);
 
 
-static struct super_block *ibmasmfs_get_super(struct file_system_type *fst,
-			int flags, const char *name, void *data)
+static int ibmasmfs_get_super(struct file_system_type *fst,
+			int flags, const char *name, void *data,
+			struct vfsmount *mnt)
 {
-	return get_sb_single(fst, flags, data, ibmasmfs_fill_super);
+	return get_sb_single(fst, flags, data, ibmasmfs_fill_super, mnt);
 }
 
 static struct super_operations ibmasmfs_s_ops = {
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index b62da9b0cbf..71c2da277d6 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -272,10 +272,10 @@ static int oprofilefs_fill_super(struct super_block * sb, void * data, int silen
 }
 
 
-static struct super_block *oprofilefs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int oprofilefs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, oprofilefs_fill_super);
+	return get_sb_single(fs_type, flags, data, oprofilefs_fill_super, mnt);
 }
 
 
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 3cf945cc5b9..95f5ad923b0 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -543,10 +543,10 @@ static void fs_remove_file (struct dentry *dentry)
 
 /* --------------------------------------------------------------------- */
 
-static struct super_block *usb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int usb_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, usbfs_fill_super);
+	return get_sb_single(fs_type, flags, data, usbfs_fill_super, mnt);
 }
 
 static struct file_system_type usb_fs_type = {
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index aef0722b8f1..3bdc5e3ba23 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -2070,11 +2070,11 @@ enomem0:
 }
 
 /* "mount -t gadgetfs path /dev/gadget" ends up here */
-static struct super_block *
+static int
 gadgetfs_get_sb (struct file_system_type *t, int flags,
-		const char *path, void *opts)
+		const char *path, void *opts, struct vfsmount *mnt)
 {
-	return get_sb_single (t, flags, opts, gadgetfs_fill_super);
+	return get_sb_single (t, flags, opts, gadgetfs_fill_super, mnt);
 }
 
 static void
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e..872943004e5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -99,12 +99,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
  * @flags: mount flags
  * @dev_name: device name that was mounted
  * @data: mount options
+ * @mnt: mountpoint record to be instantiated
  *
  */
 
-static struct super_block *v9fs_get_sb(struct file_system_type
-				       *fs_type, int flags,
-				       const char *dev_name, void *data)
+static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data,
+		       struct vfsmount *mnt)
 {
 	struct super_block *sb = NULL;
 	struct v9fs_fcall *fcall = NULL;
@@ -123,17 +124,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
 		dprintk(DEBUG_ERROR, "problem initiating session\n");
-		sb = ERR_PTR(newfid);
+		retval = newfid;
 		goto out_free_session;
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-	if (IS_ERR(sb))
+	if (IS_ERR(sb)) {
+		retval = PTR_ERR(sb);
 		goto out_close_session;
+	}
 	v9fs_fill_super(sb, v9ses, flags);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -184,19 +187,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		goto put_back_sb;
 	}
 
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
 out_close_session:
 	v9fs_session_close(v9ses);
 out_free_session:
 	kfree(v9ses);
-	return sb;
+	return retval;
 
 put_back_sb:
 	/* deactivate_super calls v9fs_kill_super which will frees the rest */
 	up_write(&sb->s_umount);
 	deactivate_super(sb);
-	return ERR_PTR(retval);
+	return retval;
 }
 
 /**
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 252abda0d20..1b58a9b7f6a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -470,10 +470,11 @@ error:
 	return -EINVAL;
 }
 
-static struct super_block *adfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int adfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type adfs_fs_type = {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4d7e5b19e5c..6a52e787540 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -524,10 +524,11 @@ affs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *affs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int affs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type affs_fs_type = {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231a..82468df0ba5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -38,9 +38,9 @@ struct afs_mount_params {
 static void afs_i_init_once(void *foo, kmem_cache_t *cachep,
 			    unsigned long flags);
 
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name,
-				      void *data);
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name,
+		      void *data, struct vfsmount *mnt);
 
 static struct inode *afs_alloc_inode(struct super_block *sb);
 
@@ -294,10 +294,11 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
  * get an AFS superblock
  * - TODO: don't use get_sb_nodev(), but rather call sget() directly
  */
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags,
-				      const char *dev_name,
-				      void *options)
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags,
+		      const char *dev_name,
+		      void *options,
+		      struct vfsmount *mnt)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -311,7 +312,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 	ret = afscm_start();
 	if (ret < 0) {
 		_leave(" = %d", ret);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	/* parse the options */
@@ -348,18 +349,19 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 		goto error;
 	}
 	sb->s_flags |= MS_ACTIVE;
+	simple_set_mnt(mnt, sb);
 
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
-	_leave(" = %p", sb);
-	return sb;
+	_leave(" = 0 [%p]", 0, sb);
+	return 0;
 
  error:
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
 	afscm_stop();
 	_leave(" = %d", ret);
-	return ERR_PTR(ret);
+	return ret;
 } /* end afs_get_sb() */
 
 /*****************************************************************************/
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b977ece69f0..aca12375240 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index acecec8578c..5d9193332be 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 68ebd10f345..6ed07a5f10c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -899,11 +899,12 @@ befs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *
+static int
 befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-	    void *data)
+	    void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type befs_fs_type = {
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 55a7a78332f..e7da03f63a5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -410,10 +410,10 @@ out:
 	return -EINVAL;
 }
 
-static struct super_block *bfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
 }
 
 static struct file_system_type bfs_fs_type = {
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 599f36fd0f6..07a4996cca3 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -739,10 +739,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 	return err;
 }
 
-static struct super_block *bm_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bm_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, bm_fill_super);
+	return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
 }
 
 static struct linux_binfmt misc_format = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 44aaba202f7..028d9fb9c2d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -300,10 +300,10 @@ static struct super_operations bdev_sops = {
 	.clear_inode = bdev_clear_inode,
 };
 
-static struct super_block *bd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
 }
 
 static struct file_system_type bd_type = {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce..08b35801dfe 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -460,9 +460,9 @@ struct super_operations cifs_super_ops = {
 	.remount_fs = cifs_remount,
 };
 
-static struct super_block *
+static int
 cifs_get_sb(struct file_system_type *fs_type,
-	    int flags, const char *dev_name, void *data)
+	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	int rc;
 	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
@@ -470,7 +470,7 @@ cifs_get_sb(struct file_system_type *fs_type,
 	cFYI(1, ("Devname: %s flags: %d ", dev_name, flags));
 
 	if (IS_ERR(sb))
-		return sb;
+		return PTR_ERR(sb);
 
 	sb->s_flags = flags;
 
@@ -478,10 +478,10 @@ cifs_get_sb(struct file_system_type *fs_type,
 	if (rc) {
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(rc);
+		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 }
 
 static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index ada1a81df6b..cba70201567 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -307,10 +307,10 @@ static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
 
 /* init_coda: used by filesystems.c to register coda */
 
-static struct super_block *coda_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int coda_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, coda_fill_super);
+	return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
 }
 
 struct file_system_type coda_fs_type = {
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e..94dab7bdd85 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -103,10 +103,10 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+	return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
 }
 
 static struct file_system_type configfs_fs_type = {
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9efcc3a164e..37a91a153aa 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -528,10 +528,11 @@ static struct super_operations cramfs_ops = {
 	.statfs		= cramfs_statfs,
 };
 
-static struct super_block *cramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int cramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type cramfs_fs_type = {
diff --git a/fs/dcache.c b/fs/dcache.c
index 59dbc92c207..313b54b2b8f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -687,46 +687,6 @@ void shrink_dcache_parent(struct dentry * parent)
 		prune_dcache(found, parent->d_sb);
 }
 
-/**
- * shrink_dcache_anon - further prune the cache
- * @head: head of d_hash list of dentries to prune
- *
- * Prune the dentries that are anonymous
- *
- * parsing d_hash list does not hlist_for_each_entry_rcu() as it
- * done under dcache_lock.
- *
- */
-void shrink_dcache_anon(struct super_block *sb)
-{
-	struct hlist_node *lp;
-	struct hlist_head *head = &sb->s_anon;
-	int found;
-	do {
-		found = 0;
-		spin_lock(&dcache_lock);
-		hlist_for_each(lp, head) {
-			struct dentry *this = hlist_entry(lp, struct dentry, d_hash);
-			if (!list_empty(&this->d_lru)) {
-				dentry_stat.nr_unused--;
-				list_del_init(&this->d_lru);
-			}
-
-			/* 
-			 * move only zero ref count dentries to the end 
-			 * of the unused list for prune_dcache
-			 */
-			if (!atomic_read(&this->d_count)) {
-				list_add_tail(&this->d_lru, &dentry_unused);
-				dentry_stat.nr_unused++;
-				found++;
-			}
-		}
-		spin_unlock(&dcache_lock);
-		prune_dcache(found, sb);
-	} while(found);
-}
-
 /*
  * Scan `nr' dentries and return the number which remain.
  *
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b55b4ea9a67..440128ebef3 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -111,11 +111,11 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
 
-static struct super_block *debug_get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int debug_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, debug_fill_super);
+	return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
 }
 
 static struct file_system_type debug_fs_type = {
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 52f5059c4f3..51a97f13274 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2549,11 +2549,11 @@ static int devfs_fill_super(struct super_block *sb, void *data, int silent)
 	return -EINVAL;
 }				/*  End Function devfs_fill_super  */
 
-static struct super_block *devfs_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int devfs_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devfs_fill_super);
+	return get_sb_single(fs_type, flags, data, devfs_fill_super, mnt);
 }
 
 static struct file_system_type devfs_fs_type = {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14c5620b5ca..f7aef5bb584 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -130,10 +130,10 @@ fail:
 	return -ENOMEM;
 }
 
-static struct super_block *devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int devpts_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devpts_fill_super);
+	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
 }
 
 static struct file_system_type devpts_fs_type = {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index dff623e3ddb..1ba5e14f879 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -18,10 +18,10 @@
 static int efs_statfs(struct super_block *s, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
-static struct super_block *efs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int efs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
 }
 
 static struct file_system_type efs_fs_type = {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2695337d4d6..08e7e6a555c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -268,9 +268,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout);
 static int eventpollfs_delete_dentry(struct dentry *dentry);
 static struct inode *ep_eventpoll_inode(void);
-static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data);
+static int eventpollfs_get_sb(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, struct vfsmount *mnt);
 
 /*
  * This semaphore is used to serialize ep_free() and eventpoll_release_file().
@@ -1595,11 +1595,12 @@ eexit_1:
 }
 
 
-static struct super_block *
+static int
 eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
-		   const char *dev_name, void *data)
+		   const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
+			     mnt);
 }
 
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7e30bae174e..a4dfffac596 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1087,10 +1087,10 @@ static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
 	return 0;
 }
 
-static struct super_block *ext2_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext2_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
 }
 
 #ifdef CONFIG_QUOTA
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f8a5266ea1f..657f8e73b62 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2646,10 +2646,10 @@ out:
 
 #endif
 
-static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext3_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
 }
 
 static struct file_system_type ext3_fs_type = {
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index b44c916d24a..d76eeaafbde 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -241,10 +241,11 @@ out:
 /*
  * The usual module blurb.
  */
-static struct super_block *vxfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int vxfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vxfs_fs_type = {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b..c91f0a50aad 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -569,11 +569,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	return err;
 }
 
-static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *raw_data)
+static int fuse_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *raw_data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 
 static struct file_system_type fuse_fs_type = {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1181d116117..ee5b80a409e 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -413,10 +413,11 @@ bail:
 	return res;
 }
 
-static struct super_block *hfs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name, void *data)
+static int hfs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
 }
 
 static struct file_system_type hfs_fs_type = {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7843f792a4b..0ed8b7e8e87 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -450,10 +450,12 @@ static void hfsplus_destroy_inode(struct inode *inode)
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
-static struct super_block *hfsplus_get_sb(struct file_system_type *fs_type,
-					  int flags, const char *dev_name, void *data)
+static int hfsplus_get_sb(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data,
+			  struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hfsplus_fs_type = {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bf0f8e16e43..04035e08f5c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -993,11 +993,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hostfs_read_sb(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hostfs_read_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common));
+	return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
 }
 
 static struct file_system_type hostfs_type = {
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index d72d8c87c99..3b25cf3e2e6 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -662,10 +662,11 @@ bail0:
 	return -EINVAL;
 }
 
-static struct super_block *hpfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hpfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hpfs_fs_type = {
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 5e6363be246..ec43c22bc9c 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -769,11 +769,11 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hppfs_read_super(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hppfs_read_super(struct file_system_type *type,
+			    int flags, const char *dev_name,
+			    void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hppfs_fill_super));
+	return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
 }
 
 static struct file_system_type hppfs_type = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a5b4e92345..4665c26171f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -723,10 +723,10 @@ void hugetlb_put_quota(struct address_space *mapping)
 	}
 }
 
-static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
 }
 
 static struct file_system_type hugetlbfs_fs_type = {
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 9e9931e2bad..f2386442ade 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -672,11 +672,11 @@ out:
 	return ret;
 }
 
-static struct super_block *
+static int
 inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data)
+	       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+	return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA, mnt);
 }
 
 static struct file_system_type inotify_fs_type = {
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 70adbb98bad..17268da63a4 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1399,10 +1399,11 @@ struct inode *isofs_iget(struct super_block *sb,
 	return inode;
 }
 
-static struct super_block *isofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int isofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type iso9660_fs_type = {
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 020cc097c53..dd93a091ad6 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1785,10 +1785,11 @@ static struct super_operations jffs_ops =
 	.remount_fs	= jffs_remount,
 };
 
-static struct super_block *jffs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int jffs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type jffs_fs_type = {
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9d0521451f5..2378a662c25 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -111,9 +111,10 @@ static int jffs2_sb_set(struct super_block *sb, void *data)
 	return 0;
 }
 
-static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, struct mtd_info *mtd)
+static int jffs2_get_sb_mtd(struct file_system_type *fs_type,
+			    int flags, const char *dev_name,
+			    void *data, struct mtd_info *mtd,
+			    struct vfsmount *mnt)
 {
 	struct super_block *sb;
 	struct jffs2_sb_info *c;
@@ -121,19 +122,20 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 
 	c = kmalloc(sizeof(*c), GFP_KERNEL);
 	if (!c)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	memset(c, 0, sizeof(*c));
 	c->mtd = mtd;
 
 	sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c);
 
 	if (IS_ERR(sb))
-		goto out_put;
+		goto out_error;
 
 	if (sb->s_root) {
 		/* New mountpoint for JFFS2 which is already mounted */
 		D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n",
 			  mtd->index, mtd->name));
+		ret = simple_set_mnt(mnt, sb);
 		goto out_put;
 	}
 
@@ -161,44 +163,47 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 		/* Failure case... */
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
+out_error:
+	ret = PTR_ERR(sb);
  out_put:
 	kfree(c);
 	put_mtd_device(mtd);
 
-	return sb;
+	return ret;
 }
 
-static struct super_block *jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, int mtdnr)
+static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, int mtdnr,
+			      struct vfsmount *mnt)
 {
 	struct mtd_info *mtd;
 
 	mtd = get_mtd_device(NULL, mtdnr);
 	if (!mtd) {
 		D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr));
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 }
 
-static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int jffs2_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
 	int err;
 	struct nameidata nd;
 	int mtdnr;
 
 	if (!dev_name)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name));
 
@@ -220,7 +225,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 				mtd = get_mtd_device(NULL, mtdnr);
 				if (mtd) {
 					if (!strcmp(mtd->name, dev_name+4))
-						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 					put_mtd_device(mtd);
 				}
 			}
@@ -233,7 +238,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 			if (!*endptr) {
 				/* It was a valid number */
 				D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr));
-				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 			}
 		}
 	}
@@ -247,7 +252,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 		  err, nd.dentry->d_inode));
 
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	err = -EINVAL;
 
@@ -269,11 +274,11 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 	mtdnr = iminor(nd.dentry->d_inode);
 	path_release(&nd);
 
-	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 
 out:
 	path_release(&nd);
-	return ERR_PTR(err);
+	return err;
 }
 
 static void jffs2_put_super (struct super_block *sb)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index db6f41d6dd6..18a28137b90 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -565,10 +565,11 @@ static void jfs_unlockfs(struct super_block *sb)
 	}
 }
 
-static struct super_block *jfs_get_sb(struct file_system_type *fs_type, 
-	int flags, const char *dev_name, void *data)
+static int jfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
+			   mnt);
 }
 
 static int jfs_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d..7d70efa46da 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -196,9 +196,9 @@ struct inode_operations simple_dir_inode_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-struct super_block *
-get_sb_pseudo(struct file_system_type *fs_type, char *name,
-	struct super_operations *ops, unsigned long magic)
+int get_sb_pseudo(struct file_system_type *fs_type, char *name,
+	struct super_operations *ops, unsigned long magic,
+	struct vfsmount *mnt)
 {
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 	static struct super_operations default_ops = {.statfs = simple_statfs};
@@ -207,7 +207,7 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	struct qstr d_name = {.name = name, .len = strlen(name)};
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = MS_NOUSER;
 	s->s_maxbytes = ~0ULL;
@@ -232,12 +232,12 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 
 Enomem:
 	up_write(&s->s_umount);
 	deactivate_super(s);
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 2dcccf1d1b7..14f24dfbfe3 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -559,10 +559,11 @@ void minix_truncate(struct inode * inode)
 		V2_minix_truncate(inode);
 }
 
-static struct super_block *minix_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int minix_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
+			   mnt);
 }
 
 static struct file_system_type minix_fs_type = {
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 5b76ccd19e3..9e44158a754 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -661,11 +661,12 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *msdos_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int msdos_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+			   mnt);
 }
 
 static struct file_system_type msdos_fs_type = {
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb85..c13072a5f1e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -86,6 +86,15 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	return mnt;
 }
 
+int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+{
+	mnt->mnt_sb = sb;
+	mnt->mnt_root = dget(sb->s_root);
+	return 0;
+}
+
+EXPORT_SYMBOL(simple_set_mnt);
+
 void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index a1f3e972c6e..8db033fab3f 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -957,10 +957,10 @@ out:
 	return result;
 }
 
-static struct super_block *ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ncp_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
 }
 
 static struct file_system_type ncp_fs_type = {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d0b991a9232..ff645a961bc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1690,8 +1690,8 @@ static int nfs_compare_super(struct super_block *sb, void *data)
 	return !nfs_compare_fh(&old->fh, &server->fh);
 }
 
-static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static int nfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	int error;
 	struct nfs_server *server = NULL;
@@ -1699,14 +1699,14 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	struct nfs_fh *root;
 	struct nfs_mount_data *data = raw_data;
 
-	s = ERR_PTR(-EINVAL);
+	error = -EINVAL;
 	if (data == NULL) {
 		dprintk("%s: missing data argument\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
 		dprintk("%s: bad mount version\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 	switch (data->version) {
 		case 1:
@@ -1718,7 +1718,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 				dprintk("%s: mount structure version %d does not support NFSv3\n",
 						__FUNCTION__,
 						data->version);
-				goto out_err;
+				goto out_err_noserver;
 			}
 			data->root.size = NFS2_FHSIZE;
 			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
@@ -1727,24 +1727,24 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 				dprintk("%s: mount structure version %d does not support strong security\n",
 						__FUNCTION__,
 						data->version);
-				goto out_err;
+				goto out_err_noserver;
 			}
 		case 5:
 			memset(data->context, 0, sizeof(data->context));
 	}
 #ifndef CONFIG_NFS_V3
 	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
-	s = ERR_PTR(-EPROTONOSUPPORT);
+	error = -EPROTONOSUPPORT;
 	if (data->flags & NFS_MOUNT_VER3) {
 		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 #endif /* CONFIG_NFS_V3 */
 
-	s = ERR_PTR(-ENOMEM);
+	error = -ENOMEM;
 	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (!server)
-		goto out_err;
+		goto out_err_noserver;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
 	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
@@ -1754,7 +1754,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 		root->size = data->root.size;
 	else
 		root->size = NFS2_FHSIZE;
-	s = ERR_PTR(-EINVAL);
+	error = -EINVAL;
 	if (root->size > sizeof(root->data)) {
 		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
 		goto out_err;
@@ -1770,15 +1770,20 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	}
 
 	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
 		goto out_err;
 	}
 
 	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
-	if (IS_ERR(s) || s->s_root)
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto out_err_rpciod;
+	}
+
+	if (s->s_root)
 		goto out_rpciod_down;
 
 	s->s_flags = flags;
@@ -1787,15 +1792,22 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
+
 out_rpciod_down:
 	rpciod_down();
+	kfree(server);
+	return simple_set_mnt(mnt, s);
+
+out_err_rpciod:
+	rpciod_down();
 out_err:
 	kfree(server);
-	return s;
+out_err_noserver:
+	return error;
 }
 
 static void nfs_kill_super(struct super_block *s)
@@ -2032,8 +2044,8 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
 	return dst;
 }
 
-static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static int nfs4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	int error;
 	struct nfs_server *server;
@@ -2043,16 +2055,16 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 
 	if (data == NULL) {
 		dprintk("%s: missing data argument\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 	if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
 		dprintk("%s: bad mount version\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (!server)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
 	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
@@ -2074,33 +2086,41 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 
 	/* We now require that the mount process passes the remote address */
 	if (data->host_addrlen != sizeof(server->addr)) {
-		s = ERR_PTR(-EINVAL);
+		error = -EINVAL;
 		goto out_free;
 	}
 	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
-		s = ERR_PTR(-EFAULT);
+		error = -EFAULT;
 		goto out_free;
 	}
 	if (server->addr.sin_family != AF_INET ||
 	    server->addr.sin_addr.s_addr == INADDR_ANY) {
 		dprintk("%s: mount program didn't pass remote IP address!\n",
 				__FUNCTION__);
-		s = ERR_PTR(-EINVAL);
+		error = -EINVAL;
 		goto out_free;
 	}
 
 	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
 		goto out_free;
 	}
 
 	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
-
-	if (IS_ERR(s) || s->s_root)
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
 		goto out_free;
+	}
+
+	if (s->s_root) {
+		kfree(server->mnt_path);
+		kfree(server->hostname);
+		kfree(server);
+		return simple_set_mnt(mnt, s);
+	}
 
 	s->s_flags = flags;
 
@@ -2108,17 +2128,17 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 out_err:
-	s = (struct super_block *)p;
+	error = PTR_ERR(p);
 out_free:
 	kfree(server->mnt_path);
 	kfree(server->hostname);
 	kfree(server);
-	return s;
+	return error;
 }
 
 static void nfs4_kill_super(struct super_block *sb)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3ef017b3b5b..a1810e6a93e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -494,10 +494,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
-static struct super_block *nfsd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int nfsd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, nfsd_fill_super);
+	return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
 }
 
 static struct file_system_type nfsd_fs_type = {
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 27833f6df49..d5d5e969294 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3093,10 +3093,11 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
 
-static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ntfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ntfs_fs_type = {
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7e88e24b347..7273d9fa6ba 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -574,10 +574,10 @@ static struct inode_operations dlmfs_file_inode_operations = {
 	.getattr	= simple_getattr,
 };
 
-static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
 }
 
 static struct file_system_type dlmfs_fs_type = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 949b3dac30f..788b8b50dc4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -672,12 +672,14 @@ read_super_error:
 	return status;
 }
 
-static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
-					int flags,
-					const char *dev_name,
-					void *data)
+static int ocfs2_get_sb(struct file_system_type *fs_type,
+			int flags,
+			const char *dev_name,
+			void *data,
+			struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ocfs2_fs_type = {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 0f14276a2e5..464e2bce020 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1054,10 +1054,10 @@ out_no_root:
 	return -ENOMEM;
 }
 
-static struct super_block *openprom_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int openprom_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, openprom_fill_super);
+	return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
 }
 
 static struct file_system_type openprom_fs_type = {
diff --git a/fs/pipe.c b/fs/pipe.c
index 5acd8954aaa..20352573e02 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -979,12 +979,11 @@ no_files:
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-
-static struct super_block *
-pipefs_get_sb(struct file_system_type *fs_type, int flags,
-	      const char *dev_name, void *data)
+static int pipefs_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data,
+			 struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
 }
 
 static struct file_system_type pipe_fs_type = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c3fd3611112..9995356ce73 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -26,10 +26,10 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc
 struct proc_dir_entry *proc_sys_root;
 #endif
 
-static struct super_block *proc_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int proc_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, proc_fill_super);
+	return get_sb_single(fs_type, flags, data, proc_fill_super, mnt);
 }
 
 static struct file_system_type proc_fs_type = {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2ecd46f85e9..e6cca5cd4b4 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -561,10 +561,11 @@ static void destroy_inodecache(void)
 		       "qnx4_inode_cache: not all structures were freed\n");
 }
 
-static struct super_block *qnx4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int qnx4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
+			   mnt);
 }
 
 static struct file_system_type qnx4_fs_type = {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 14bd2246fb6..b9677335cc8 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -185,16 +185,17 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	return 0;
 }
 
-struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+int ramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
 }
 
-static struct super_block *rootfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int rootfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
+			    mnt);
 }
 
 static struct file_system_type ramfs_fs_type = {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cae2abbc0c7..f3ff41d3398 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2249,11 +2249,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 
 #endif
 
-static struct super_block *get_super_block(struct file_system_type *fs_type,
-					   int flags, const char *dev_name,
-					   void *data)
+static int get_super_block(struct file_system_type *fs_type,
+			   int flags, const char *dev_name,
+			   void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
+			   mnt);
 }
 
 static int __init init_reiserfs_fs(void)
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 9b9eda7b335..4d6cd662106 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -607,10 +607,11 @@ static struct super_operations romfs_ops = {
 	.remount_fs	= romfs_remount,
 };
 
-static struct super_block *romfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int romfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type romfs_fs_type = {
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fdeabc0a34f..4a37c2bbfa3 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -782,10 +782,10 @@ out:
 	return error;
 }
 
-static struct super_block *smb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int smb_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, smb_fill_super);
+	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
 }
 
 static struct file_system_type smb_fs_type = {
diff --git a/fs/super.c b/fs/super.c
index 9d5c2add722..324c2d232f5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (root) {
 		sb->s_root = NULL;
 		shrink_dcache_parent(root);
-		shrink_dcache_anon(sb);
+		shrink_dcache_sb(sb);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);
@@ -676,9 +676,10 @@ static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
 	}
 }
 
-struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -686,7 +687,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 
 	bdev = open_bdev_excl(dev_name, flags, fs_type);
 	if (IS_ERR(bdev))
-		return (struct super_block *)bdev;
+		return PTR_ERR(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -697,15 +698,17 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
 	mutex_unlock(&bdev->bd_mount_mutex);
 	if (IS_ERR(s))
-		goto out;
+		goto error_s;
 
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(-EBUSY);
+			error = -EBUSY;
+			goto error_bdev;
 		}
-		goto out;
+
+		close_bdev_excl(bdev);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -716,18 +719,21 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(error);
-		} else {
-			s->s_flags |= MS_ACTIVE;
-			bdev_uevent(bdev, KOBJ_MOUNT);
+			goto error;
 		}
+
+		s->s_flags |= MS_ACTIVE;
+		bdev_uevent(bdev, KOBJ_MOUNT);
 	}
 
-	return s;
+	return simple_set_mnt(mnt, s);
 
-out:
+error_s:
+	error = PTR_ERR(s);
+error_bdev:
 	close_bdev_excl(bdev);
-	return s;
+error:
+	return error;
 }
 
 EXPORT_SYMBOL(get_sb_bdev);
@@ -744,15 +750,16 @@ void kill_block_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_block_super);
 
-struct super_block *get_sb_nodev(struct file_system_type *fs_type,
+int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	int error;
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = flags;
 
@@ -760,10 +767,10 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_nodev);
@@ -773,94 +780,102 @@ static int compare_single(struct super_block *s, void *p)
 	return 1;
 }
 
-struct super_block *get_sb_single(struct file_system_type *fs_type,
+int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct super_block *s;
 	int error;
 
 	s = sget(fs_type, compare_single, set_anon_super, NULL);
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			return ERR_PTR(error);
+			return error;
 		}
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_single);
 
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct file_system_type *type = get_fs_type(fstype);
-	struct super_block *sb = ERR_PTR(-ENOMEM);
 	struct vfsmount *mnt;
-	int error;
 	char *secdata = NULL;
+	int error;
 
 	if (!type)
 		return ERR_PTR(-ENODEV);
 
+	error = -ENOMEM;
 	mnt = alloc_vfsmnt(name);
 	if (!mnt)
 		goto out;
 
 	if (data) {
 		secdata = alloc_secdata();
-		if (!secdata) {
-			sb = ERR_PTR(-ENOMEM);
+		if (!secdata)
 			goto out_mnt;
-		}
 
 		error = security_sb_copy_data(type, data, secdata);
-		if (error) {
-			sb = ERR_PTR(error);
+		if (error)
 			goto out_free_secdata;
-		}
 	}
 
-	sb = type->get_sb(type, flags, name, data);
-	if (IS_ERR(sb))
+	error = type->get_sb(type, flags, name, data, mnt);
+	if (error < 0)
 		goto out_free_secdata;
- 	error = security_sb_kern_mount(sb, secdata);
+
+ 	error = security_sb_kern_mount(mnt->mnt_sb, secdata);
  	if (error)
  		goto out_sb;
-	mnt->mnt_sb = sb;
-	mnt->mnt_root = dget(sb->s_root);
-	mnt->mnt_mountpoint = sb->s_root;
+
+	mnt->mnt_mountpoint = mnt->mnt_root;
 	mnt->mnt_parent = mnt;
-	up_write(&sb->s_umount);
+	up_write(&mnt->mnt_sb->s_umount);
 	free_secdata(secdata);
-	put_filesystem(type);
 	return mnt;
 out_sb:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
-	sb = ERR_PTR(error);
+	dput(mnt->mnt_root);
+	up_write(&mnt->mnt_sb->s_umount);
+	deactivate_super(mnt->mnt_sb);
 out_free_secdata:
 	free_secdata(secdata);
 out_mnt:
 	free_vfsmnt(mnt);
 out:
+	return ERR_PTR(error);
+}
+
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
 	put_filesystem(type);
-	return (struct vfsmount *)sb;
+	return mnt;
 }
 
 EXPORT_SYMBOL_GPL(do_kern_mount);
 
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-	return do_kern_mount(type->name, 0, type->name, NULL);
+	return vfs_kern_mount(type, 0, type->name, NULL);
 }
 
 EXPORT_SYMBOL(kern_mount);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f1117e885bd..40190c48927 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -66,10 +66,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *sysfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, sysfs_fill_super);
+	return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
 }
 
 static struct file_system_type sysfs_fs_type = {
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index e92b991e6dd..876639b9332 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -506,16 +506,17 @@ failed:
 
 /* Every kernel module contains stuff like this. */
 
-static struct super_block *sysv_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysv_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
+			   mnt);
 }
 
-static struct super_block *v7_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int v7_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
 }
 
 static struct file_system_type sysv_fs_type = {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e45789fe38e..2250774a831 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -94,10 +94,10 @@ static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct super_block *, struct kstatfs *);
 
 /* UDF filesystem type */
-static struct super_block *udf_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int udf_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
 }
 
 static struct file_system_type udf_fstype = {
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index db98a4c71e6..768fb8d9e67 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1311,10 +1311,10 @@ out:
 
 #endif
 
-static struct super_block *ufs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ufs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
 }
 
 static struct file_system_type ufs_fs_type = {
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a56cec3be5f..9a8f48bae95 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -1023,11 +1023,12 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *vfat_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *data)
+static int vfat_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vfat_fs_type = {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2a0778536f..d03c89a3665 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -853,14 +853,16 @@ fail_vfsop:
 	return -error;
 }
 
-STATIC struct super_block *
+STATIC int
 xfs_fs_get_sb(
 	struct file_system_type	*fs_type,
 	int			flags,
 	const char		*dev_name,
-	void			*data)
+	void			*data,
+	struct vfsmount		*mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
+			   mnt);
 }
 
 STATIC struct super_operations xfs_super_operations = {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 46d0e079735..0dd1610a94a 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -217,7 +217,6 @@ extern struct dentry * d_alloc_anon(struct inode *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
-extern void shrink_dcache_anon(struct super_block *);
 extern int d_invalidate(struct dentry *);
 
 /* only used at mount-time */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 73c7d6f04b3..3e50dd24af8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1269,23 +1269,26 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 struct file_system_type {
 	const char *name;
 	int fs_flags;
-	struct super_block *(*get_sb) (struct file_system_type *, int,
-				       const char *, void *);
+	int (*get_sb) (struct file_system_type *, int,
+		       const char *, void *, struct vfsmount *);
 	void (*kill_sb) (struct super_block *);
 	struct module *owner;
 	struct file_system_type * next;
 	struct list_head fs_supers;
 };
 
-struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+extern int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int));
-struct super_block *get_sb_single(struct file_system_type *fs_type,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt);
+extern int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int));
-struct super_block *get_sb_nodev(struct file_system_type *fs_type,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt);
+extern int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int));
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt);
 void generic_shutdown_super(struct super_block *sb);
 void kill_block_super(struct super_block *sb);
 void kill_anon_super(struct super_block *sb);
@@ -1296,8 +1299,10 @@ struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
 			void *data);
-struct super_block *get_sb_pseudo(struct file_system_type *, char *,
-			struct super_operations *ops, unsigned long);
+extern int get_sb_pseudo(struct file_system_type *, char *,
+	struct super_operations *ops, unsigned long,
+	struct vfsmount *mnt);
+extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 int __put_super(struct super_block *sb);
 int __put_super_and_need_restart(struct super_block *sb);
 void unnamed_dev_init(void);
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 78ecfa28b1c..00b340ba661 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -2,8 +2,8 @@
 #define _LINUX_RAMFS_H
 
 struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev);
-struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
-	 int flags, const char *dev_name, void *data);
+extern int ramfs_get_sb(struct file_system_type *fs_type,
+	 int flags, const char *dev_name, void *data, struct vfsmount *mnt);
 
 #ifndef CONFIG_MMU
 extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 1511714a958..0a2a24b6ebe 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -205,11 +205,11 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *mqueue_get_sb(struct file_system_type *fs_type,
-					 int flags, const char *dev_name,
-					 void *data)
+static int mqueue_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *dev_name,
+			 void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, mqueue_fill_super);
+	return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
 }
 
 static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572..77f45ffd5ea 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
 	return 0;
 }
 
-static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
-					int flags, const char *unused_dev_name,
-					void *data)
+static int cpuset_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *unused_dev_name,
+			 void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+	return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
 }
 
 static struct file_system_type cpuset_fs_type = {
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057..e1a380c77a5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
 			(unsigned long)uaddr2, val2, val3);
 }
 
-static struct super_block *
-futexfs_get_sb(struct file_system_type *fs_type,
-	       int flags, const char *dev_name, void *data)
+static int futexfs_get_sb(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data,
+			  struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
+	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
 }
 
 static struct file_system_type futex_fs_type = {
diff --git a/mm/shmem.c b/mm/shmem.c
index 1e43c8a865b..7617bb1c6bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2233,10 +2233,10 @@ static struct vm_operations_struct shmem_vm_ops = {
 };
 
 
-static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int shmem_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
+	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
 }
 
 static struct file_system_type tmpfs_fs_type = {
diff --git a/net/socket.c b/net/socket.c
index 02948b622bd..565f5e8d119 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -335,10 +335,11 @@ static struct super_operations sockfs_ops = {
 	.statfs =	simple_statfs,
 };
 
-static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sockfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
+	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
+			     mnt);
 }
 
 static struct vfsmount *sock_mnt __read_mostly;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index cc673dd8433..8241fa72680 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -815,11 +815,11 @@ out:
 	return -ENOMEM;
 }
 
-static struct super_block *
+static int
 rpc_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *data)
+		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, rpc_fill_super);
+	return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt);
 }
 
 static struct file_system_type rpc_pipe_fs_type = {
diff --git a/security/inode.c b/security/inode.c
index 0f77b022366..e6fc29ac856 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -135,11 +135,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, SECURITYFS_MAGIC, files);
 }
 
-static struct super_block *get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int get_sb(struct file_system_type *fs_type,
+		  int flags, const char *dev_name,
+		  void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, fill_super);
+	return get_sb_single(fs_type, flags, data, fill_super, mnt);
 }
 
 static struct file_system_type fs_type = {
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 2e73d3279f2..7029bbc9bef 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1345,10 +1345,11 @@ err:
 	goto out;
 }
 
-static struct super_block *sel_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name, void *data)
+static int sel_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, sel_fill_super);
+	return get_sb_single(fs_type, flags, data, sel_fill_super, mnt);
 }
 
 static struct file_system_type sel_fs_type = {
-- 
cgit v1.2.3-70-g09d2


From 726c334223180e3c0197cc980a432681370d4baf Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:02:58 -0700
Subject: [PATCH] VFS: Permit filesystem to perform statfs with a known root
 dentry

Give the statfs superblock operation a dentry pointer rather than a superblock
pointer.

This complements the get_sb() patch.  That reduced the significance of
sb->s_root, allowing NFS to place a fake root there.  However, NFS does
require a dentry to use as a target for the statfs operation.  This permits
the root in the vfsmount to be used instead.

linux/mount.h has been added where necessary to make allyesconfig build
successfully.

Interest has also been expressed for use with the FUSE and XFS filesystems.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nathan Scott <nathans@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/Locking |  2 +-
 Documentation/filesystems/vfs.txt |  2 +-
 arch/alpha/kernel/osf_sys.c       |  2 +-
 arch/mips/kernel/sysirix.c        | 12 ++++++------
 arch/parisc/hpux/sys_hpux.c       | 10 +++++-----
 arch/sparc64/solaris/fs.c         |  4 ++--
 fs/adfs/super.c                   |  8 ++++----
 fs/affs/super.c                   |  5 +++--
 fs/befs/linuxvfs.c                |  5 +++--
 fs/bfs/inode.c                    |  3 ++-
 fs/cifs/cifsfs.c                  |  3 ++-
 fs/coda/inode.c                   |  6 +++---
 fs/coda/upcall.c                  |  4 ++--
 fs/compat.c                       |  8 ++++----
 fs/cramfs/inode.c                 |  4 +++-
 fs/efs/super.c                    |  6 +++---
 fs/ext2/super.c                   |  5 +++--
 fs/ext3/super.c                   |  5 +++--
 fs/fat/inode.c                    |  8 ++++----
 fs/freevxfs/vxfs_super.c          | 13 +++++++------
 fs/fuse/inode.c                   |  3 ++-
 fs/hfs/super.c                    |  4 +++-
 fs/hfsplus/super.c                |  4 +++-
 fs/hostfs/hostfs_kern.c           |  4 ++--
 fs/hpfs/super.c                   |  3 ++-
 fs/hppfs/hppfs_kern.c             |  2 +-
 fs/hugetlbfs/inode.c              |  4 ++--
 fs/isofs/inode.c                  |  6 ++++--
 fs/jffs/inode-v23.c               |  4 ++--
 fs/jffs2/fs.c                     |  4 ++--
 fs/jffs2/os-linux.h               |  2 +-
 fs/jfs/super.c                    |  4 ++--
 fs/libfs.c                        |  4 ++--
 fs/minix/inode.c                  | 10 +++++-----
 fs/ncpfs/inode.c                  |  5 +++--
 fs/nfs/inode.c                    |  5 +++--
 fs/nfsd/nfs4xdr.c                 |  2 +-
 fs/nfsd/vfs.c                     |  2 +-
 fs/ntfs/super.c                   |  7 ++++---
 fs/ocfs2/super.c                  | 10 +++++-----
 fs/open.c                         | 26 +++++++++++++-------------
 fs/qnx4/inode.c                   |  6 ++++--
 fs/reiserfs/super.c               |  8 ++++----
 fs/romfs/inode.c                  |  4 ++--
 fs/smbfs/inode.c                  |  6 +++---
 fs/smbfs/proc.c                   |  4 ++--
 fs/smbfs/proto.h                  |  2 +-
 fs/super.c                        |  2 +-
 fs/sysv/inode.c                   |  3 ++-
 fs/udf/super.c                    |  6 ++++--
 fs/ufs/super.c                    |  3 ++-
 fs/xfs/linux-2.6/xfs_super.c      |  4 ++--
 include/linux/coda_psdev.h        |  2 +-
 include/linux/fs.h                |  6 +++---
 include/linux/mount.h             |  5 +++++
 include/linux/security.h          | 14 +++++++-------
 kernel/acct.c                     |  2 +-
 mm/shmem.c                        |  4 ++--
 security/dummy.c                  |  2 +-
 security/selinux/hooks.c          |  6 +++---
 60 files changed, 175 insertions(+), 144 deletions(-)

(limited to 'mm')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 3abf08f1b14..d31efbbdfe5 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -99,7 +99,7 @@ prototypes:
 	int (*sync_fs)(struct super_block *sb, int wait);
 	void (*write_super_lockfs) (struct super_block *);
 	void (*unlockfs) (struct super_block *);
-	int (*statfs) (struct super_block *, struct kstatfs *);
+	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index dd7d0dcedc8..9d3aed628bc 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -211,7 +211,7 @@ struct super_operations {
         int (*sync_fs)(struct super_block *sb, int wait);
         void (*write_super_lockfs) (struct super_block *);
         void (*unlockfs) (struct super_block *);
-        int (*statfs) (struct super_block *, struct kstatfs *);
+        int (*statfs) (struct dentry *, struct kstatfs *);
         int (*remount_fs) (struct super_block *, int *, char *);
         void (*clear_inode) (struct inode *);
         void (*umount_begin) (struct super_block *);
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 31afe3d91ac..e15dcf4f3dc 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -244,7 +244,7 @@ do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer,
 	      unsigned long bufsiz)
 {
 	struct kstatfs linux_stat;
-	int error = vfs_statfs(dentry->d_inode->i_sb, &linux_stat);
+	int error = vfs_statfs(dentry, &linux_stat);
 	if (!error)
 		error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
 	return error;	
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index 5407b784cd0..19e1ef43eb4 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -694,7 +694,7 @@ asmlinkage int irix_statfs(const char __user *path,
 	if (error)
 		goto out;
 
-	error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(nd.dentry, &kbuf);
 	if (error)
 		goto dput_and_out;
 
@@ -732,7 +732,7 @@ asmlinkage int irix_fstatfs(unsigned int fd, struct irix_statfs __user *buf)
 		goto out;
 	}
 
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(file->f_dentry, &kbuf);
 	if (error)
 		goto out_f;
 
@@ -1360,7 +1360,7 @@ asmlinkage int irix_statvfs(char __user *fname, struct irix_statvfs __user *buf)
 	error = user_path_walk(fname, &nd);
 	if (error)
 		goto out;
-	error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(nd.dentry, &kbuf);
 	if (error)
 		goto dput_and_out;
 
@@ -1406,7 +1406,7 @@ asmlinkage int irix_fstatvfs(int fd, struct irix_statvfs __user *buf)
 		error = -EBADF;
 		goto out;
 	}
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(file->f_dentry, &kbuf);
 	if (error)
 		goto out_f;
 
@@ -1611,7 +1611,7 @@ asmlinkage int irix_statvfs64(char __user *fname, struct irix_statvfs64 __user *
 	error = user_path_walk(fname, &nd);
 	if (error)
 		goto out;
-	error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(nd.dentry, &kbuf);
 	if (error)
 		goto dput_and_out;
 
@@ -1658,7 +1658,7 @@ asmlinkage int irix_fstatvfs64(int fd, struct irix_statvfs __user *buf)
 		error = -EBADF;
 		goto out;
 	}
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf);
+	error = vfs_statfs(file->f_dentry, &kbuf);
 	if (error)
 		goto out_f;
 
diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
index 05273ccced0..cb69727027a 100644
--- a/arch/parisc/hpux/sys_hpux.c
+++ b/arch/parisc/hpux/sys_hpux.c
@@ -145,7 +145,7 @@ static int hpux_ustat(dev_t dev, struct hpux_ustat __user *ubuf)
 	s = user_get_super(dev);
 	if (s == NULL)
 		goto out;
-	err = vfs_statfs(s, &sbuf);
+	err = vfs_statfs(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		goto out;
@@ -186,12 +186,12 @@ struct hpux_statfs {
      int16_t f_pad;
 };
 
-static int vfs_statfs_hpux(struct super_block *sb, struct hpux_statfs *buf)
+static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 	
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -219,7 +219,7 @@ asmlinkage long hpux_statfs(const char __user *path,
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct hpux_statfs tmp;
-		error = vfs_statfs_hpux(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs_hpux(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -237,7 +237,7 @@ asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_hpux(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs_hpux(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c
index 4885ca6cbc7..0f0eb6aa1c4 100644
--- a/arch/sparc64/solaris/fs.c
+++ b/arch/sparc64/solaris/fs.c
@@ -356,7 +356,7 @@ static int report_statvfs(struct vfsmount *mnt, struct inode *inode, u32 buf)
 	int error;
 	struct sol_statvfs __user *ss = A(buf);
 
-	error = vfs_statfs(mnt->mnt_sb, &s);
+	error = vfs_statfs(mnt->mnt_root, &s);
 	if (!error) {
 		const char *p = mnt->mnt_sb->s_type->name;
 		int i = 0;
@@ -392,7 +392,7 @@ static int report_statvfs64(struct vfsmount *mnt, struct inode *inode, u32 buf)
 	int error;
 	struct sol_statvfs64 __user *ss = A(buf);
 			
-	error = vfs_statfs(mnt->mnt_sb, &s);
+	error = vfs_statfs(mnt->mnt_root, &s);
 	if (!error) {
 		const char *p = mnt->mnt_sb->s_type->name;
 		int i = 0;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 1b58a9b7f6a..ba1c88af49f 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -196,17 +196,17 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 	return parse_options(sb, data);
 }
 
-static int adfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(sb);
+	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
 	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = sb->s_blocksize;
+	buf->f_bsize   = dentry->d_sb->s_blocksize;
 	buf->f_blocks  = asb->s_size;
 	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(sb);
+	buf->f_bfree   = adfs_map_free(dentry->d_sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
 
 	return 0;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 6a52e787540..8765cba35bb 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,7 +18,7 @@
 
 extern struct timezone sys_tz;
 
-static int affs_statfs(struct super_block *sb, struct kstatfs *buf);
+static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
 static void
@@ -508,8 +508,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-affs_statfs(struct super_block *sb, struct kstatfs *buf)
+affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int		 free;
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 6ed07a5f10c..08201fab26c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -49,7 +49,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static void befs_put_super(struct super_block *);
 static int befs_remount(struct super_block *, int *, char *);
-static int befs_statfs(struct super_block *, struct kstatfs *);
+static int befs_statfs(struct dentry *, struct kstatfs *);
 static int parse_options(char *, befs_mount_options *);
 
 static const struct super_operations befs_sops = {
@@ -880,8 +880,9 @@ befs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-befs_statfs(struct super_block *sb, struct kstatfs *buf)
+befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 
 	befs_debug(sb, "---> befs_statfs()");
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e7da03f63a5..cf74f3d4d96 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -203,8 +203,9 @@ static void bfs_put_super(struct super_block *s)
 	s->s_fs_info = NULL;
 }
 
-static int bfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct bfs_sb_info *info = BFS_SB(s);
 	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	buf->f_type = BFS_MAGIC;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 08b35801dfe..7520f468715 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -166,8 +166,9 @@ cifs_put_super(struct super_block *sb)
 }
 
 static int
-cifs_statfs(struct super_block *sb, struct kstatfs *buf)
+cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int xid; 
 	int rc = -EOPNOTSUPP;
 	struct cifs_sb_info *cifs_sb;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cba70201567..87f1dc8aa24 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -36,7 +36,7 @@
 /* VFS super_block ops */
 static void coda_clear_inode(struct inode *);
 static void coda_put_super(struct super_block *);
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf);
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static kmem_cache_t * coda_inode_cachep;
 
@@ -278,13 +278,13 @@ struct inode_operations coda_file_inode_operations = {
 	.setattr	= coda_setattr,
 };
 
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int error;
 	
 	lock_kernel();
 
-	error = venus_statfs(sb, buf);
+	error = venus_statfs(dentry, buf);
 
 	unlock_kernel();
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1bae99650a9..b040eba13a7 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -611,7 +611,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 	return error;
 }
 
-int venus_statfs(struct super_block *sb, struct kstatfs *sfs) 
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
 { 
         union inputArgs *inp;
         union outputArgs *outp;
@@ -620,7 +620,7 @@ int venus_statfs(struct super_block *sb, struct kstatfs *sfs)
 	insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
 	UPARG(CODA_STATFS);
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+        error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp);
 	
         if (!error) {
 		sfs->f_blocks = outp->coda_statfs.stat.f_blocks;
diff --git a/fs/compat.c b/fs/compat.c
index b1f64786a61..7e7e5bc4f3c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -197,7 +197,7 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
 		path_release(&nd);
@@ -215,7 +215,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
 	fput(file);
@@ -265,7 +265,7 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
 		path_release(&nd);
@@ -286,7 +286,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
 	fput(file);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 37a91a153aa..8a9d5d3b326 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -322,8 +322,10 @@ out:
 	return -EINVAL;
 }
 
-static int cramfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = CRAMFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_blocks = CRAMFS_SB(sb)->blocks;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 1ba5e14f879..8ac2462ae5d 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,7 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
 static int efs_get_sb(struct file_system_type *fs_type,
@@ -322,8 +322,8 @@ out_no_fs:
 	return -EINVAL;
 }
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf) {
-	struct efs_sb_info *sb = SUPER_INFO(s);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
+	struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb);
 
 	buf->f_type    = EFS_SUPER_MAGIC;	/* efs magic number */
 	buf->f_bsize   = EFS_BLOCKSIZE;		/* blocksize */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a4dfffac596..a6c4d6e0232 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -39,7 +39,7 @@
 static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 
 void ext2_error (struct super_block * sb, const char * function,
 		 const char * fmt, ...)
@@ -1038,8 +1038,9 @@ restore_opts:
 	return err;
 }
 
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	unsigned long overhead;
 	int i;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 657f8e73b62..e08b6439617 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -58,7 +58,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait);
 static const char *ext3_decode_error(struct super_block * sb, int errno,
 				     char nbuf[16]);
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static void ext3_unlockfs(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
 static void ext3_write_super_lockfs(struct super_block *sb);
@@ -2318,8 +2318,9 @@ restore_opts:
 	return err;
 }
 
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
 	unsigned long overhead;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c1ce284f8a9..7c35d582ec1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -539,18 +539,18 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-static int fat_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 
 	/* If the count of free cluster is still unknown, counts it here. */
 	if (sbi->free_clusters == -1) {
-		int err = fat_count_free_clusters(sb);
+		int err = fat_count_free_clusters(dentry->d_sb);
 		if (err)
 			return err;
 	}
 
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = sbi->cluster_size;
 	buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
 	buf->f_bfree = sbi->free_clusters;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index d76eeaafbde..b74b791fc23 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 
 #include "vxfs.h"
 #include "vxfs_extern.h"
@@ -55,7 +56,7 @@ MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */
 
 
 static void		vxfs_put_super(struct super_block *);
-static int		vxfs_statfs(struct super_block *, struct kstatfs *);
+static int		vxfs_statfs(struct dentry *, struct kstatfs *);
 static int		vxfs_remount(struct super_block *, int *, char *);
 
 static struct super_operations vxfs_super_ops = {
@@ -90,12 +91,12 @@ vxfs_put_super(struct super_block *sbp)
 
 /**
  * vxfs_statfs - get filesystem information
- * @sbp:	VFS superblock
+ * @dentry:	VFS dentry to locate superblock
  * @bufp:	output buffer
  *
  * Description:
  *   vxfs_statfs fills the statfs buffer @bufp with information
- *   about the filesystem described by @sbp.
+ *   about the filesystem described by @dentry.
  *
  * Returns:
  *   Zero.
@@ -107,12 +108,12 @@ vxfs_put_super(struct super_block *sbp)
  *   This is everything but complete...
  */
 static int
-vxfs_statfs(struct super_block *sbp, struct kstatfs *bufp)
+vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 {
-	struct vxfs_sb_info		*infp = VXFS_SBI(sbp);
+	struct vxfs_sb_info		*infp = VXFS_SBI(dentry->d_sb);
 
 	bufp->f_type = VXFS_SUPER_MAGIC;
-	bufp->f_bsize = sbp->s_blocksize;
+	bufp->f_bsize = dentry->d_sb->s_blocksize;
 	bufp->f_blocks = infp->vsi_raw->vs_dsize;
 	bufp->f_bfree = infp->vsi_raw->vs_free;
 	bufp->f_bavail = 0;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c91f0a50aad..a13c0f52905 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -236,8 +236,9 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
 	/* fsid is left zero */
 }
 
-static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	struct fuse_req *req;
 	struct fuse_statfs_out outarg;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ee5b80a409e..d9227bf14e8 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -80,8 +80,10 @@ static void hfs_put_super(struct super_block *sb)
  *
  * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks.
  */
-static int hfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 0ed8b7e8e87..0a92fa2336a 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -212,8 +212,10 @@ static void hfsplus_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 04035e08f5c..8e0d37743e7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -239,7 +239,7 @@ static int read_inode(struct inode *ino)
 	return(err);
 }
 
-int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
+int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	/* do_statfs uses struct statfs64 internally, but the linux kernel
 	 * struct statfs still has 32-bit versions for most of these fields,
@@ -252,7 +252,7 @@ int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
 	long long f_files;
 	long long f_ffree;
 
-	err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename,
+	err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename,
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
 			&sf->f_namelen, sf->f_spare);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 3b25cf3e2e6..f798480a363 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -135,8 +135,9 @@ static unsigned count_bitmaps(struct super_block *s)
 	return count;
 }
 
-static int hpfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
 	lock_kernel();
 
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index ec43c22bc9c..3a9bdf58166 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -616,7 +616,7 @@ static const struct file_operations hppfs_dir_fops = {
 	.fsync		= hppfs_fsync,
 };
 
-static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
+static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	sf->f_blocks = 0;
 	sf->f_bfree = 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4665c26171f..678fc72c364 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -467,9 +467,9 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
-static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
 
 	buf->f_type = HUGETLBFS_MAGIC;
 	buf->f_bsize = HPAGE_SIZE;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 17268da63a4..3f9c8ba1fa1 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -56,7 +56,7 @@ static void isofs_put_super(struct super_block *sb)
 }
 
 static void isofs_read_inode(struct inode *);
-static int isofs_statfs (struct super_block *, struct kstatfs *);
+static int isofs_statfs (struct dentry *, struct kstatfs *);
 
 static kmem_cache_t *isofs_inode_cachep;
 
@@ -901,8 +901,10 @@ out_freesbi:
 	return -EINVAL;
 }
 
-static int isofs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (ISOFS_SB(sb)->s_nzones
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index dd93a091ad6..9e46ea6da75 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -377,9 +377,9 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 
 /* Get statistics of the file system.  */
 static int
-jffs_statfs(struct super_block *sb, struct kstatfs *buf)
+jffs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs_control *c = (struct jffs_control *) sb->s_fs_info;
+	struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info;
 	struct jffs_fmcontrol *fmc;
 
 	lock_kernel();
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 7b6c24b14f8..2900ec3ec3a 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -192,9 +192,9 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 	return rc;
 }
 
-int jffs2_statfs(struct super_block *sb, struct kstatfs *buf)
+int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb);
 	unsigned long avail;
 
 	buf->f_type = JFFS2_SUPER_MAGIC;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index cd4021bcb94..6b522356540 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -175,7 +175,7 @@ void jffs2_clear_inode (struct inode *);
 void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
-int jffs2_statfs (struct super_block *, struct kstatfs *);
+int jffs2_statfs (struct dentry *, struct kstatfs *);
 void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 18a28137b90..73d2aba084c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -139,9 +139,9 @@ static void jfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
-static int jfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
 	s64 maxinodes;
 	struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 7d70efa46da..1b115638178 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,9 +20,9 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_namelen = NAME_MAX;
 	return 0;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 14f24dfbfe3..a6fb509b734 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -19,7 +19,7 @@
 
 static void minix_read_inode(struct inode * inode);
 static int minix_write_inode(struct inode * inode, int wait);
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf);
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
 static void minix_delete_inode(struct inode *inode)
@@ -296,11 +296,11 @@ out_bad_sb:
 	return -EINVAL;
 }
 
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf)
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct minix_sb_info *sbi = minix_sb(sb);
-	buf->f_type = sb->s_magic;
-	buf->f_bsize = sb->s_blocksize;
+	struct minix_sb_info *sbi = minix_sb(dentry->d_sb);
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
 	buf->f_bfree = minix_count_free_blocks(sbi);
 	buf->f_bavail = buf->f_bfree;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8db033fab3f..90d2ea28f33 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -39,7 +39,7 @@
 
 static void ncp_delete_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
-static int  ncp_statfs(struct super_block *, struct kstatfs *);
+static int  ncp_statfs(struct dentry *, struct kstatfs *);
 
 static kmem_cache_t * ncp_inode_cachep;
 
@@ -724,13 +724,14 @@ static void ncp_put_super(struct super_block *sb)
 	kfree(server);
 }
 
-static int ncp_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct dentry* d;
 	struct inode* i;
 	struct ncp_inode_info* ni;
 	struct ncp_server* s;
 	struct ncp_volume_info vi;
+	struct super_block *sb = dentry->d_sb;
 	int err;
 	__u8 dh;
 	
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ff645a961bc..937fbfc381b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -65,7 +65,7 @@ static int nfs_write_inode(struct inode *,int);
 static void nfs_delete_inode(struct inode *);
 static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
-static int  nfs_statfs(struct super_block *, struct kstatfs *);
+static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static void nfs_zap_acl_cache(struct inode *);
@@ -534,8 +534,9 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 }
 
 static int
-nfs_statfs(struct super_block *sb, struct kstatfs *buf)
+nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct nfs_server *server = NFS_SB(sb);
 	unsigned char blockbits;
 	unsigned long blockres;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index de3998f15f1..5446a0861d1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1310,7 +1310,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		status = vfs_statfs(dentry->d_inode->i_sb, &statfs);
+		status = vfs_statfs(dentry, &statfs);
 		if (status)
 			goto out_nfserr;
 	}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1d65f13f458..245eaa1fb59 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1737,7 +1737,7 @@ int
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 {
 	int err = fh_verify(rqstp, fhp, 0, MAY_NOP);
-	if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat))
+	if (!err && vfs_statfs(fhp->fh_dentry,stat))
 		err = nfserr_io;
 	return err;
 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d5d5e969294..0e14acea3f8 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2601,10 +2601,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 
 /**
  * ntfs_statfs - return information about mounted NTFS volume
- * @sb:		super block of mounted volume
+ * @dentry:	dentry from mounted volume
  * @sfs:	statfs structure in which to return the information
  *
- * Return information about the mounted NTFS volume @sb in the statfs structure
+ * Return information about the mounted NTFS volume @dentry in the statfs structure
  * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
  * called). We interpret the values to be correct of the moment in time at
  * which we are called. Most values are variable otherwise and this isn't just
@@ -2617,8 +2617,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
  *
  * Return 0 on success or -errno on error.
  */
-static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 {
+	struct super_block *sb = dentry->d_sb;
 	s64 size;
 	ntfs_volume *vol = NTFS_SB(sb);
 	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 788b8b50dc4..cdf73393f09 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -100,7 +100,7 @@ static int ocfs2_initialize_mem_caches(void);
 static void ocfs2_free_mem_caches(void);
 static void ocfs2_delete_osb(struct ocfs2_super *osb);
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int ocfs2_sync_fs(struct super_block *sb, int wait);
 
@@ -857,7 +857,7 @@ static void ocfs2_put_super(struct super_block *sb)
 	mlog_exit_void();
 }
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct ocfs2_super *osb;
 	u32 numbits, freebits;
@@ -866,9 +866,9 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	struct buffer_head *bh = NULL;
 	struct inode *inode = NULL;
 
-	mlog_entry("(%p, %p)\n", sb, buf);
+	mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
 
-	osb = OCFS2_SB(sb);
+	osb = OCFS2_SB(dentry->d_sb);
 
 	inode = ocfs2_get_system_file_inode(osb,
 					    GLOBAL_BITMAP_SYSTEM_INODE,
@@ -891,7 +891,7 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
 
 	buf->f_type = OCFS2_SUPER_MAGIC;
-	buf->f_bsize = sb->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
 	buf->f_blocks = ((sector_t) numbits) *
 			(osb->s_clustersize >> osb->sb->s_blocksize_bits);
diff --git a/fs/open.c b/fs/open.c
index 4f178acd4c0..a37ff861108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -31,18 +31,18 @@
 
 #include <asm/unistd.h>
 
-int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval = -ENODEV;
 
-	if (sb) {
+	if (dentry) {
 		retval = -ENOSYS;
-		if (sb->s_op->statfs) {
+		if (dentry->d_sb->s_op->statfs) {
 			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(sb);
+			retval = security_sb_statfs(dentry);
 			if (retval)
 				return retval;
-			retval = sb->s_op->statfs(sb, buf);
+			retval = dentry->d_sb->s_op->statfs(dentry, buf);
 			if (retval == 0 && buf->f_frsize == 0)
 				buf->f_frsize = buf->f_bsize;
 		}
@@ -52,12 +52,12 @@ int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
 
 EXPORT_SYMBOL(vfs_statfs);
 
-static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -95,12 +95,12 @@ static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
 	return 0;
 }
 
-static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf)
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -130,7 +130,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs_native(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -149,7 +149,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs64(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -168,7 +168,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs_native(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -189,7 +189,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs64(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e6cca5cd4b4..2f24c46f72a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -128,7 +128,7 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
 static void qnx4_read_inode(struct inode *);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
-static int qnx4_statfs(struct super_block *, struct kstatfs *);
+static int qnx4_statfs(struct dentry *, struct kstatfs *);
 
 static struct super_operations qnx4_sops =
 {
@@ -282,8 +282,10 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 	return block;
 }
 
-static int qnx4_statfs(struct super_block *sb, struct kstatfs *buf)
+static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	lock_kernel();
 
 	buf->f_type    = sb->s_magic;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f3ff41d3398..00f1321e920 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -60,7 +60,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
 }
 
 static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf);
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
@@ -1938,15 +1938,15 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return errval;
 }
 
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
 
 	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
 	buf->f_bfree = sb_free_blocks(rs);
 	buf->f_bavail = buf->f_bfree;
 	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-	buf->f_bsize = s->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	/* changed to accommodate gcc folks. */
 	buf->f_type = REISERFS_SUPER_MAGIC;
 	return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 4d6cd662106..283fbc6b8ee 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -179,12 +179,12 @@ outnobh:
 /* That's simple too. */
 
 static int
-romfs_statfs(struct super_block *sb, struct kstatfs *buf)
+romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	buf->f_type = ROMFS_MAGIC;
 	buf->f_bsize = ROMBSIZE;
 	buf->f_bfree = buf->f_bavail = buf->f_ffree;
-	buf->f_blocks = (romfs_maxsize(sb)+ROMBSIZE-1)>>ROMBSBITS;
+	buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
 	buf->f_namelen = ROMFS_MAXFN;
 	return 0;
 }
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 4a37c2bbfa3..506ff87c1d4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -48,7 +48,7 @@
 
 static void smb_delete_inode(struct inode *);
 static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct super_block *, struct kstatfs *);
+static int  smb_statfs(struct dentry *, struct kstatfs *);
 static int  smb_show_options(struct seq_file *, struct vfsmount *);
 
 static kmem_cache_t *smb_inode_cachep;
@@ -641,13 +641,13 @@ out_no_server:
 }
 
 static int
-smb_statfs(struct super_block *sb, struct kstatfs *buf)
+smb_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int result;
 	
 	lock_kernel();
 
-	result = smb_proc_dskattr(sb, buf);
+	result = smb_proc_dskattr(dentry, buf);
 
 	unlock_kernel();
 
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index b1b878b8173..c3495059889 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -3226,9 +3226,9 @@ smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
 }
 
 int
-smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr)
+smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
 {
-	struct smb_sb_info *server = SMB_SB(sb);
+	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
 	int result;
 	char *p;
 	long unit;
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 47664597e6b..972ed7dad38 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -29,7 +29,7 @@ extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
 extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr);
+extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
 extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
 extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
 extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
diff --git a/fs/super.c b/fs/super.c
index 324c2d232f5..057b5325b7e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -486,7 +486,7 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
         s = user_get_super(new_decode_dev(dev));
         if (s == NULL)
                 goto out;
-	err = vfs_statfs(s, &sbuf);
+	err = vfs_statfs(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		goto out;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3ff89cc5833..58b2d22142b 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -85,8 +85,9 @@ static void sysv_put_super(struct super_block *sb)
 	kfree(sbi);
 }
 
-static int sysv_statfs(struct super_block *sb, struct kstatfs *buf)
+static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
 	buf->f_type = sb->s_magic;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2250774a831..44fe2cb0bbb 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -91,7 +91,7 @@ static void udf_load_partdesc(struct super_block *, struct buffer_head *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
-static int udf_statfs(struct super_block *, struct kstatfs *);
+static int udf_statfs(struct dentry *, struct kstatfs *);
 
 /* UDF filesystem type */
 static int udf_get_sb(struct file_system_type *fs_type,
@@ -1779,8 +1779,10 @@ udf_put_super(struct super_block *sb)
  *	Written, tested, and released.
  */
 static int
-udf_statfs(struct super_block *sb, struct kstatfs *buf)
+udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = UDF_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = UDF_SB_PARTLEN(sb, UDF_SB_PARTITION(sb));
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 768fb8d9e67..fe5ab2aa289 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1113,8 +1113,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	return 0;
 }
 
-static int ufs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block * usb;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index d03c89a3665..4fb0fc65af3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -684,10 +684,10 @@ xfs_fs_sync_super(
 
 STATIC int
 xfs_fs_statfs(
-	struct super_block	*sb,
+	struct dentry		*dentry,
 	struct kstatfs		*statp)
 {
-	return -bhv_vfs_statvfs(vfs_from_sb(sb), statp, NULL);
+	return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, NULL);
 }
 
 STATIC int
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index d539262a8f8..98f6c52c152 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -70,7 +70,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 		 unsigned int cmd, struct PioctlData *data);
 int coda_downcall(int opcode, union outputArgs *out, struct super_block *sb);
 int venus_fsync(struct super_block *sb, struct CodaFid *fid);
-int venus_statfs(struct super_block *sb, struct kstatfs *sfs);
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
 
 
 /* messages between coda filesystem in kernel and Venus */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e50dd24af8..c823a3815e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1096,7 +1096,7 @@ struct super_operations {
 	int (*sync_fs)(struct super_block *sb, int wait);
 	void (*write_super_lockfs) (struct super_block *);
 	void (*unlockfs) (struct super_block *);
-	int (*statfs) (struct super_block *, struct kstatfs *);
+	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
@@ -1325,7 +1325,7 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 				  struct vfsmount *);
 
-extern int vfs_statfs(struct super_block *, struct kstatfs *);
+extern int vfs_statfs(struct dentry *, struct kstatfs *);
 
 /* /sys/fs */
 extern struct subsystem fs_subsys;
@@ -1746,7 +1746,7 @@ extern int dcache_dir_close(struct inode *, struct file *);
 extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
 extern int dcache_readdir(struct file *, void *, filldir_t);
 extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-extern int simple_statfs(struct super_block *, struct kstatfs *);
+extern int simple_statfs(struct dentry *, struct kstatfs *);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b7472ae91fa..60718f12caa 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -17,6 +17,11 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
+struct super_block;
+struct vfsmount;
+struct dentry;
+struct namespace;
+
 #define MNT_NOSUID	0x01
 #define MNT_NODEV	0x02
 #define MNT_NOEXEC	0x04
diff --git a/include/linux/security.h b/include/linux/security.h
index 47722d35553..383c320fc83 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -171,9 +171,9 @@ struct swap_info_struct;
  *	Deallocate and clear the sb->s_security field.
  *	@sb contains the super_block structure to be modified.
  * @sb_statfs:
- *	Check permission before obtaining filesystem statistics for the @sb
- *	filesystem.
- *	@sb contains the super_block structure for the filesystem.
+ *	Check permission before obtaining filesystem statistics for the @mnt
+ *	mountpoint.
+ *	@dentry is a handle on the superblock for the filesystem.
  *	Return 0 if permission is granted.  
  * @sb_mount:
  *	Check permission before an object specified by @dev_name is mounted on
@@ -1127,7 +1127,7 @@ struct security_operations {
 	int (*sb_copy_data)(struct file_system_type *type,
 			    void *orig, void *copy);
 	int (*sb_kern_mount) (struct super_block *sb, void *data);
-	int (*sb_statfs) (struct super_block * sb);
+	int (*sb_statfs) (struct dentry *dentry);
 	int (*sb_mount) (char *dev_name, struct nameidata * nd,
 			 char *type, unsigned long flags, void *data);
 	int (*sb_check_sb) (struct vfsmount * mnt, struct nameidata * nd);
@@ -1450,9 +1450,9 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data)
 	return security_ops->sb_kern_mount (sb, data);
 }
 
-static inline int security_sb_statfs (struct super_block *sb)
+static inline int security_sb_statfs (struct dentry *dentry)
 {
-	return security_ops->sb_statfs (sb);
+	return security_ops->sb_statfs (dentry);
 }
 
 static inline int security_sb_mount (char *dev_name, struct nameidata *nd,
@@ -2162,7 +2162,7 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data)
 	return 0;
 }
 
-static inline int security_sb_statfs (struct super_block *sb)
+static inline int security_sb_statfs (struct dentry *dentry)
 {
 	return 0;
 }
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d2010..6802020e0ce 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+	if (vfs_statfs(file->f_dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
diff --git a/mm/shmem.c b/mm/shmem.c
index 7617bb1c6bf..10020d8b407 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1654,9 +1654,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
 	return desc.error;
 }
 
-static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
+static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
 
 	buf->f_type = TMPFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
diff --git a/security/dummy.c b/security/dummy.c
index 6de4a4a5eb1..c98d553984e 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -191,7 +191,7 @@ static int dummy_sb_kern_mount (struct super_block *sb, void *data)
 	return 0;
 }
 
-static int dummy_sb_statfs (struct super_block *sb)
+static int dummy_sb_statfs (struct dentry *dentry)
 {
 	return 0;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 524915dfda6..093efba4d9b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1903,13 +1903,13 @@ static int selinux_sb_kern_mount(struct super_block *sb, void *data)
 	return superblock_has_perm(current, sb, FILESYSTEM__MOUNT, &ad);
 }
 
-static int selinux_sb_statfs(struct super_block *sb)
+static int selinux_sb_statfs(struct dentry *dentry)
 {
 	struct avc_audit_data ad;
 
 	AVC_AUDIT_DATA_INIT(&ad,FS);
-	ad.u.fs.dentry = sb->s_root;
-	return superblock_has_perm(current, sb, FILESYSTEM__GETATTR, &ad);
+	ad.u.fs.dentry = dentry->d_sb->s_root;
+	return superblock_has_perm(current, dentry->d_sb, FILESYSTEM__GETATTR, &ad);
 }
 
 static int selinux_mount(char * dev_name,
-- 
cgit v1.2.3-70-g09d2


From cb2b95e1c6b56e3d2369d3a5f4bc97f4fa180683 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 23 Jun 2006 02:03:01 -0700
Subject: [PATCH] zone handle unaligned zone boundaries

The buddy allocator has a requirement that boundaries between contigious
zones occur aligned with the the MAX_ORDER ranges.  Where they do not we
will incorrectly merge pages cross zone boundaries.  This can lead to pages
from the wrong zone being handed out.

Originally the buddy allocator would check that buddies were in the same
zone by referencing the zone start and end page frame numbers.  This was
removed as it became very expensive and the buddy allocator already made
the assumption that zones boundaries were aligned.

It is clear that not all configurations and architectures are honouring
this alignment requirement.  Therefore it seems safest to reintroduce
support for non-aligned zone boundaries.  This patch introduces a new check
when considering a page a buddy it compares the zone_table index for the
two pages and refuses to merge the pages where they do not match.  The
zone_table index is unique for each node/zone combination when
FLATMEM/DISCONTIGMEM is enabled and for each section/zone combination when
SPARSEMEM is enabled (a SPARSEMEM section is at least a MAX_ORDER size).

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  7 +++++--
 mm/page_alloc.c    | 17 +++++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e2fa375e478..697c6bf248c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -465,10 +465,13 @@ static inline unsigned long page_zonenum(struct page *page)
 struct zone;
 extern struct zone *zone_table[];
 
+static inline int page_zone_id(struct page *page)
+{
+	return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
+}
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
-			ZONETABLE_MASK];
+	return zone_table[page_zone_id(page)];
 }
 
 static inline unsigned long page_to_nid(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400..fd631c2536a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * we can do coalesce a page and its buddy if
  * (a) the buddy is not in a hole &&
  * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
  *
  * For recording whether a page is in the buddy system, we use PG_buddy.
  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy,
+								int order)
 {
 #ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
+	if (!pfn_valid(page_to_pfn(buddy)))
 		return 0;
 #endif
 
-	if (PageBuddy(page) && page_order(page) == order) {
-		BUG_ON(page_count(page) != 0);
+	if (page_zone_id(page) != page_zone_id(buddy))
+		return 0;
+
+	if (PageBuddy(buddy) && page_order(buddy) == order) {
+		BUG_ON(page_count(buddy) != 0);
 		return 1;
 	}
 	return 0;
@@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page,
 		struct page *buddy;
 
 		buddy = __page_find_buddy(page, page_idx, order);
-		if (!page_is_buddy(buddy, order))
+		if (!page_is_buddy(page, buddy, order))
 			break;		/* Move the buddy up one level. */
 
 		list_del(&buddy->lru);
-- 
cgit v1.2.3-70-g09d2


From 4da5eda0dca9730f59f391230304526ab4bffec7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:04 -0700
Subject: [PATCH] Page Migration: Make do_swap_page redo the fault

It is better to redo the complete fault if do_swap_page() finds that the
page is not in PageSwapCache() because the page migration code may have
replaced the swap pte already with a pte pointing to valid memory.

do_swap_page() may interpret an invalid swap entry without this patch
because we do not reload the pte if we are looping back.  The page
migration code may already have reused the swap entry referenced by our
local swp_entry.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 0ec7bc64427..7e3683fd4f3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1879,7 +1879,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
-again:
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1903,12 +1902,6 @@ again:
 
 	mark_page_accessed(page);
 	lock_page(page);
-	if (!PageSwapCache(page)) {
-		/* Page migration has occured */
-		unlock_page(page);
-		page_cache_release(page);
-		goto again;
-	}
 
 	/*
 	 * Back out if somebody else already faulted in this pte.
-- 
cgit v1.2.3-70-g09d2


From 729bd0b74ce9ac6c829109052fcd565f5c366ca5 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:03:05 -0700
Subject: [PATCH] slab: extract cache_free_alien from __cache_free

Move alien object freeing to cache_free_alien() to reduce #ifdef clutter in
__cache_free().

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 77 ++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 42 insertions(+), 35 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f1b644eb39d..bf05ea900ce 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1024,6 +1024,40 @@ static void drain_alien_cache(struct kmem_cache *cachep,
 		}
 	}
 }
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+	struct slab *slabp = virt_to_slab(objp);
+	int nodeid = slabp->nodeid;
+	struct kmem_list3 *l3;
+	struct array_cache *alien = NULL;
+
+	/*
+	 * Make sure we are not freeing a object from another node to the array
+	 * cache on this cpu.
+	 */
+	if (likely(slabp->nodeid == numa_node_id()))
+		return 0;
+
+	l3 = cachep->nodelists[numa_node_id()];
+	STATS_INC_NODEFREES(cachep);
+	if (l3->alien && l3->alien[nodeid]) {
+		alien = l3->alien[nodeid];
+		spin_lock(&alien->lock);
+		if (unlikely(alien->avail == alien->limit)) {
+			STATS_INC_ACOVERFLOW(cachep);
+			__drain_alien_cache(cachep, alien, nodeid);
+		}
+		alien->entry[alien->avail++] = objp;
+		spin_unlock(&alien->lock);
+	} else {
+		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
+		free_block(cachep, &objp, 1, nodeid);
+		spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
+	}
+	return 1;
+}
+
 #else
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
@@ -1038,6 +1072,11 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
 {
 }
 
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+	return 0;
+}
+
 #endif
 
 static int cpuup_callback(struct notifier_block *nfb,
@@ -3087,41 +3126,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
-	/* Make sure we are not freeing a object from another
-	 * node to the array cache on this cpu.
-	 */
-#ifdef CONFIG_NUMA
-	{
-		struct slab *slabp;
-		slabp = virt_to_slab(objp);
-		if (unlikely(slabp->nodeid != numa_node_id())) {
-			struct array_cache *alien = NULL;
-			int nodeid = slabp->nodeid;
-			struct kmem_list3 *l3;
-
-			l3 = cachep->nodelists[numa_node_id()];
-			STATS_INC_NODEFREES(cachep);
-			if (l3->alien && l3->alien[nodeid]) {
-				alien = l3->alien[nodeid];
-				spin_lock(&alien->lock);
-				if (unlikely(alien->avail == alien->limit)) {
-					STATS_INC_ACOVERFLOW(cachep);
-					__drain_alien_cache(cachep,
-							    alien, nodeid);
-				}
-				alien->entry[alien->avail++] = objp;
-				spin_unlock(&alien->lock);
-			} else {
-				spin_lock(&(cachep->nodelists[nodeid])->
-					  list_lock);
-				free_block(cachep, &objp, 1, nodeid);
-				spin_unlock(&(cachep->nodelists[nodeid])->
-					    list_lock);
-			}
-			return;
-		}
-	}
-#endif
+	if (cache_free_alien(cachep, objp))
+		return;
+
 	if (likely(ac->avail < ac->limit)) {
 		STATS_INC_FREEHIT(cachep);
 		ac->entry[ac->avail++] = objp;
-- 
cgit v1.2.3-70-g09d2


From 4776874ff096cd410382c0eca5d75f69c9dfa58f Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:03:07 -0700
Subject: [PATCH] slab: page mapping cleanup

Clean up slab allocator page mapping a bit.  The memory allocated for a
slab is physically contiguous so it is okay to assume struct pages are too
so kill the long-standing comment.  Furthermore, rename set_slab_attr to
slab_map_pages and add a comment explaining why its needed.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index bf05ea900ce..a94cf0fea8a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2499,23 +2499,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
 	slabp->inuse--;
 }
 
-static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
-			void *objp)
+/*
+ * Map pages beginning at addr to the given cache and slab. This is required
+ * for the slab allocator to be able to lookup the cache and slab of a
+ * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ */
+static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
+			   void *addr)
 {
-	int i;
+	int nr_pages;
 	struct page *page;
 
-	/* Nasty!!!!!! I hope this is OK. */
-	page = virt_to_page(objp);
+	page = virt_to_page(addr);
 
-	i = 1;
+	nr_pages = 1;
 	if (likely(!PageCompound(page)))
-		i <<= cachep->gfporder;
+		nr_pages <<= cache->gfporder;
+
 	do {
-		page_set_cache(page, cachep);
-		page_set_slab(page, slabp);
+		page_set_cache(page, cache);
+		page_set_slab(page, slab);
 		page++;
-	} while (--i);
+	} while (--nr_pages);
 }
 
 /*
@@ -2587,7 +2592,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		goto opps1;
 
 	slabp->nodeid = nodeid;
-	set_slab_attr(cachep, slabp, objp);
+	slab_map_pages(cachep, slabp, objp);
 
 	cache_init_objs(cachep, slabp, ctor_flags);
 
-- 
cgit v1.2.3-70-g09d2


From 3c5a87f476bed45616e7e543dcaea4440c77bf93 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:08 -0700
Subject: [PATCH] migration: remove unnecessary PageSwapCache checks

Remove two unnecessary PageSwapCache checks.  The page refcount is raised
and therefore page migration cannot occur in both functions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/shmem.c    | 8 --------
 mm/swapfile.c | 7 -------
 2 files changed, 15 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index 10020d8b407..84b5cf9b63c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1081,14 +1081,6 @@ repeat:
 			page_cache_release(swappage);
 			goto repeat;
 		}
-		if (!PageSwapCache(swappage)) {
-			/* Page migration has occured */
-			shmem_swp_unmap(entry);
-			spin_unlock(&info->lock);
-			unlock_page(swappage);
-			page_cache_release(swappage);
-			goto repeat;
-		}
 		if (PageWriteback(swappage)) {
 			shmem_swp_unmap(entry);
 			spin_unlock(&info->lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5fd5385f0c..47a6812f5f8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -716,7 +716,6 @@ static int try_to_unuse(unsigned int type)
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
-again:
 		page = read_swap_cache_async(entry, NULL, 0);
 		if (!page) {
 			/*
@@ -751,12 +750,6 @@ again:
 		wait_on_page_locked(page);
 		wait_on_page_writeback(page);
 		lock_page(page);
-		if (!PageSwapCache(page)) {
-			/* Page migration has occured */
-			unlock_page(page);
-			page_cache_release(page);
-			goto again;
-		}
 		wait_on_page_writeback(page);
 
 		/*
-- 
cgit v1.2.3-70-g09d2


From 02b694dea473ad3db1e2d1b14c1fef8fbd92e5e6 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:08 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd:
 change name of wait_table_size()

This is just to rename from wait_table_size() to wait_table_hash_nr_entries().

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h |  4 ++--
 mm/page_alloc.c        | 12 +++++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9742e3c1622..652673ea92f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -197,7 +197,7 @@ struct zone {
 
 	/*
 	 * wait_table		-- the array holding the hash table
-	 * wait_table_size	-- the size of the hash table array
+	 * wait_table_hash_nr_entries	-- the size of the hash table array
 	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
 	 *
 	 * The purpose of all these is to keep track of the people
@@ -220,7 +220,7 @@ struct zone {
 	 * free_area_init_core() performs the initialization of them.
 	 */
 	wait_queue_head_t	* wait_table;
-	unsigned long		wait_table_size;
+	unsigned long		wait_table_hash_nr_entries;
 	unsigned long		wait_table_bits;
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd631c2536a..27320a0542d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1727,7 +1727,7 @@ void __init build_all_zonelists(void)
  */
 #define PAGES_PER_WAITQUEUE	256
 
-static inline unsigned long wait_table_size(unsigned long pages)
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
 {
 	unsigned long size = 1;
 
@@ -2019,13 +2019,15 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	 * The per-page waitqueue mechanism uses hashed waitqueues
 	 * per zone.
 	 */
-	zone->wait_table_size = wait_table_size(zone_size_pages);
-	zone->wait_table_bits =	wait_table_bits(zone->wait_table_size);
+	zone->wait_table_hash_nr_entries =
+		 wait_table_hash_nr_entries(zone_size_pages);
+	zone->wait_table_bits =
+		wait_table_bits(zone->wait_table_hash_nr_entries);
 	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
+		alloc_bootmem_node(pgdat, zone->wait_table_hash_nr_entries
 					* sizeof(wait_queue_head_t));
 
-	for(i = 0; i < zone->wait_table_size; ++i)
+	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
 		init_waitqueue_head(zone->wait_table + i);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 86356ab147669bd3bcb2149fd9561d1280835c24 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:09 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd:
 change to meminit for build_zonelist

Change definitions of some functions and data from __init to __meminit.

These functions and data can be used after bootup by this patch to be used for
hot-add codes.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h |  4 ++--
 mm/page_alloc.c         | 18 +++++++++---------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index da2d107fe2c..22866fa2d96 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -91,8 +91,8 @@ static inline void *alloc_remap(int nid, unsigned long size)
 }
 #endif
 
-extern unsigned long __initdata nr_kernel_pages;
-extern unsigned long __initdata nr_all_pages;
+extern unsigned long nr_kernel_pages;
+extern unsigned long nr_all_pages;
 
 extern void *__init alloc_large_system_hash(const char *tablename,
 					    unsigned long bucketsize,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27320a0542d..5ae75bead4d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
-unsigned long __initdata nr_kernel_pages;
-unsigned long __initdata nr_all_pages;
+unsigned long __meminitdata nr_kernel_pages;
+unsigned long __meminitdata nr_all_pages;
 
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -1517,7 +1517,7 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat,
+static int __meminit build_zonelists_node(pg_data_t *pgdat,
 			struct zonelist *zonelist, int nr_zones, int zone_type)
 {
 	struct zone *zone;
@@ -1553,7 +1553,7 @@ static inline int highest_zone(int zone_bits)
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
+static int __meminitdata node_load[MAX_NUMNODES];
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1568,7 +1568,7 @@ static int __initdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -1614,7 +1614,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
 	return best_node;
 }
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 	int prev_node, load;
@@ -1666,7 +1666,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
 
 #else	/* CONFIG_NUMA */
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 
@@ -2071,7 +2071,7 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
  *   - mark all memory queues empty
  *   - clear the memory bitmaps
  */
-static void __init free_area_init_core(struct pglist_data *pgdat,
+static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	unsigned long j;
@@ -2159,7 +2159,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long node_start_pfn,
 		unsigned long *zholes_size)
 {
-- 
cgit v1.2.3-70-g09d2


From 718127cc3170454f4aa274fdd2f1e01574fecd66 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:10 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: add
 return code for init_current_empty_zone

When add_zone() is called against empty zone (not populated zone), we have to
initialize the zone which didn't initialize at boot time.  But,
init_currently_empty_zone() may fail due to allocation of wait table.  So,
this patch is to catch its error code.

Changes against wait_table is in the next patch.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h |  3 +++
 mm/memory_hotplug.c    | 15 +++++++++++++--
 mm/page_alloc.c        | 11 ++++++++---
 3 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 652673ea92f..e82fc1a52cd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -333,6 +333,9 @@ void wakeup_kswapd(struct zone *zone, int order);
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		int classzone_idx, int alloc_flags);
 
+extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
+				     unsigned long size);
+
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
 #else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 70df5c0d957..71da5c98c9c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,7 +26,7 @@
 
 extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
 			  unsigned long size);
-static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	int zone_type;
 
 	zone_type = zone - pgdat->node_zones;
+	if (!populated_zone(zone)) {
+		int ret = 0;
+		ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+		if (ret < 0)
+			return ret;
+	}
 	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
 	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+	return 0;
 }
 
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 	if (ret < 0)
 		return ret;
 
-	__add_zone(zone, phys_start_pfn);
+	ret = __add_zone(zone, phys_start_pfn);
+
+	if (ret < 0)
+		return ret;
+
 	return register_new_memory(__pfn_to_section(phys_start_pfn));
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5ae75bead4d..4bc66f6b771 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2050,8 +2050,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
 			zone->name, zone->present_pages, batch);
 }
 
-static __meminit void init_currently_empty_zone(struct zone *zone,
-		unsigned long zone_start_pfn, unsigned long size)
+__meminit int init_currently_empty_zone(struct zone *zone,
+					unsigned long zone_start_pfn,
+					unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 
@@ -2063,6 +2064,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
 
 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+	return 0;
 }
 
 /*
@@ -2077,6 +2080,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 	unsigned long j;
 	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+	int ret;
 
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
@@ -2118,7 +2122,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 			continue;
 
 		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		init_currently_empty_zone(zone, zone_start_pfn, size);
+		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+		BUG_ON(ret);
 		zone_start_pfn += size;
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From cca448fe92246fb59efe55ba2e048ded0971a9af Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:10 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd:
 wait_table initialization

Wait_table is initialized according to zone size at boot time.  But, we cannot
know the maixmum zone size when memory hotplug is enabled.  It can be
changed....  And resizing of wait_table is hard.

So kernel allocate and initialzie wait_table as its maximum size.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 53 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4bc66f6b771..62564e27b44 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1727,6 +1727,7 @@ void __init build_all_zonelists(void)
  */
 #define PAGES_PER_WAITQUEUE	256
 
+#ifndef CONFIG_MEMORY_HOTPLUG
 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
 {
 	unsigned long size = 1;
@@ -1745,6 +1746,29 @@ static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
 
 	return max(size, 4UL);
 }
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table.  So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
+ *
+ *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
+ *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way. (See above).  It equals:
+ *
+ *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
+ *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
+ *    powerpc (64K page size)             : =  (32G +16M)byte.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+	return 4096UL;
+}
+#endif
 
 /*
  * This is an integer logarithm so that shifts can be used later
@@ -2010,10 +2034,11 @@ void __init setup_per_cpu_pageset(void)
 #endif
 
 static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
 	int i;
 	struct pglist_data *pgdat = zone->zone_pgdat;
+	size_t alloc_size;
 
 	/*
 	 * The per-page waitqueue mechanism uses hashed waitqueues
@@ -2023,12 +2048,32 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 		 wait_table_hash_nr_entries(zone_size_pages);
 	zone->wait_table_bits =
 		wait_table_bits(zone->wait_table_hash_nr_entries);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_hash_nr_entries
-					* sizeof(wait_queue_head_t));
+	alloc_size = zone->wait_table_hash_nr_entries
+					* sizeof(wait_queue_head_t);
+
+ 	if (system_state == SYSTEM_BOOTING) {
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, alloc_size);
+	} else {
+		/*
+		 * This case means that a zone whose size was 0 gets new memory
+		 * via memory hot-add.
+		 * But it may be the case that a new node was hot-added.  In
+		 * this case vmalloc() will not be able to use this new node's
+		 * memory - this wait_table must be initialized to use this new
+		 * node itself as well.
+		 * To use this new node's memory, further consideration will be
+		 * necessary.
+		 */
+		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+	}
+	if (!zone->wait_table)
+		return -ENOMEM;
 
 	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
 		init_waitqueue_head(zone->wait_table + i);
+
+	return 0;
 }
 
 static __meminit void zone_pcp_init(struct zone *zone)
@@ -2055,8 +2100,10 @@ __meminit int init_currently_empty_zone(struct zone *zone,
 					unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	zone_wait_table_init(zone, size);
+	int ret;
+	ret = zone_wait_table_init(zone, size);
+	if (ret)
+		return ret;
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
 	zone->zone_start_pfn = zone_start_pfn;
-- 
cgit v1.2.3-70-g09d2


From 6811378e7d8b9aa4fca2a1ca73d24c9d67c9cb12 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:11 -0700
Subject: [PATCH] wait_table and zonelist initializing for memory hotadd:
 update zonelists

In current code, zonelist is considered to be build once, no modification.
But MemoryHotplug can add new zone/pgdat.  It must be updated.

This patch modifies build_all_zonelists().  By this, build_all_zonelist() can
reconfig pgdat's zonelists.

To update them safety, this patch use stop_machine_run().  Other cpus don't
touch among updating them by using it.

In old version (V2 of node hotadd), kernel updated them after zone
initialization.  But present_page of its new zone is still 0, because
online_page() is not called yet at this time.  Build_zonelists() checks
present_pages to find present zone.  It was too early.  So, I changed it after
online_pages().

Signed-off-by: Yasunori Goto     <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory_hotplug.c | 12 ++++++++++++
 mm/page_alloc.c     | 26 +++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 71da5c98c9c..1b1ac3db218 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -127,6 +127,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	unsigned long flags;
 	unsigned long onlined_pages = 0;
 	struct zone *zone;
+	int need_zonelists_rebuild = 0;
 
 	/*
 	 * This doesn't need a lock to do pfn_to_page().
@@ -139,6 +140,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
 
+	/*
+	 * If this zone is not populated, then it is not in zonelist.
+	 * This means the page allocator ignores this zone.
+	 * So, zonelist must be updated after online.
+	 */
+	if (!populated_zone(zone))
+		need_zonelists_rebuild = 1;
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page = pfn_to_page(pfn + i);
 		online_page(page);
@@ -149,5 +158,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 
 	setup_per_zone_pages_min();
 
+	if (need_zonelists_rebuild)
+		build_all_zonelists();
+
 	return 0;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62564e27b44..9dfbe6b7d1c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,7 @@
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1704,14 +1705,29 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 
 #endif	/* CONFIG_NUMA */
 
-void __init build_all_zonelists(void)
+/* return values int ....just for stop_machine_run() */
+static int __meminit __build_all_zonelists(void *dummy)
 {
-	int i;
+	int nid;
+	for_each_online_node(nid)
+		build_zonelists(NODE_DATA(nid));
+	return 0;
+}
+
+void __meminit build_all_zonelists(void)
+{
+	if (system_state == SYSTEM_BOOTING) {
+		__build_all_zonelists(0);
+		cpuset_init_current_mems_allowed();
+	} else {
+		/* we have to stop all cpus to guaranntee there is no user
+		   of zonelist */
+		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+		/* cpuset refresh routine should be here */
+	}
 
-	for_each_online_node(i)
-		build_zonelists(NODE_DATA(i));
 	printk("Built %i zonelists\n", num_online_nodes());
-	cpuset_init_current_mems_allowed();
+
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 67de648211fa041fe08a0c25241a4980bbb90698 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 23 Jun 2006 02:03:12 -0700
Subject: [PATCH] squash duplicate page_to_pfn and pfn_to_page

We have architectures where the size of page_to_pfn and pfn_to_page are
significant enough to overall image size that they wish to push them out of
line.  However, in the process we have grown a second copy of the
implementation of each of these routines for each memory model.  Share the
implmentation exposing it either inline or out-of-line as required.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/memory_model.h | 27 +++++++++++++++------------
 mm/page_alloc.c                    | 32 ++------------------------------
 2 files changed, 17 insertions(+), 42 deletions(-)

(limited to 'mm')

diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 0cfb086dd37..8078cbd2c01 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -23,29 +23,23 @@
 
 #endif /* CONFIG_DISCONTIGMEM */
 
-#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-struct page;
-/* this is useful when inlined pfn_to_page is too big */
-extern struct page *pfn_to_page(unsigned long pfn);
-extern unsigned long page_to_pfn(struct page *page);
-#else
 /*
  * supports 3 memory models.
  */
 #if defined(CONFIG_FLATMEM)
 
-#define pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
-#define page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
+#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
+#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
 				 ARCH_PFN_OFFSET)
 #elif defined(CONFIG_DISCONTIGMEM)
 
-#define pfn_to_page(pfn)			\
+#define __pfn_to_page(pfn)			\
 ({	unsigned long __pfn = (pfn);		\
 	unsigned long __nid = arch_pfn_to_nid(pfn);  \
 	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
 })
 
-#define page_to_pfn(pg)							\
+#define __page_to_pfn(pg)						\
 ({	struct page *__pg = (pg);					\
 	struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));	\
 	(unsigned long)(__pg - __pgdat->node_mem_map) +			\
@@ -57,18 +51,27 @@ extern unsigned long page_to_pfn(struct page *page);
  * Note: section's mem_map is encorded to reflect its start_pfn.
  * section[i].section_mem_map == mem_map's address - start_pfn;
  */
-#define page_to_pfn(pg)					\
+#define __page_to_pfn(pg)					\
 ({	struct page *__pg = (pg);				\
 	int __sec = page_to_section(__pg);			\
 	__pg - __section_mem_map_addr(__nr_to_section(__sec));	\
 })
 
-#define pfn_to_page(pfn)				\
+#define __pfn_to_page(pfn)				\
 ({	unsigned long __pfn = (pfn);			\
 	struct mem_section *__sec = __pfn_to_section(__pfn);	\
 	__section_mem_map_addr(__sec) + __pfn;		\
 })
 #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+struct page;
+/* this is useful when inlined pfn_to_page is too big */
+extern struct page *pfn_to_page(unsigned long pfn);
+extern unsigned long page_to_pfn(struct page *page);
+#else
+#define page_to_pfn __page_to_pfn
+#define pfn_to_page __pfn_to_page
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
 
 #endif /* __ASSEMBLY__ */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9dfbe6b7d1c..5af33186a25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2879,42 +2879,14 @@ void *__init alloc_large_system_hash(const char *tablename,
 }
 
 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-/*
- * pfn <-> page translation. out-of-line version.
- * (see asm-generic/memory_model.h)
- */
-#if defined(CONFIG_FLATMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-	return mem_map + (pfn - ARCH_PFN_OFFSET);
-}
-unsigned long page_to_pfn(struct page *page)
-{
-	return (page - mem_map) + ARCH_PFN_OFFSET;
-}
-#elif defined(CONFIG_DISCONTIGMEM)
 struct page *pfn_to_page(unsigned long pfn)
 {
-	int nid = arch_pfn_to_nid(pfn);
-	return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
+	return __pfn_to_page(pfn);
 }
 unsigned long page_to_pfn(struct page *page)
 {
-	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
-	return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
-}
-#elif defined(CONFIG_SPARSEMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-	return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
-}
-
-unsigned long page_to_pfn(struct page *page)
-{
-	long section_id = page_to_section(page);
-	return page - __section_mem_map_addr(__nr_to_section(section_id));
+	return __page_to_pfn(page);
 }
-#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
 EXPORT_SYMBOL(pfn_to_page);
 EXPORT_SYMBOL(page_to_pfn);
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
-- 
cgit v1.2.3-70-g09d2


From fadd8fbd153c12963f8fe3c9ef7f8967f286f98b Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:13 -0700
Subject: [PATCH] support for panic at OOM

This patch adds panic_on_oom sysctl under sys.vm.

When sysctl vm.panic_on_oom = 1, the kernel panics intead of killing rogue
processes.  And if vm.panic_on_oom is 0 the kernel will do oom_kill() in
the same way as it does today.  Of course, the default value is 0 and only
root can modifies it.

In general, oom_killer works well and kill rogue processes.  So the whole
system can survive.  But there are environments where panic is preferable
rather than kill some processes.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 13 +++++++++++++
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             |  9 +++++++++
 mm/oom_kill.c               |  3 +++
 4 files changed, 26 insertions(+)

(limited to 'mm')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index a46c10fcddf..2dc246af488 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
 - drop-caches
 - zone_reclaim_mode
 - zone_reclaim_interval
+- panic_on_oom
 
 ==============================================================
 
@@ -178,3 +179,15 @@ Time is set in seconds and set by default to 30 seconds.
 Reduce the interval if undesired off node allocations occur. However, too
 frequent scans will have a negative impact onoff node allocation performance.
 
+=============================================================
+
+panic_on_oom
+
+This enables or disables panic on out-of-memory feature.  If this is set to 1,
+the kernel panics when out-of-memory happens.  If this is set to 0, the kernel
+will kill some rogue process, called oom_killer.  Usually, oom_killer can kill
+rogue processes and system will survive.  If you want to panic the system
+rather than killing rogue processes, set this to 1.
+
+The default value is 0.
+
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index cee944dbdcd..c7132029af0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -186,6 +186,7 @@ enum
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
+	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d656e61621..072ac446810 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
 extern int C_A_D;
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
+extern int sysctl_panic_on_oom;
 extern int max_threads;
 extern int sysrq_enabled;
 extern int core_uses_pid;
@@ -701,6 +702,14 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= VM_PANIC_ON_OOM,
+		.procname	= "panic_on_oom",
+		.data		= &sysctl_panic_on_oom,
+		.maxlen		= sizeof(sysctl_panic_on_oom),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= VM_OVERCOMMIT_RATIO,
 		.procname	= "overcommit_ratio",
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 042e6436c3e..f9bb3cf3238 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,6 +22,7 @@
 #include <linux/jiffies.h>
 #include <linux/cpuset.h>
 
+int sysctl_panic_on_oom;
 /* #define DEBUG */
 
 /**
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		break;
 
 	case CONSTRAINT_NONE:
+		if (sysctl_panic_on_oom)
+			panic("out of memory. panic_on_oom is selected\n");
 retry:
 		/*
 		 * Rambo mode: Shoot down a process and hope it solves whatever
-- 
cgit v1.2.3-70-g09d2


From 6937a25cff818d32d0f9ff58a518c9ab96760aeb Mon Sep 17 00:00:00 2001
From: Dave Peterson <dsp@llnl.gov>
Date: Fri, 23 Jun 2006 02:03:13 -0700
Subject: [PATCH] mm: fix typos in comments in mm/oom_kill.c

This fixes a few typos in the comments in mm/oom_kill.c.

Signed-off-by: David S. Peterson <dsp@llnl.gov>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f9bb3cf3238..d46ed0f1dc0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,7 +26,7 @@ int sysctl_panic_on_oom;
 /* #define DEBUG */
 
 /**
- * oom_badness - calculate a numeric value for how bad this task has been
+ * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
  * @uptime: current uptime in seconds
  *
@@ -201,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 			continue;
 
 		/*
-		 * This is in the process of releasing memory so for wait it
+		 * This is in the process of releasing memory so wait for it
 		 * to finish before killing some other task by mistake.
 		 */
 		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
@@ -307,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 }
 
 /**
- * oom_kill - kill the "best" process when we run out of memory
+ * out_of_memory - kill the "best" process when we run out of memory
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
-- 
cgit v1.2.3-70-g09d2


From a43a8c39bbb493c9e93f6764b350de2e33e18e92 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Fri, 23 Jun 2006 02:03:15 -0700
Subject: [PATCH] tightening hugetlb strict accounting

Current hugetlb strict accounting for shared mapping always assume mapping
starts at zero file offset and reserves pages between zero and size of the
file.  This assumption often reserves (or lock down) a lot more pages then
necessary if application maps at none zero file offset.  libhugetlbfs is
one example that requires proper reservation on shared mapping starts at
none zero offset.

This patch extends the reservation and hugetlb strict accounting to support
any arbitrary pair of (offset, len), resulting a much more robust and
accurate scheme.  More importantly, it won't lock down any hugetlb pages
outside file mapping.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c    |  21 ++--
 include/linux/hugetlb.h |   8 +-
 mm/hugetlb.c            | 282 +++++++++++++++++++++++++++---------------------
 3 files changed, 173 insertions(+), 138 deletions(-)

(limited to 'mm')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 678fc72c364..e6410d8edd0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	if (vma->vm_flags & VM_MAYSHARE)
-		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
-			goto out;
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+				  len >> HPAGE_SHIFT))
+		goto out;
 
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
@@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
-	int i;
+	int i, freed = 0;
 
-	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
-				     lstart >> HPAGE_SHIFT);
-	if (!mapping->nrpages)
-		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 			truncate_huge_page(page);
 			unlock_page(page);
 			hugetlb_put_quota(mapping);
+			freed++;
 		}
 		huge_pagevec_release(&pvec);
 	}
 	BUG_ON(!lstart && mapping->nrpages);
+	hugetlb_unreserve_pages(inode, start, freed);
 }
 
 static void hugetlbfs_delete_inode(struct inode *inode)
@@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
 		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
 		switch (mode & S_IFMT) {
@@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
-	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size)
 		goto out_file;
 
 	error = -ENOMEM;
-	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
-				       size >> HPAGE_SHIFT) != 0)
+	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4c5e610fe44..c25a38d8f60 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -23,6 +23,8 @@ int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -139,8 +141,6 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
 	struct shared_policy policy;
-	/* Protected by the (global) hugetlb_lock */
-	unsigned long prereserved_hpages;
 	struct inode vfs_inode;
 };
 
@@ -157,10 +157,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-			       unsigned long atleast_hpages);
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-				  unsigned long atmost_hpages);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca03..df499973255 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr)
 {
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
-	int use_reserve = 0;
-	unsigned long idx;
 
 	spin_lock(&hugetlb_lock);
-
-	if (vma->vm_flags & VM_MAYSHARE) {
-
-		/* idx = radix tree index, i.e. offset into file in
-		 * HPAGE_SIZE units */
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-
-		/* The hugetlbfs specific inode info stores the number
-		 * of "guaranteed available" (huge) pages.  That is,
-		 * the first 'prereserved_hpages' pages of the inode
-		 * are either already instantiated, or have been
-		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
-		 * we're in the process of instantiating the page, so
-		 * we use this to determine whether to draw from the
-		 * pre-reserved pool or the truly free pool. */
-		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
-			use_reserve = 1;
-	}
-
-	if (!use_reserve) {
-		if (free_huge_pages <= reserved_huge_pages)
-			goto fail;
-	} else {
-		BUG_ON(reserved_huge_pages == 0);
-		reserved_huge_pages--;
-	}
+	if (vma->vm_flags & VM_MAYSHARE)
+		resv_huge_pages--;
+	else if (free_huge_pages <= resv_huge_pages)
+		goto fail;
 
 	page = dequeue_huge_page(vma, addr);
 	if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	set_page_refcounted(page);
 	return page;
 
- fail:
-	WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+fail:
 	spin_unlock(&hugetlb_lock);
 	return NULL;
 }
 
-/* hugetlb_extend_reservation()
- *
- * Ensure that at least 'atleast' hugepages are, and will remain,
- * available to instantiate the first 'atleast' pages of the given
- * inode.  If the inode doesn't already have this many pages reserved
- * or instantiated, set aside some hugepages in the reserved pool to
- * satisfy later faults (or fail now if there aren't enough, rather
- * than getting the SIGBUS later).
- */
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-			       unsigned long atleast)
-{
-	struct inode *inode = &info->vfs_inode;
-	unsigned long change_in_reserve = 0;
-	int ret = 0;
-
-	spin_lock(&hugetlb_lock);
-	read_lock_irq(&inode->i_mapping->tree_lock);
-
-	if (info->prereserved_hpages >= atleast)
-		goto out;
-
-	/* Because we always call this on shared mappings, none of the
-	 * pages beyond info->prereserved_hpages can have been
-	 * instantiated, so we need to reserve all of them now. */
-	change_in_reserve = atleast - info->prereserved_hpages;
-
-	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	reserved_huge_pages += change_in_reserve;
-	info->prereserved_hpages = atleast;
-
- out:
-	read_unlock_irq(&inode->i_mapping->tree_lock);
-	spin_unlock(&hugetlb_lock);
-
-	return ret;
-}
-
-/* hugetlb_truncate_reservation()
- *
- * This returns pages reserved for the given inode to the general free
- * hugepage pool.  If the inode has any pages prereserved, but not
- * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
- * them.
- */
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-				  unsigned long atmost)
-{
-	struct inode *inode = &info->vfs_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long idx;
-	unsigned long change_in_reserve = 0;
-	struct page *page;
-
-	spin_lock(&hugetlb_lock);
-	read_lock_irq(&inode->i_mapping->tree_lock);
-
-	if (info->prereserved_hpages <= atmost)
-		goto out;
-
-	/* Count pages which were reserved, but not instantiated, and
-	 * which we can now release. */
-	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
-		page = radix_tree_lookup(&mapping->page_tree, idx);
-		if (!page)
-			/* Pages which are already instantiated can't
-			 * be unreserved (and in fact have already
-			 * been removed from the reserved pool) */
-			change_in_reserve++;
-	}
-
-	BUG_ON(reserved_huge_pages < change_in_reserve);
-	reserved_huge_pages -= change_in_reserve;
-	info->prereserved_hpages = atmost;
-
- out:
-	read_unlock_irq(&inode->i_mapping->tree_lock);
-	spin_unlock(&hugetlb_lock);
-}
-
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 		return nr_huge_pages;
 
 	spin_lock(&hugetlb_lock);
-	count = max(count, reserved_huge_pages);
+	count = max(count, resv_huge_pages);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
 		struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
-		        "HugePages_Rsvd:  %5lu\n"
+			"HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
-		        reserved_huge_pages,
+			resv_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
+struct file_region {
+	struct list_head link;
+	long from;
+	long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+	struct file_region *rg, *nrg, *trg;
+
+	/* Locate the region we are either in or before. */
+	list_for_each_entry(rg, head, link)
+		if (f <= rg->to)
+			break;
+
+	/* Round our left edge to the current segment if it encloses us. */
+	if (f > rg->from)
+		f = rg->from;
+
+	/* Check for and consume any regions we now overlap with. */
+	nrg = rg;
+	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		if (rg->from > t)
+			break;
+
+		/* If this area reaches higher then extend our area to
+		 * include it completely.  If this is not the first area
+		 * which we intend to reuse, free it. */
+		if (rg->to > t)
+			t = rg->to;
+		if (rg != nrg) {
+			list_del(&rg->link);
+			kfree(rg);
+		}
+	}
+	nrg->from = f;
+	nrg->to = t;
+	return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+	struct file_region *rg, *nrg;
+	long chg = 0;
+
+	/* Locate the region we are before or in. */
+	list_for_each_entry(rg, head, link)
+		if (f <= rg->to)
+			break;
+
+	/* If we are below the current region then a new region is required.
+	 * Subtle, allocate a new region at the position but make it zero
+	 * size such that we can guarentee to record the reservation. */
+	if (&rg->link == head || t < rg->from) {
+		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+		if (nrg == 0)
+			return -ENOMEM;
+		nrg->from = f;
+		nrg->to   = f;
+		INIT_LIST_HEAD(&nrg->link);
+		list_add(&nrg->link, rg->link.prev);
+
+		return t - f;
+	}
+
+	/* Round our left edge to the current segment if it encloses us. */
+	if (f > rg->from)
+		f = rg->from;
+	chg = t - f;
+
+	/* Check for and consume any regions we now overlap with. */
+	list_for_each_entry(rg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		if (rg->from > t)
+			return chg;
+
+		/* We overlap with this area, if it extends futher than
+		 * us then we must extend ourselves.  Account for its
+		 * existing reservation. */
+		if (rg->to > t) {
+			chg += rg->to - t;
+			t = rg->to;
+		}
+		chg -= rg->to - rg->from;
+	}
+	return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+	struct file_region *rg, *trg;
+	long chg = 0;
+
+	/* Locate the region we are either in or before. */
+	list_for_each_entry(rg, head, link)
+		if (end <= rg->to)
+			break;
+	if (&rg->link == head)
+		return 0;
+
+	/* If we are in the middle of a region then adjust it. */
+	if (end > rg->from) {
+		chg = rg->to - end;
+		rg->to = end;
+		rg = list_entry(rg->link.next, typeof(*rg), link);
+	}
+
+	/* Drop any remaining regions. */
+	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		chg += rg->to - rg->from;
+		list_del(&rg->link);
+		kfree(rg);
+	}
+	return chg;
+}
+
+static int hugetlb_acct_memory(long delta)
+{
+	int ret = -ENOMEM;
+
+	spin_lock(&hugetlb_lock);
+	if ((delta + resv_huge_pages) <= free_huge_pages) {
+		resv_huge_pages += delta;
+		ret = 0;
+	}
+	spin_unlock(&hugetlb_lock);
+	return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+	long ret, chg;
+
+	chg = region_chg(&inode->i_mapping->private_list, from, to);
+	if (chg < 0)
+		return chg;
+	ret = hugetlb_acct_memory(chg);
+	if (ret < 0)
+		return ret;
+	region_add(&inode->i_mapping->private_list, from, to);
+	return 0;
+}
+
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	hugetlb_acct_memory(freed - chg);
+}
-- 
cgit v1.2.3-70-g09d2


From e1b6aa6f1404f162697650df2cdb6c374b1d6a5b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 23 Jun 2006 02:03:17 -0700
Subject: [PATCH] slab: clean up kmem_getpages

The last ifdef addition hit the ugliness treshold on this functions, so:

 - rename the variable i to nr_pages so it's somewhat descriptive
 - remove the addr variable and do the page_address call at the very end
 - instead of ifdef'ing the whole alloc_pages_node call just make the
   __GFP_COMP addition to flags conditional
 - rewrite the __GFP_COMP comment to make sense

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a94cf0fea8a..7bd19639efd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1489,31 +1489,29 @@ __initcall(cpucache_init);
 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
 	struct page *page;
-	void *addr;
+	int nr_pages;
 	int i;
 
-	flags |= cachep->gfpflags;
 #ifndef CONFIG_MMU
-	/* nommu uses slab's for process anonymous memory allocations, so
-	 * requires __GFP_COMP to properly refcount higher order allocations"
+	/*
+	 * Nommu uses slab's for process anonymous memory allocations, and thus
+	 * requires __GFP_COMP to properly refcount higher order allocations
 	 */
-	page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder);
-#else
-	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+	flags |= __GFP_COMP;
 #endif
+	flags |= cachep->gfpflags;
+
+	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
 	if (!page)
 		return NULL;
-	addr = page_address(page);
 
-	i = (1 << cachep->gfporder);
+	nr_pages = (1 << cachep->gfporder);
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
-		atomic_add(i, &slab_reclaim_pages);
-	add_page_state(nr_slab, i);
-	while (i--) {
-		__SetPageSlab(page);
-		page++;
-	}
-	return addr;
+		atomic_add(nr_pages, &slab_reclaim_pages);
+	add_page_state(nr_slab, nr_pages);
+	for (i = 0; i < nr_pages; i++)
+		__SetPageSlab(page + i);
+	return page_address(page);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 7a7c381d25067b9a2bfe025dfcb16459daec0373 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 23 Jun 2006 02:03:17 -0700
Subject: [PATCH] slab: stop using list_for_each

Use the _entry variant everywhere to clean the code up a tiny bit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 38 +++++++++++---------------------------
 1 file changed, 11 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 7bd19639efd..54bd2eb3f2f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1950,8 +1950,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	void (*dtor)(void*, struct kmem_cache *, unsigned long))
 {
 	size_t left_over, slab_size, ralign;
-	struct kmem_cache *cachep = NULL;
-	struct list_head *p;
+	struct kmem_cache *cachep = NULL, *pc;
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -1971,8 +1970,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	mutex_lock(&cache_chain_mutex);
 
-	list_for_each(p, &cache_chain) {
-		struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
+	list_for_each_entry(pc, &cache_chain, next) {
 		mm_segment_t old_fs = get_fs();
 		char tmp;
 		int res;
@@ -3690,7 +3688,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
  */
 static void cache_reap(void *unused)
 {
-	struct list_head *walk;
+	struct kmem_cache *searchp;
 	struct kmem_list3 *l3;
 	int node = numa_node_id();
 
@@ -3701,13 +3699,11 @@ static void cache_reap(void *unused)
 		return;
 	}
 
-	list_for_each(walk, &cache_chain) {
-		struct kmem_cache *searchp;
+	list_for_each_entry(searchp, &cache_chain, next) {
 		struct list_head *p;
 		int tofree;
 		struct slab *slabp;
 
-		searchp = list_entry(walk, struct kmem_cache, next);
 		check_irq_on();
 
 		/*
@@ -3835,7 +3831,6 @@ static void s_stop(struct seq_file *m, void *p)
 static int s_show(struct seq_file *m, void *p)
 {
 	struct kmem_cache *cachep = p;
-	struct list_head *q;
 	struct slab *slabp;
 	unsigned long active_objs;
 	unsigned long num_objs;
@@ -3856,15 +3851,13 @@ static int s_show(struct seq_file *m, void *p)
 		check_irq_on();
 		spin_lock_irq(&l3->list_lock);
 
-		list_for_each(q, &l3->slabs_full) {
-			slabp = list_entry(q, struct slab, list);
+		list_for_each_entry(slabp, &l3->slabs_full, list) {
 			if (slabp->inuse != cachep->num && !error)
 				error = "slabs_full accounting error";
 			active_objs += cachep->num;
 			active_slabs++;
 		}
-		list_for_each(q, &l3->slabs_partial) {
-			slabp = list_entry(q, struct slab, list);
+		list_for_each_entry(slabp, &l3->slabs_partial, list) {
 			if (slabp->inuse == cachep->num && !error)
 				error = "slabs_partial inuse accounting error";
 			if (!slabp->inuse && !error)
@@ -3872,8 +3865,7 @@ static int s_show(struct seq_file *m, void *p)
 			active_objs += slabp->inuse;
 			active_slabs++;
 		}
-		list_for_each(q, &l3->slabs_free) {
-			slabp = list_entry(q, struct slab, list);
+		list_for_each_entry(slabp, &l3->slabs_free, list) {
 			if (slabp->inuse && !error)
 				error = "slabs_free/inuse accounting error";
 			num_slabs++;
@@ -3966,7 +3958,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 {
 	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
 	int limit, batchcount, shared, res;
-	struct list_head *p;
+	struct kmem_cache *cachep;
 
 	if (count > MAX_SLABINFO_WRITE)
 		return -EINVAL;
@@ -3985,10 +3977,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 	/* Find the cache in the chain of caches. */
 	mutex_lock(&cache_chain_mutex);
 	res = -EINVAL;
-	list_for_each(p, &cache_chain) {
-		struct kmem_cache *cachep;
-
-		cachep = list_entry(p, struct kmem_cache, next);
+	list_for_each_entry(cachep, &cache_chain, next) {
 		if (!strcmp(cachep->name, kbuf)) {
 			if (limit < 1 || batchcount < 1 ||
 					batchcount > limit || shared < 0) {
@@ -4090,7 +4079,6 @@ static void show_symbol(struct seq_file *m, unsigned long address)
 static int leaks_show(struct seq_file *m, void *p)
 {
 	struct kmem_cache *cachep = p;
-	struct list_head *q;
 	struct slab *slabp;
 	struct kmem_list3 *l3;
 	const char *name;
@@ -4115,14 +4103,10 @@ static int leaks_show(struct seq_file *m, void *p)
 		check_irq_on();
 		spin_lock_irq(&l3->list_lock);
 
-		list_for_each(q, &l3->slabs_full) {
-			slabp = list_entry(q, struct slab, list);
+		list_for_each_entry(slabp, &l3->slabs_full, list)
 			handle_slab(n, cachep, slabp);
-		}
-		list_for_each(q, &l3->slabs_partial) {
-			slabp = list_entry(q, struct slab, list);
+		list_for_each_entry(slabp, &l3->slabs_partial, list)
 			handle_slab(n, cachep, slabp);
-		}
 		spin_unlock_irq(&l3->list_lock);
 	}
 	name = cachep->name;
-- 
cgit v1.2.3-70-g09d2


From d6277db4ab271862ed599da08d78961c70f00002 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 23 Jun 2006 02:03:18 -0700
Subject: [PATCH] swsusp: rework memory shrinker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rework the swsusp's memory shrinker in the following way:

- Simplify balance_pgdat() by removing all of the swsusp-related code
  from it.

- Make shrink_all_memory() use shrink_slab() and a new function
  shrink_all_zones() which calls shrink_active_list() and
  shrink_inactive_list() directly for each zone in a way that's optimized
  for suspend.

In shrink_all_memory() we try to free exactly as many pages as the caller
asks for, preferably in one shot, starting from easier targets.   If slab
caches are huge, they are most likely to have enough pages to reclaim.
 The inactive lists are next (the zones with more inactive pages go first)
etc.

Each time shrink_all_memory() attempts to shrink the active and inactive
lists for each zone in 5 passes.   In the first pass, only the inactive
lists are taken into consideration.   In the next two passes the active
lists are also shrunk, but mapped pages are not reclaimed.   In the last
two passes the active and inactive lists are shrunk and mapped pages are
reclaimed as well.  The aim of this is to alter the reclaim logic to choose
the best pages to keep on resume and improve the responsiveness of the
resumed system.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c |  10 ++-
 mm/vmscan.c           | 219 +++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 172 insertions(+), 57 deletions(-)

(limited to 'mm')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e..f9238faf76e 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
  */
 
 #define SHRINK_BITE	10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+	if (tmp > SHRINK_BITE)
+		tmp = SHRINK_BITE;
+	return shrink_all_memory(tmp);
+}
 
 int swsusp_shrink_memory(void)
 {
@@ -195,12 +201,12 @@ int swsusp_shrink_memory(void)
 			if (!is_highmem(zone))
 				tmp -= zone->free_pages;
 		if (tmp > 0) {
-			tmp = shrink_all_memory(SHRINK_BITE);
+			tmp = __shrink_memory(tmp);
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
 		} else if (size > image_size / PAGE_SIZE) {
-			tmp = shrink_all_memory(SHRINK_BITE);
+			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
 			pages += tmp;
 		}
 		printk("\b%c", p[i++%4]);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e..46be8a02280 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+	int swappiness;
 };
 
 /*
@@ -741,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * A 100% value of vm_swappiness overrides this algorithm
 		 * altogether.
 		 */
-		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
 
 		/*
 		 * Now use this metric to decide whether to start moving mapped
@@ -957,6 +959,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
+		.swappiness = vm_swappiness,
 	};
 
 	inc_page_state(allocstall);
@@ -1021,10 +1024,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies.  This is a software suspend
- * special.
- *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1041,8 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
-	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
@@ -1055,7 +1052,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
 	};
 
 loop_again:
@@ -1082,31 +1080,26 @@ loop_again:
 
 		all_zones_ok = 1;
 
-		if (nr_pages == 0) {
-			/*
-			 * Scan in the highmem->dma direction for the highest
-			 * zone which needs scanning
-			 */
-			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-				struct zone *zone = pgdat->node_zones + i;
+		/*
+		 * Scan in the highmem->dma direction for the highest
+		 * zone which needs scanning
+		 */
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			struct zone *zone = pgdat->node_zones + i;
 
-				if (!populated_zone(zone))
-					continue;
+			if (!populated_zone(zone))
+				continue;
 
-				if (zone->all_unreclaimable &&
-						priority != DEF_PRIORITY)
-					continue;
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
 
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, 0, 0)) {
-					end_zone = i;
-					goto scan;
-				}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       0, 0)) {
+				end_zone = i;
+				goto scan;
 			}
-			goto out;
-		} else {
-			end_zone = pgdat->nr_zones - 1;
 		}
+		goto out;
 scan:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1126,9 @@ scan:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			if (nr_pages == 0) {	/* Not software suspend */
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0))
-					all_zones_ok = 0;
-			}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       end_zone, 0))
+				all_zones_ok = 0;
 			zone->temp_priority = priority;
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
@@ -1162,8 +1153,6 @@ scan:
 			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > nr_reclaimed)
-			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
 		/*
@@ -1179,7 +1168,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1261,7 +1250,7 @@ static int kswapd(void *p)
 		}
 		finish_wait(&pgdat->kswapd_wait, &wait);
 
-		balance_pgdat(pgdat, 0, order);
+		balance_pgdat(pgdat, order);
 	}
 	return 0;
 }
@@ -1290,35 +1279,154 @@ void wakeup_kswapd(struct zone *zone, int order)
 
 #ifdef CONFIG_PM
 /*
- * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
+				      int prio, struct scan_control *sc)
+{
+	struct zone *zone;
+	unsigned long nr_to_scan, ret = 0;
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+			continue;
+
+		/* For pass = 0 we don't shrink the active list */
+		if (pass > 0) {
+			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			if (zone->nr_scan_active >= nr_pages || pass > 3) {
+				zone->nr_scan_active = 0;
+				nr_to_scan = min(nr_pages, zone->nr_active);
+				shrink_active_list(nr_to_scan, zone, sc);
+			}
+		}
+
+		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+			zone->nr_scan_inactive = 0;
+			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			if (ret >= nr_pages)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
+	unsigned long lru_pages, nr_slab;
 	unsigned long ret = 0;
-	unsigned retry = 2;
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
+	int pass;
+	struct reclaim_state reclaim_state;
+	struct zone *zone;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 0,
+		.swap_cluster_max = nr_pages,
+		.may_writepage = 1,
+		.swappiness = vm_swappiness,
 	};
 
 	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;
 
-		freed = balance_pgdat(pgdat, nr_to_free, 0);
-		ret += freed;
-		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+	lru_pages = 0;
+	for_each_zone(zone)
+		lru_pages += zone->nr_active + zone->nr_inactive;
+
+	nr_slab = read_page_state(nr_slab);
+	/* If slab caches are huge, it's better to hit them first */
+	while (nr_slab >= lru_pages) {
+		reclaim_state.reclaimed_slab = 0;
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		if (!reclaim_state.reclaimed_slab)
 			break;
+
+		ret += reclaim_state.reclaimed_slab;
+		if (ret >= nr_pages)
+			goto out;
+
+		nr_slab -= reclaim_state.reclaimed_slab;
 	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
+
+	/*
+	 * We try to shrink LRUs in 5 passes:
+	 * 0 = Reclaim from inactive_list only
+	 * 1 = Reclaim from active list but don't reclaim mapped
+	 * 2 = 2nd pass of type 1
+	 * 3 = Reclaim mapped (normal reclaim)
+	 * 4 = 2nd pass of type 3
+	 */
+	for (pass = 0; pass < 5; pass++) {
+		int prio;
+
+		/* Needed for shrinking slab caches later on */
+		if (!lru_pages)
+			for_each_zone(zone) {
+				lru_pages += zone->nr_active;
+				lru_pages += zone->nr_inactive;
+			}
+
+		/* Force reclaiming mapped pages in the passes #3 and #4 */
+		if (pass > 2) {
+			sc.may_swap = 1;
+			sc.swappiness = 100;
+		}
+
+		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+			unsigned long nr_to_scan = nr_pages - ret;
+
+			sc.nr_mapped = read_page_state(nr_mapped);
+			sc.nr_scanned = 0;
+
+			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (ret >= nr_pages)
+				goto out;
+
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+			if (ret >= nr_pages)
+				goto out;
+
+			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+				blk_congestion_wait(WRITE, HZ / 10);
+		}
+
+		lru_pages = 0;
 	}
+
+	/*
+	 * If ret = 0, we could not shrink LRUs, but there may be something
+	 * in slab caches
+	 */
+	if (!ret)
+		do {
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
 	current->reclaim_state = NULL;
+
 	return ret;
 }
 #endif
@@ -1416,6 +1524,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
+		.swappiness = vm_swappiness,
 	};
 
 	disable_swap_token();
-- 
cgit v1.2.3-70-g09d2


From 833423143c3a7c6545e409d65febd0d92deb351b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 23 Jun 2006 02:03:20 -0700
Subject: [PATCH] mm: introduce remap_vmalloc_range()

Add remap_vmalloc_range, vmalloc_user, and vmalloc_32_user so that drivers
can have a nice interface for remapping vmalloc memory.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/vmalloc.h |   8 ++++
 mm/vmalloc.c            | 122 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 128 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 1d5577b2b75..f6024ab4eff 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -4,10 +4,13 @@
 #include <linux/spinlock.h>
 #include <asm/page.h>		/* pgprot_t */
 
+struct vm_area_struct;
+
 /* bits in vm_struct->flags */
 #define VM_IOREMAP	0x00000001	/* ioremap() and friends */
 #define VM_ALLOC	0x00000002	/* vmalloc() */
 #define VM_MAP		0x00000004	/* vmap()ed pages */
+#define VM_USERMAP	0x00000008	/* suitable for remap_vmalloc_range */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -32,9 +35,11 @@ struct vm_struct {
  *	Highlevel APIs for driver use
  */
 extern void *vmalloc(unsigned long size);
+extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
+extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
 extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
 				pgprot_t prot);
@@ -45,6 +50,9 @@ extern void vfree(void *addr);
 extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);
 extern void vunmap(void *addr);
+
+extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+							unsigned long pgoff);
  
 /*
  *	Lowlevel-APIs (not for driver use!)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c0504f1e34e..35f8553f893 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,6 +256,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int
 	return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
 }
 
+/* Caller must hold vmlist_lock */
+static struct vm_struct *__find_vm_area(void *addr)
+{
+	struct vm_struct *tmp;
+
+	for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
+		 if (tmp->addr == addr)
+			break;
+	}
+
+	return tmp;
+}
+
 /* Caller must hold vmlist_lock */
 struct vm_struct *__remove_vm_area(void *addr)
 {
@@ -498,10 +511,32 @@ EXPORT_SYMBOL(__vmalloc);
  */
 void *vmalloc(unsigned long size)
 {
-       return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
 EXPORT_SYMBOL(vmalloc);
 
+/**
+ *	vmalloc_user  -  allocate virtually contiguous memory which has
+ *			   been zeroed so it can be mapped to userspace without
+ *			   leaking data.
+ *
+ *	@size:		allocation size
+ */
+void *vmalloc_user(unsigned long size)
+{
+	struct vm_struct *area;
+	void *ret;
+
+	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+	write_lock(&vmlist_lock);
+	area = __find_vm_area(ret);
+	area->flags |= VM_USERMAP;
+	write_unlock(&vmlist_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
 /**
  *	vmalloc_node  -  allocate memory on a specific node
  *
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc);
  */
 void *vmalloc_node(unsigned long size, int node)
 {
-       return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+	return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
 }
 EXPORT_SYMBOL(vmalloc_node);
 
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc_32);
 
+/**
+ *	vmalloc_32_user  -  allocate virtually contiguous memory (32bit
+ *			      addressable) which is zeroed so it can be
+ *			      mapped to userspace without leaking data.
+ *
+ *	@size:		allocation size
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+	struct vm_struct *area;
+	void *ret;
+
+	ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+	write_lock(&vmlist_lock);
+	area = __find_vm_area(ret);
+	area->flags |= VM_USERMAP;
+	write_unlock(&vmlist_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
 long vread(char *buf, char *addr, unsigned long count)
 {
 	struct vm_struct *tmp;
@@ -630,3 +687,64 @@ finished:
 	read_unlock(&vmlist_lock);
 	return buf - buf_start;
 }
+
+/**
+ *	remap_vmalloc_range  -  map vmalloc pages to userspace
+ *
+ *	@vma:		vma to cover (map full range of vma)
+ *	@addr:		vmalloc memory
+ *	@pgoff:		number of pages into addr before first page to map
+ *	@returns:	0 for success, -Exxx on failure
+ *
+ *	This function checks that addr is a valid vmalloc'ed area, and
+ *	that it is big enough to cover the vma. Will return failure if
+ *	that criteria isn't met.
+ *
+ *	Similar to remap_pfn_range (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+						unsigned long pgoff)
+{
+	struct vm_struct *area;
+	unsigned long uaddr = vma->vm_start;
+	unsigned long usize = vma->vm_end - vma->vm_start;
+	int ret;
+
+	if ((PAGE_SIZE-1) & (unsigned long)addr)
+		return -EINVAL;
+
+	read_lock(&vmlist_lock);
+	area = __find_vm_area(addr);
+	if (!area)
+		goto out_einval_locked;
+
+	if (!(area->flags & VM_USERMAP))
+		goto out_einval_locked;
+
+	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
+		goto out_einval_locked;
+	read_unlock(&vmlist_lock);
+
+	addr += pgoff << PAGE_SHIFT;
+	do {
+		struct page *page = vmalloc_to_page(addr);
+		ret = vm_insert_page(vma, uaddr, page);
+		if (ret)
+			return ret;
+
+		uaddr += PAGE_SIZE;
+		addr += PAGE_SIZE;
+		usize -= PAGE_SIZE;
+	} while (usize > 0);
+
+	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
+	vma->vm_flags |= VM_RESERVED;
+
+	return ret;
+
+out_einval_locked:
+	read_unlock(&vmlist_lock);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
-- 
cgit v1.2.3-70-g09d2


From b344e05c585406904c70865e531e02467c4c7931 Mon Sep 17 00:00:00 2001
From: Hua Zhong <hzhong@gmail.com>
Date: Fri, 23 Jun 2006 02:03:23 -0700
Subject: [PATCH] likely cleanup: remove unlikely in sys_mprotect()

With likely/unlikely profiling on my not-so-busy-typical-developmentsystem
there are 5k misses vs 2k hits.  So I guess we should remove the unlikely.

Signed-off-by: Hua Zhong <hzhong@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mprotect.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c14d4289b6..5faf01ad3ef 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -205,8 +205,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC:
 	 */
-	if (unlikely((prot & PROT_READ) &&
-			(current->personality & READ_IMPLIES_EXEC)))
+	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
 		prot |= PROT_EXEC;
 
 	vm_flags = calc_vm_prot_bits(prot);
-- 
cgit v1.2.3-70-g09d2


From 58ce1fd5805647a58a050bbbbd2252ea5ecb47b3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:03:24 -0700
Subject: [PATCH] slab: redzone double-free detection

At present our slab debugging tells us that it detected a double-free or
corruption - it does not distinguish between them.  Sometimes it's useful
to be able to differentiate between these two types of information.

Add double-free detection to redzone verification when freeing an object.
As explained by Manfred, when we are freeing an object, both redzones
should be RED_ACTIVE.  However, if both are RED_INACTIVE, we are trying to
free an object that was already free'd.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 54bd2eb3f2f..2046da2aa46 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2636,6 +2636,28 @@ static void kfree_debugcheck(const void *objp)
 	}
 }
 
+static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
+{
+	unsigned long redzone1, redzone2;
+
+	redzone1 = *dbg_redzone1(cache, obj);
+	redzone2 = *dbg_redzone2(cache, obj);
+
+	/*
+	 * Redzone is ok.
+	 */
+	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
+		return;
+
+	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
+		slab_error(cache, "double free detected");
+	else
+		slab_error(cache, "memory outside object was overwritten");
+
+	printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
+			obj, redzone1, redzone2);
+}
+
 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 				   void *caller)
 {
@@ -2659,15 +2681,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	slabp = page_get_slab(page);
 
 	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
-				*dbg_redzone2(cachep, objp) != RED_ACTIVE) {
-			slab_error(cachep, "double free, or memory outside"
-						" object was overwritten");
-			printk(KERN_ERR "%p: redzone 1:0x%lx, "
-					"redzone 2:0x%lx.\n",
-			       objp, *dbg_redzone1(cachep, objp),
-			       *dbg_redzone2(cachep, objp));
-		}
+		verify_redzone_free(cachep, objp);
 		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
 	}
-- 
cgit v1.2.3-70-g09d2


From 111ebb6e6f7bd7de6d722c5848e95621f43700d9 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 23 Jun 2006 02:03:26 -0700
Subject: [PATCH] writeback: fix range handling

When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request.  Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".

To make all this sane, the patch changes range of writeback_control.

So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.

And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.

This patch does,

    - Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
      -1 is usually ok for range_end (type is long long). But, if someone did,

		range_end += val;		range_end is "val - 1"
		u64val = range_end >> bits;	u64val is "~(0ULL)"

      or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
      things, and uses LLONG_MAX for range_end.

    - All callers of ->writepages() sets range_start/end or range_cyclic.

    - Fix updates of ->writeback_index. It seems already bit strange.
      If it starts at 0 and ended by check of nr_to_write, this last
      index may reduce chance to scan end of file.  So, this updates
      ->writeback_index only if range_cyclic is true or whole-file is
      scanned.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/file.c            | 24 +++++++++++-------------
 fs/fs-writeback.c         |  4 ++++
 fs/mpage.c                | 22 ++++++++++------------
 fs/sync.c                 |  2 +-
 include/linux/kernel.h    |  3 +++
 include/linux/writeback.h |  5 +++--
 mm/filemap.c              |  6 +++---
 mm/page-writeback.c       |  3 +++
 mm/vmscan.c               |  2 ++
 9 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2b4ce1dad6..487ea8b3baa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1079,9 +1079,9 @@ static int cifs_writepages(struct address_space *mapping,
 	unsigned int bytes_written;
 	struct cifs_sb_info *cifs_sb;
 	int done = 0;
-	pgoff_t end = -1;
+	pgoff_t end;
 	pgoff_t index;
-	int is_range = 0;
+ 	int range_whole = 0;
 	struct kvec iov[32];
 	int len;
 	int n_iov = 0;
@@ -1122,16 +1122,14 @@ static int cifs_writepages(struct address_space *mapping,
 	xid = GetXid();
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE)
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
-	else {
-		index = 0;
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -1167,7 +1165,7 @@ retry:
 				break;
 			}
 
-			if (unlikely(is_range) && (page->index > end)) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				break;
@@ -1271,7 +1269,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 
 	FreeXid(xid);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f3fbe2d030f..6db95cf3aaa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -461,6 +461,8 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 {
 	struct writeback_control wbc = {
 		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
 	};
 	unsigned long nr_dirty = read_page_state(nr_dirty);
 	unsigned long nr_unstable = read_page_state(nr_unstable);
@@ -559,6 +561,8 @@ int write_inode_now(struct inode *inode, int sync)
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = WB_SYNC_ALL,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
 	};
 
 	if (!mapping_cap_writeback_dirty(inode->i_mapping))
diff --git a/fs/mpage.c b/fs/mpage.c
index 9bf2eb30e6f..1e4598247d0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -707,9 +707,9 @@ mpage_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
-	pgoff_t end = -1;		/* Inclusive */
+	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
-	int is_range = 0;
+	int range_whole = 0;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -721,16 +721,14 @@ mpage_writepages(struct address_space *mapping,
 		writepage = mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE) {
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
 	} else {
-		index = 0;			  /* whole-file sweep */
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -759,7 +757,7 @@ retry:
 				continue;
 			}
 
-			if (unlikely(is_range) && page->index > end) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				continue;
@@ -810,7 +808,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
diff --git a/fs/sync.c b/fs/sync.c
index aab5ffe77e9..955aef04da2 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -100,7 +100,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 	}
 
 	if (nbytes == 0)
-		endbyte = -1;
+		endbyte = LLONG_MAX;
 	else
 		endbyte--;		/* inclusive */
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f4fc576ed4c..25fccd859fb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -24,6 +24,9 @@ extern const char linux_banner[];
 #define LONG_MAX	((long)(~0UL>>1))
 #define LONG_MIN	(-LONG_MAX - 1)
 #define ULONG_MAX	(~0UL)
+#define LLONG_MAX	((long long)(~0ULL>>1))
+#define LLONG_MIN	(-LLONG_MAX - 1)
+#define ULLONG_MAX	(~0ULL)
 
 #define STACK_MAGIC	0xdeadbeef
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 56f92fcbe94..9e38b566d0e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -50,14 +50,15 @@ struct writeback_control {
 	 * a hint that the filesystem need only write out the pages inside that
 	 * byterange.  The byte at `end' is included in the writeout request.
 	 */
-	loff_t start;
-	loff_t end;
+	loff_t range_start;
+	loff_t range_end;
 
 	unsigned nonblocking:1;		/* Don't get stuck on request queues */
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned for_writepages:1;	/* This is a writepages() call */
+	unsigned range_cyclic:1;	/* range_start is cyclic */
 };
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index fd57442186c..3342067ca43 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -190,8 +190,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
 		.nr_to_write = mapping->nrpages * 2,
-		.start = start,
-		.end = end,
+		.range_start = start,
+		.range_end = end,
 	};
 
 	if (!mapping_cap_writeback_dirty(mapping))
@@ -204,7 +204,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 static inline int __filemap_fdatawrite(struct address_space *mapping,
 	int sync_mode)
 {
-	return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
+	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
 }
 
 int filemap_fdatawrite(struct address_space *mapping)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 75d7f48b79b..8ccf6f1b147 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 			.sync_mode	= WB_SYNC_NONE,
 			.older_than_this = NULL,
 			.nr_to_write	= write_chunk,
+			.range_cyclic	= 1,
 		};
 
 		get_dirty_limits(&wbs, &background_thresh,
@@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages)
 		.older_than_this = NULL,
 		.nr_to_write	= 0,
 		.nonblocking	= 1,
+		.range_cyclic	= 1,
 	};
 
 	for ( ; ; ) {
@@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg)
 		.nr_to_write	= 0,
 		.nonblocking	= 1,
 		.for_kupdate	= 1,
+		.range_cyclic	= 1,
 	};
 
 	sync_supers();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 46be8a02280..bc5d4f43036 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -339,6 +339,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_NONE,
 			.nr_to_write = SWAP_CLUSTER_MAX,
+			.range_start = 0,
+			.range_end = LLONG_MAX,
 			.nonblocking = 1,
 			.for_reclaim = 1,
 		};
-- 
cgit v1.2.3-70-g09d2


From 7352349a19e84b28f10668eca9cbb6bbbfa9d38e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:27 -0700
Subject: [PATCH] page migration cleanup: rename "ignrefs" to "migration"

migrate is a better name since it is only used by page migration.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/rmap.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 1963e269314..10806b7af40 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -578,7 +578,7 @@ void page_remove_rmap(struct page *page)
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-				int ignore_refs)
+				int migration)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -602,7 +602,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 */
 	if ((vma->vm_flags & VM_LOCKED) ||
 			(ptep_clear_flush_young(vma, address, pte)
-				&& !ignore_refs)) {
+				&& !migration)) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
@@ -736,7 +736,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	pte_unmap_unlock(pte - 1, ptl);
 }
 
-static int try_to_unmap_anon(struct page *page, int ignore_refs)
+static int try_to_unmap_anon(struct page *page, int migration)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
@@ -747,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
 		return ret;
 
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		ret = try_to_unmap_one(page, vma, ignore_refs);
+		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
 			break;
 	}
@@ -764,7 +764,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
  *
  * This function is only called from try_to_unmap for object-based pages.
  */
-static int try_to_unmap_file(struct page *page, int ignore_refs)
+static int try_to_unmap_file(struct page *page, int migration)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -778,7 +778,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs)
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-		ret = try_to_unmap_one(page, vma, ignore_refs);
+		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
 			goto out;
 	}
@@ -863,16 +863,16 @@ out:
  * SWAP_AGAIN	- we missed a mapping, try again later
  * SWAP_FAIL	- the page is unswappable
  */
-int try_to_unmap(struct page *page, int ignore_refs)
+int try_to_unmap(struct page *page, int migration)
 {
 	int ret;
 
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
-		ret = try_to_unmap_anon(page, ignore_refs);
+		ret = try_to_unmap_anon(page, migration);
 	else
-		ret = try_to_unmap_file(page, ignore_refs);
+		ret = try_to_unmap_file(page, migration);
 
 	if (!page_mapped(page))
 		ret = SWAP_SUCCESS;
-- 
cgit v1.2.3-70-g09d2


From 1d8b85ccf1ed53a71b092fb5d807edf1ea7dabdd Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:28 -0700
Subject: [PATCH] page migration cleanup: group functions

Reorder functions in migrate.c.  Group all migration functions for struct
address_space_operations together.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 142 ++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 72 insertions(+), 70 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 1c25040693d..49e71ddb679 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -119,15 +119,6 @@ int putback_lru_pages(struct list_head *l)
 	return count;
 }
 
-/*
- * Non migratable page
- */
-int fail_migrate_page(struct page *newpage, struct page *page)
-{
-	return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
@@ -297,6 +288,17 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 }
 EXPORT_SYMBOL(migrate_page_copy);
 
+/************************************************************
+ *                    Migration functions
+ ***********************************************************/
+
+/* Always fail migration. Used for mappings that are not movable */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
 /*
  * Common logic to directly migrate a single page suitable for
  * pages that do not use PagePrivate.
@@ -329,6 +331,67 @@ int migrate_page(struct page *newpage, struct page *page)
 }
 EXPORT_SYMBOL(migrate_page);
 
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct page *newpage, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct buffer_head *bh, *head;
+	int rc;
+
+	if (!mapping)
+		return -EAGAIN;
+
+	if (!page_has_buffers(page))
+		return migrate_page(newpage, page);
+
+	head = page_buffers(page);
+
+	rc = migrate_page_remove_references(newpage, page, 3);
+
+	if (rc)
+		return rc;
+
+	bh = head;
+	do {
+		get_bh(bh);
+		lock_buffer(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	ClearPagePrivate(page);
+	set_page_private(newpage, page_private(page));
+	set_page_private(page, 0);
+	put_page(page);
+	get_page(newpage);
+
+	bh = head;
+	do {
+		set_bh_page(bh, newpage, bh_offset(bh));
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	SetPagePrivate(newpage);
+
+	migrate_page_copy(newpage, page);
+
+	bh = head;
+	do {
+		unlock_buffer(bh);
+ 		put_bh(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+
 /*
  * migrate_pages
  *
@@ -528,67 +591,6 @@ next:
 	return nr_failed + retry;
 }
 
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist.
- */
-int buffer_migrate_page(struct page *newpage, struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-	struct buffer_head *bh, *head;
-	int rc;
-
-	if (!mapping)
-		return -EAGAIN;
-
-	if (!page_has_buffers(page))
-		return migrate_page(newpage, page);
-
-	head = page_buffers(page);
-
-	rc = migrate_page_remove_references(newpage, page, 3);
-
-	if (rc)
-		return rc;
-
-	bh = head;
-	do {
-		get_bh(bh);
-		lock_buffer(bh);
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	ClearPagePrivate(page);
-	set_page_private(newpage, page_private(page));
-	set_page_private(page, 0);
-	put_page(page);
-	get_page(newpage);
-
-	bh = head;
-	do {
-		set_bh_page(bh, newpage, bh_offset(bh));
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	SetPagePrivate(newpage);
-
-	migrate_page_copy(newpage, page);
-
-	bh = head;
-	do {
-		unlock_buffer(bh);
- 		put_bh(bh);
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	return 0;
-}
-EXPORT_SYMBOL(buffer_migrate_page);
-
 /*
  * Migrate the list 'pagelist' of pages to a certain destination.
  *
-- 
cgit v1.2.3-70-g09d2


From e7340f73307abed9283d0a07570d06e228c205dd Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:29 -0700
Subject: [PATCH] page migration cleanup: remove useless definitions

Remove the export for migrate_page_remove_references() and migrate_page_copy()
that are unlikely to be used directly by filesystems implementing migration.
The export was useful when buffer_migrate_page() lived in fs/buffer.c but it
has now been moved to migrate.c in the migration reorg.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h | 2 --
 mm/migrate.c            | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 6789c4940c9..e8d3b08cc35 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -7,8 +7,6 @@
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct page *, struct page *);
-extern void migrate_page_copy(struct page *, struct page *);
-extern int migrate_page_remove_references(struct page *, struct page *, int);
 extern int migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
 extern int migrate_pages_to(struct list_head *pagelist,
diff --git a/mm/migrate.c b/mm/migrate.c
index 49e71ddb679..be3f141e53a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -169,7 +169,7 @@ retry:
  * Remove references for a page and establish the new page with the correct
  * basic settings to be able to stop accesses to the page.
  */
-int migrate_page_remove_references(struct page *newpage,
+static int migrate_page_remove_references(struct page *newpage,
 				struct page *page, int nr_refs)
 {
 	struct address_space *mapping = page_mapping(page);
@@ -246,12 +246,11 @@ int migrate_page_remove_references(struct page *newpage,
 
 	return 0;
 }
-EXPORT_SYMBOL(migrate_page_remove_references);
 
 /*
  * Copy the page to its new location
  */
-void migrate_page_copy(struct page *newpage, struct page *page)
+static void migrate_page_copy(struct page *newpage, struct page *page)
 {
 	copy_highpage(newpage, page);
 
@@ -286,7 +285,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 	if (PageWriteback(newpage))
 		end_page_writeback(newpage);
 }
-EXPORT_SYMBOL(migrate_page_copy);
 
 /************************************************************
  *                    Migration functions
-- 
cgit v1.2.3-70-g09d2


From 5b5c7120e2154239837fad5e3c7b7b781092b19c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:29 -0700
Subject: [PATCH] page migration cleanup: drop nr_refs in remove_references()

Drop nr_refs parameter from migrate_page_remove_references()

The nr_refs parameter is not really useful since the number of remaining
references is always

1 for anonymous pages without a mapping
2 for pages with a mapping
3 for pages with a mapping and PagePrivate set.

Remove the early check for the number of references since we are checking
page_mapcount() earlier.  Ultimately only the refcount matters after the
tree_lock has been obtained.

Signed-off-by: Christoph Lameter <clameter@sgi.coim>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index be3f141e53a..2803a6698dd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -168,19 +168,19 @@ retry:
 /*
  * Remove references for a page and establish the new page with the correct
  * basic settings to be able to stop accesses to the page.
+ *
+ * The number of remaining references must be:
+ * 1 for anonymous pages without a mapping
+ * 2 for pages with a mapping
+ * 3 for pages with a mapping and PagePrivate set.
  */
 static int migrate_page_remove_references(struct page *newpage,
-				struct page *page, int nr_refs)
+				struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 	struct page **radix_pointer;
 
-	/*
-	 * Avoid doing any of the following work if the page count
-	 * indicates that the page is in use or truncate has removed
-	 * the page.
-	 */
-	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+	if (!mapping)
 		return -EAGAIN;
 
 	/*
@@ -218,7 +218,8 @@ static int migrate_page_remove_references(struct page *newpage,
 						&mapping->page_tree,
 						page_index(page));
 
-	if (!page_mapping(page) || page_count(page) != nr_refs ||
+	if (!page_mapping(page) ||
+			page_count(page) != 2 + !!PagePrivate(page) ||
 			*radix_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
@@ -309,7 +310,7 @@ int migrate_page(struct page *newpage, struct page *page)
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_remove_references(newpage, page, 2);
+	rc = migrate_page_remove_references(newpage, page);
 
 	if (rc)
 		return rc;
@@ -348,7 +349,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 
 	head = page_buffers(page);
 
-	rc = migrate_page_remove_references(newpage, page, 3);
+	rc = migrate_page_remove_references(newpage, page);
 
 	if (rc)
 		return rc;
-- 
cgit v1.2.3-70-g09d2


From c3fcf8a5daacf350f0632e1379414c01f34eeea3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:32 -0700
Subject: [PATCH] page migration cleanup: extract try_to_unmap from migration
 functions

Extract try_to_unmap and rename remove_references -> move_mapping

try_to_unmap() may significantly change the page state by for example setting
the dirty bit.  It is therefore best to unmap in migrate_pages() before
calling any migration functions.

migrate_page_remove_references() will then only move the new page in place of
the old page in the mapping.  Rename the function to
migrate_page_move_mapping().

This allows us to get rid of the special unmapping for the fallback path.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 76 +++++++++++++++++++++++++-----------------------------------
 1 file changed, 31 insertions(+), 45 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 2803a6698dd..8095c607a49 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -166,15 +166,14 @@ retry:
 }
 
 /*
- * Remove references for a page and establish the new page with the correct
- * basic settings to be able to stop accesses to the page.
+ * Replace the page in the mapping.
  *
  * The number of remaining references must be:
  * 1 for anonymous pages without a mapping
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate set.
  */
-static int migrate_page_remove_references(struct page *newpage,
+static int migrate_page_move_mapping(struct page *newpage,
 				struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
@@ -183,35 +182,6 @@ static int migrate_page_remove_references(struct page *newpage,
 	if (!mapping)
 		return -EAGAIN;
 
-	/*
-	 * Establish swap ptes for anonymous pages or destroy pte
-	 * maps for files.
-	 *
-	 * In order to reestablish file backed mappings the fault handlers
-	 * will take the radix tree_lock which may then be used to stop
-  	 * processses from accessing this page until the new page is ready.
-	 *
-	 * A process accessing via a swap pte (an anonymous page) will take a
-	 * page_lock on the old page which will block the process until the
-	 * migration attempt is complete. At that time the PageSwapCache bit
-	 * will be examined. If the page was migrated then the PageSwapCache
-	 * bit will be clear and the operation to retrieve the page will be
-	 * retried which will find the new page in the radix tree. Then a new
-	 * direct mapping may be generated based on the radix tree contents.
-	 *
-	 * If the page was not migrated then the PageSwapCache bit
-	 * is still set and the operation may continue.
-	 */
-	if (try_to_unmap(page, 1) == SWAP_FAIL)
-		/* A vma has VM_LOCKED set -> permanent failure */
-		return -EPERM;
-
-	/*
-	 * Give up if we were unable to remove all mappings.
-	 */
-	if (page_mapcount(page))
-		return -EAGAIN;
-
 	write_lock_irq(&mapping->tree_lock);
 
 	radix_pointer = (struct page **)radix_tree_lookup_slot(
@@ -310,7 +280,7 @@ int migrate_page(struct page *newpage, struct page *page)
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_remove_references(newpage, page);
+	rc = migrate_page_move_mapping(newpage, page);
 
 	if (rc)
 		return rc;
@@ -349,7 +319,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 
 	head = page_buffers(page);
 
-	rc = migrate_page_remove_references(newpage, page);
+	rc = migrate_page_move_mapping(newpage, page);
 
 	if (rc)
 		return rc;
@@ -481,6 +451,33 @@ redo:
 		newpage = lru_to_page(to);
 		lock_page(newpage);
 
+		/*
+		 * Establish swap ptes for anonymous pages or destroy pte
+		 * maps for files.
+		 *
+		 * In order to reestablish file backed mappings the fault handlers
+		 * will take the radix tree_lock which may then be used to stop
+	  	 * processses from accessing this page until the new page is ready.
+		 *
+		 * A process accessing via a swap pte (an anonymous page) will take a
+		 * page_lock on the old page which will block the process until the
+		 * migration attempt is complete. At that time the PageSwapCache bit
+		 * will be examined. If the page was migrated then the PageSwapCache
+		 * bit will be clear and the operation to retrieve the page will be
+		 * retried which will find the new page in the radix tree. Then a new
+		 * direct mapping may be generated based on the radix tree contents.
+		 *
+		 * If the page was not migrated then the PageSwapCache bit
+		 * is still set and the operation may continue.
+		 */
+		rc = -EPERM;
+		if (try_to_unmap(page, 1) == SWAP_FAIL)
+			/* A vma has VM_LOCKED set -> permanent failure */
+			goto unlock_both;
+
+		rc = -EAGAIN;
+		if (page_mapped(page))
+			goto unlock_both;
 		/*
 		 * Pages are properly locked and writeback is complete.
 		 * Try to migrate the page.
@@ -501,17 +498,6 @@ redo:
 			goto unlock_both;
                 }
 
-		/* Make sure the dirty bit is up to date */
-		if (try_to_unmap(page, 1) == SWAP_FAIL) {
-			rc = -EPERM;
-			goto unlock_both;
-		}
-
-		if (page_mapcount(page)) {
-			rc = -EAGAIN;
-			goto unlock_both;
-		}
-
 		/*
 		 * Default handling if a filesystem does not provide
 		 * a migration function. We can only migrate clean
-- 
cgit v1.2.3-70-g09d2


From 2d1db3b1170db4e8bf0531dd636742269c2cf579 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:33 -0700
Subject: [PATCH] page migration cleanup: pass "mapping" to migration functions

Change handling of address spaces.

Pass a pointer to the address space in which the page is migrated to all
migration function.  This avoids repeatedly having to retrieve the address
space pointer from the page and checking it for validity.  The old page
mapping will change once migration has gone to a certain step, so it is less
confusing to have the pointer always available.

Move the setting of the mapping and index for the new page into
migrate_pages().

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fs.h      |  6 +++--
 include/linux/migrate.h |  6 +++--
 mm/migrate.c            | 70 ++++++++++++++++++++++++-------------------------
 3 files changed, 42 insertions(+), 40 deletions(-)

(limited to 'mm')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c823a3815e2..e917403f4d5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -376,7 +376,8 @@ struct address_space_operations {
 	struct page* (*get_xip_page)(struct address_space *, sector_t,
 			int);
 	/* migrate the contents of a page to the specified target */
-	int (*migratepage) (struct page *, struct page *);
+	int (*migratepage) (struct address_space *,
+			struct page *, struct page *);
 };
 
 struct backing_dev_info;
@@ -1772,7 +1773,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count);
 extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t);
 
 #ifdef CONFIG_MIGRATION
-extern int buffer_migrate_page(struct page *, struct page *);
+extern int buffer_migrate_page(struct address_space *,
+				struct page *, struct page *);
 #else
 #define buffer_migrate_page NULL
 #endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e8d3b08cc35..287c47b5e5d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -6,12 +6,14 @@
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
-extern int migrate_page(struct page *, struct page *);
+extern int migrate_page(struct address_space *,
+			struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
 extern int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest);
-extern int fail_migrate_page(struct page *, struct page *);
+extern int fail_migrate_page(struct address_space *,
+			struct page *, struct page *);
 
 extern int migrate_prep(void);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 8095c607a49..f65e69d9452 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -173,15 +173,11 @@ retry:
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate set.
  */
-static int migrate_page_move_mapping(struct page *newpage,
-				struct page *page)
+static int migrate_page_move_mapping(struct address_space *mapping,
+		struct page *newpage, struct page *page)
 {
-	struct address_space *mapping = page_mapping(page);
 	struct page **radix_pointer;
 
-	if (!mapping)
-		return -EAGAIN;
-
 	write_lock_irq(&mapping->tree_lock);
 
 	radix_pointer = (struct page **)radix_tree_lookup_slot(
@@ -197,15 +193,8 @@ static int migrate_page_move_mapping(struct page *newpage,
 
 	/*
 	 * Now we know that no one else is looking at the page.
-	 *
-	 * Certain minimal information about a page must be available
-	 * in order for other subsystems to properly handle the page if they
-	 * find it through the radix tree update before we are finished
-	 * copying the page.
 	 */
 	get_page(newpage);
-	newpage->index = page->index;
-	newpage->mapping = page->mapping;
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
 		set_page_private(newpage, page_private(page));
@@ -262,7 +251,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
  ***********************************************************/
 
 /* Always fail migration. Used for mappings that are not movable */
-int fail_migrate_page(struct page *newpage, struct page *page)
+int fail_migrate_page(struct address_space *mapping,
+			struct page *newpage, struct page *page)
 {
 	return -EIO;
 }
@@ -274,13 +264,14 @@ EXPORT_SYMBOL(fail_migrate_page);
  *
  * Pages are locked upon entry and exit.
  */
-int migrate_page(struct page *newpage, struct page *page)
+int migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page)
 {
 	int rc;
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page);
 
 	if (rc)
 		return rc;
@@ -305,21 +296,18 @@ EXPORT_SYMBOL(migrate_page);
  * if the underlying filesystem guarantees that no other references to "page"
  * exist.
  */
-int buffer_migrate_page(struct page *newpage, struct page *page)
+int buffer_migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page)
 {
-	struct address_space *mapping = page->mapping;
 	struct buffer_head *bh, *head;
 	int rc;
 
-	if (!mapping)
-		return -EAGAIN;
-
 	if (!page_has_buffers(page))
-		return migrate_page(newpage, page);
+		return migrate_page(mapping, newpage, page);
 
 	head = page_buffers(page);
 
-	rc = migrate_page_move_mapping(newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page);
 
 	if (rc)
 		return rc;
@@ -448,9 +436,6 @@ redo:
 			goto next;
 		}
 
-		newpage = lru_to_page(to);
-		lock_page(newpage);
-
 		/*
 		 * Establish swap ptes for anonymous pages or destroy pte
 		 * maps for files.
@@ -473,11 +458,18 @@ redo:
 		rc = -EPERM;
 		if (try_to_unmap(page, 1) == SWAP_FAIL)
 			/* A vma has VM_LOCKED set -> permanent failure */
-			goto unlock_both;
+			goto unlock_page;
 
 		rc = -EAGAIN;
 		if (page_mapped(page))
-			goto unlock_both;
+			goto unlock_page;
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+		/* Prepare mapping for the new page.*/
+		newpage->index = page->index;
+		newpage->mapping = page->mapping;
+
 		/*
 		 * Pages are properly locked and writeback is complete.
 		 * Try to migrate the page.
@@ -494,7 +486,8 @@ redo:
 			 * own migration function. This is the most common
 			 * path for page migration.
 			 */
-			rc = mapping->a_ops->migratepage(newpage, page);
+			rc = mapping->a_ops->migratepage(mapping,
+							newpage, page);
 			goto unlock_both;
                 }
 
@@ -524,7 +517,7 @@ redo:
 		 */
 		if (!page_has_buffers(page) ||
 		    try_to_release_page(page, GFP_KERNEL)) {
-			rc = migrate_page(newpage, page);
+			rc = migrate_page(mapping, newpage, page);
 			goto unlock_both;
 		}
 
@@ -553,12 +546,17 @@ unlock_page:
 		unlock_page(page);
 
 next:
-		if (rc == -EAGAIN) {
-			retry++;
-		} else if (rc) {
-			/* Permanent failure */
-			list_move(&page->lru, failed);
-			nr_failed++;
+		if (rc) {
+			if (newpage)
+				newpage->mapping = NULL;
+
+			if (rc == -EAGAIN)
+				retry++;
+			else {
+				/* Permanent failure */
+				list_move(&page->lru, failed);
+				nr_failed++;
+			}
 		} else {
 			if (newpage) {
 				/* Successful migration. Return page to LRU */
-- 
cgit v1.2.3-70-g09d2


From 8351a6e4785218a2b03c142be92926baff95ba5c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:33 -0700
Subject: [PATCH] page migration cleanup: move fallback handling into special
 function

Move the fallback code into a new fallback function and make the function
behave like any other migration function.  This requires retaking the lock if
pageout() drops it.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 90 ++++++++++++++++++++++++++----------------------------------
 1 file changed, 39 insertions(+), 51 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index f65e69d9452..5a340f4ca21 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -349,6 +349,42 @@ int buffer_migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(buffer_migrate_page);
 
+static int fallback_migrate_page(struct address_space *mapping,
+	struct page *newpage, struct page *page)
+{
+	/*
+	 * Default handling if a filesystem does not provide
+	 * a migration function. We can only migrate clean
+	 * pages so try to write out any dirty pages first.
+	 */
+	if (PageDirty(page)) {
+		switch (pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			return -EAGAIN;
+
+		case PAGE_SUCCESS:
+			/* Relock since we lost the lock */
+			lock_page(page);
+			/* Must retry since page state may have changed */
+			return -EAGAIN;
+
+		case PAGE_CLEAN:
+			; /* try to migrate the page below */
+		}
+	}
+
+	/*
+	 * Buffers may be managed in a filesystem specific way.
+	 * We must have no buffers or drop them.
+	 */
+	if (page_has_buffers(page) &&
+	    !try_to_release_page(page, GFP_KERNEL))
+		return -EAGAIN;
+
+	return migrate_page(mapping, newpage, page);
+}
+
 /*
  * migrate_pages
  *
@@ -478,7 +514,7 @@ redo:
 		if (!mapping)
 			goto unlock_both;
 
-		if (mapping->a_ops->migratepage) {
+		if (mapping->a_ops->migratepage)
 			/*
 			 * Most pages have a mapping and most filesystems
 			 * should provide a migration function. Anonymous
@@ -488,56 +524,8 @@ redo:
 			 */
 			rc = mapping->a_ops->migratepage(mapping,
 							newpage, page);
-			goto unlock_both;
-                }
-
-		/*
-		 * Default handling if a filesystem does not provide
-		 * a migration function. We can only migrate clean
-		 * pages so try to write out any dirty pages first.
-		 */
-		if (PageDirty(page)) {
-			switch (pageout(page, mapping)) {
-			case PAGE_KEEP:
-			case PAGE_ACTIVATE:
-				goto unlock_both;
-
-			case PAGE_SUCCESS:
-				unlock_page(newpage);
-				goto next;
-
-			case PAGE_CLEAN:
-				; /* try to migrate the page below */
-			}
-                }
-
-		/*
-		 * Buffers are managed in a filesystem specific way.
-		 * We must have no buffers or drop them.
-		 */
-		if (!page_has_buffers(page) ||
-		    try_to_release_page(page, GFP_KERNEL)) {
-			rc = migrate_page(mapping, newpage, page);
-			goto unlock_both;
-		}
-
-		/*
-		 * On early passes with mapped pages simply
-		 * retry. There may be a lock held for some
-		 * buffers that may go away. Later
-		 * swap them out.
-		 */
-		if (pass > 4) {
-			/*
-			 * Persistently unable to drop buffers..... As a
-			 * measure of last resort we fall back to
-			 * swap_page().
-			 */
-			unlock_page(newpage);
-			newpage = NULL;
-			rc = swap_page(page);
-			goto next;
-		}
+		else
+			rc = fallback_migrate_page(mapping, newpage, page);
 
 unlock_both:
 		unlock_page(newpage);
-- 
cgit v1.2.3-70-g09d2


From 0697212a411c1dae03c27845f2de2f3adb32c331 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:35 -0700
Subject: [PATCH] Swapless page migration: add R/W migration entries

Implement read/write migration ptes

We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.

The VM is modified to handle the migration entries.  migration entries can
only be encountered when the page they are pointing to is locked.  This limits
the number of places one has to fix.  We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.

We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock.  This allows us to effectively stop all accesses
to apge.

Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c

From: Hugh Dickins <hugh@veritas.com>

  Several times while testing swapless page migration (I've no NUMA, just
  hacking it up to migrate recklessly while running load), I've hit the
  BUG_ON(!PageLocked(p)) in migration_entry_to_page.

  This comes from an orphaned migration entry, unrelated to the current
  correctly locked migration, but hit by remove_anon_migration_ptes as it
  checks an address in each vma of the anon_vma list.

  Such an orphan may be left behind if an earlier migration raced with fork:
  copy_one_pte can duplicate a migration entry from parent to child, after
  remove_anon_migration_ptes has checked the child vma, but before it has
  removed it from the parent vma.  (If the process were later to fault on this
  orphaned entry, it would hit the same BUG from migration_entry_wait.)

  This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
  not.  There's no such problem with file pages, because vma_prio_tree_add
  adds child vma after parent vma, and the page table locking at each end is
  enough to serialize.  Follow that example with anon_vma: add new vmas to the
  tail instead of the head.

  (There's no corresponding problem when inserting migration entries,
  because a missed pte will leave the page count and mapcount high, which is
  allowed for.  And there's no corresponding problem when migrating via swap,
  because a leftover swap entry will be correctly faulted.  But the swapless
  method has no refcounting of its entries.)

From: Ingo Molnar <mingo@elte.hu>

  pte_unmap_unlock() takes the pte pointer as an argument.

From: Hugh Dickins <hugh@veritas.com>

  Several times while testing swapless page migration, gcc has tried to exec
  a pointer instead of a string: smells like COW mappings are not being
  properly write-protected on fork.

  The protection in copy_one_pte looks very convincing, until at last you
  realize that the second arg to make_migration_entry is a boolean "write",
  and SWP_MIGRATION_READ is 30.

  Anyway, it's better done like in change_pte_range, using
  is_write_migration_entry and make_migration_entry_read.

From: Hugh Dickins <hugh@veritas.com>

  Remove unnecessary obfuscation from sys_swapon's range check on swap type,
  which blew up causing memory corruption once swapless migration made
  MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h    |   7 +++
 include/linux/swapops.h |  53 ++++++++++++++++++++
 mm/memory.c             |  18 ++++++-
 mm/migrate.c            | 128 +++++++++++++++++++++++++++++++++++++++++++++++-
 mm/mprotect.c           |  23 +++++++--
 mm/rmap.c               |  38 ++++++++------
 mm/swapfile.c           |  20 +++-----
 7 files changed, 255 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index cd28ad206da..7cee73ef4f1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -28,7 +28,14 @@ static inline int current_is_kswapd(void)
  * the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT	5
+#ifndef CONFIG_MIGRATION
 #define MAX_SWAPFILES		(1 << MAX_SWAPFILES_SHIFT)
+#else
+/* Use last two entries for page migration swap entries */
+#define MAX_SWAPFILES		((1 << MAX_SWAPFILES_SHIFT)-2)
+#define SWP_MIGRATION_READ	MAX_SWAPFILES
+#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)
+#endif
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 87b9d14c710..ec639aa3a1d 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -67,3 +67,56 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+#ifdef CONFIG_MIGRATION
+static inline swp_entry_t make_migration_entry(struct page *page, int write)
+{
+	BUG_ON(!PageLocked(page));
+	return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
+			page_to_pfn(page));
+}
+
+static inline int is_migration_entry(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
+			swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
+}
+
+static inline struct page *migration_entry_to_page(swp_entry_t entry)
+{
+	struct page *p = pfn_to_page(swp_offset(entry));
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding page is locked
+	 */
+	BUG_ON(!PageLocked(p));
+	return p;
+}
+
+static inline void make_migration_entry_read(swp_entry_t *entry)
+{
+	*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
+}
+
+extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+					unsigned long address);
+#else
+
+#define make_migration_entry(page, write) swp_entry(0, 0)
+#define is_migration_entry(swp) 0
+#define migration_entry_to_page(swp) NULL
+static inline void make_migration_entry_read(swp_entry_t *entryp) { }
+static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+					 unsigned long address) { }
+static inline int is_write_migration_entry(swp_entry_t entry)
+{
+	return 0;
+}
+
+#endif
+
diff --git a/mm/memory.c b/mm/memory.c
index 7e3683fd4f3..11673c5d2c2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
 		if (!pte_file(pte)) {
-			swap_duplicate(pte_to_swp_entry(pte));
+			swp_entry_t entry = pte_to_swp_entry(pte);
+
+			swap_duplicate(entry);
 			/* make sure dst_mm is on swapoff's mmlist. */
 			if (unlikely(list_empty(&dst_mm->mmlist))) {
 				spin_lock(&mmlist_lock);
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
+			if (is_write_migration_entry(entry) &&
+					is_cow_mapping(vm_flags)) {
+				/*
+				 * COW mappings require pages in both parent
+				 * and child to be set to read.
+				 */
+				make_migration_entry_read(&entry);
+				pte = swp_entry_to_pte(entry);
+				set_pte_at(src_mm, addr, src_pte, pte);
+			}
 		}
 		goto out_set_pte;
 	}
@@ -1879,6 +1891,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
+	if (is_migration_entry(entry)) {
+		migration_entry_wait(mm, pmd, address);
+		goto out;
+	}
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
diff --git a/mm/migrate.c b/mm/migrate.c
index 5a340f4ca21..0a011e421bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -15,6 +15,7 @@
 #include <linux/migrate.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/mm_inline.h>
@@ -23,7 +24,6 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
-#include <linux/swapops.h>
 
 #include "internal.h"
 
@@ -119,6 +119,132 @@ int putback_lru_pages(struct list_head *l)
 	return count;
 }
 
+static inline int is_swap_pte(pte_t pte)
+{
+	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ */
+static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+		struct page *old, struct page *new)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	swp_entry_t entry;
+ 	pgd_t *pgd;
+ 	pud_t *pud;
+ 	pmd_t *pmd;
+	pte_t *ptep, pte;
+ 	spinlock_t *ptl;
+
+ 	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+                return;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+                return;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return;
+
+	ptep = pte_offset_map(pmd, addr);
+
+	if (!is_swap_pte(*ptep)) {
+		pte_unmap(ptep);
+ 		return;
+ 	}
+
+ 	ptl = pte_lockptr(mm, pmd);
+ 	spin_lock(ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto out;
+
+	entry = pte_to_swp_entry(pte);
+
+	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
+		goto out;
+
+	inc_mm_counter(mm, anon_rss);
+	get_page(new);
+	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+	if (is_write_migration_entry(entry))
+		pte = pte_mkwrite(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	page_add_anon_rmap(new, vma, addr);
+out:
+	pte_unmap_unlock(ptep, ptl);
+}
+
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ *
+ * Must hold mmap_sem lock on at least one of the vmas containing
+ * the page so that the anon_vma cannot vanish.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	unsigned long mapping;
+
+	mapping = (unsigned long)new->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+		return;
+
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+		remove_migration_pte(vma, page_address_in_vma(new, vma),
+					old, new);
+
+	spin_unlock(&anon_vma->lock);
+}
+
+/*
+ * Something used the pte of a page under migration. We need to
+ * get to the page and wait until migration is finished.
+ * When we return from this function the fault will be retried.
+ *
+ * This function is called from do_swap_page().
+ */
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long address)
+{
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	swp_entry_t entry;
+	struct page *page;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto out;
+
+	entry = pte_to_swp_entry(pte);
+	if (!is_migration_entry(entry))
+		goto out;
+
+	page = migration_entry_to_page(entry);
+
+	get_page(page);
+	pte_unmap_unlock(ptep, ptl);
+	wait_on_page_locked(page);
+	put_page(page);
+	return;
+out:
+	pte_unmap_unlock(ptep, ptl);
+}
+
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5faf01ad3ef..14f93e62270 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -19,7 +19,8 @@
 #include <linux/mempolicy.h>
 #include <linux/personality.h>
 #include <linux/syscalls.h>
-
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -28,12 +29,13 @@
 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		unsigned long addr, unsigned long end, pgprot_t newprot)
 {
-	pte_t *pte;
+	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	do {
-		if (pte_present(*pte)) {
+		oldpte = *pte;
+		if (pte_present(oldpte)) {
 			pte_t ptent;
 
 			/* Avoid an SMP race with hardware updated dirty/clean
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
 			set_pte_at(mm, addr, pte, ptent);
 			lazy_mmu_prot_update(ptent);
+#ifdef CONFIG_MIGRATION
+		} else if (!pte_file(oldpte)) {
+			swp_entry_t entry = pte_to_swp_entry(oldpte);
+
+			if (is_write_migration_entry(entry)) {
+				/*
+				 * A protection check is difficult so
+				 * just be safe and disable write
+				 */
+				make_migration_entry_read(&entry);
+				set_pte_at(mm, addr, pte,
+					swp_entry_to_pte(entry));
+			}
+#endif
 		}
+
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 10806b7af40..3b8ce86daa3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
 			vma->anon_vma = anon_vma;
-			list_add(&vma->anon_vma_node, &anon_vma->head);
+			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 			allocated = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma)
 	struct anon_vma *anon_vma = vma->anon_vma;
 
 	if (anon_vma) {
-		list_add(&vma->anon_vma_node, &anon_vma->head);
+		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 		validate_anon_vma(vma);
 	}
 }
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma)
 
 	if (anon_vma) {
 		spin_lock(&anon_vma->lock);
-		list_add(&vma->anon_vma_node, &anon_vma->head);
+		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
 		validate_anon_vma(vma);
 		spin_unlock(&anon_vma->lock);
 	}
@@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			if (list_empty(&mm->mmlist))
-				list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+
+		if (PageSwapCache(page)) {
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+			swap_duplicate(entry);
+			if (list_empty(&mm->mmlist)) {
+				spin_lock(&mmlist_lock);
+				if (list_empty(&mm->mmlist))
+					list_add(&mm->mmlist, &init_mm.mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+		} else {
+			/*
+			 * Store the pfn of the page in a special migration
+			 * pte. do_swap_page() will wait until the migration
+			 * pte is removed and then restart fault handling.
+			 */
+			BUG_ON(!migration);
+			entry = make_migration_entry(page, pte_write(pteval));
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 47a6812f5f8..e3b1362372c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry)
 	struct swap_info_struct * p;
 	struct page *page = NULL;
 
+	if (is_migration_entry(entry))
+		return;
+
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
@@ -1400,19 +1403,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		if (!(p->flags & SWP_USED))
 			break;
 	error = -EPERM;
-	/*
-	 * Test if adding another swap device is possible. There are
-	 * two limiting factors: 1) the number of bits for the swap
-	 * type swp_entry_t definition and 2) the number of bits for
-	 * the swap type in the swap ptes as defined by the different
-	 * architectures. To honor both limitations a swap entry
-	 * with swap offset 0 and swap type ~0UL is created, encoded
-	 * to a swap pte, decoded to a swp_entry_t again and finally
-	 * the swap type part is extracted. This will mask all bits
-	 * from the initial ~0UL that can't be encoded in either the
-	 * swp_entry_t or the architecture definition of a swap pte.
-	 */
-	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+	if (type >= MAX_SWAPFILES) {
 		spin_unlock(&swap_lock);
 		goto out;
 	}
@@ -1702,6 +1693,9 @@ int swap_duplicate(swp_entry_t entry)
 	unsigned long offset, type;
 	int result = 0;
 
+	if (is_migration_entry(entry))
+		return 1;
+
 	type = swp_type(entry);
 	if (type >= nr_swapfiles)
 		goto bad_file;
-- 
cgit v1.2.3-70-g09d2


From d75a0fcda2cfc71b50e16dc89e0c32c57d427e85 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:36 -0700
Subject: [PATCH] Swapless page migration: rip out swap based logic

Rip the page migration logic out.

Remove all code that has to do with swapping during page migration.

This also guts the ability to migrate pages to swap.  No one used that so lets
let it go for good.

Page migration should be a bit broken after this patch.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rmap.h |  1 -
 mm/migrate.c         | 75 +++-------------------------------------------------
 mm/rmap.c            | 38 --------------------------
 mm/swapfile.c        |  9 -------
 4 files changed, 3 insertions(+), 120 deletions(-)

(limited to 'mm')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 2d4c81a220d..bf97b090001 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -91,7 +91,6 @@ static inline void page_dup_rmap(struct page *page)
  */
 int page_referenced(struct page *, int is_locked);
 int try_to_unmap(struct page *, int ignore_refs);
-void remove_from_swap(struct page *page);
 
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
diff --git a/mm/migrate.c b/mm/migrate.c
index 0a011e421bb..81721a061d5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -70,10 +70,6 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
  */
 int migrate_prep(void)
 {
-	/* Must have swap device for migration */
-	if (nr_swap_pages <= 0)
-		return -ENODEV;
-
 	/*
 	 * Clear the LRU lists so pages can be isolated.
 	 * Note that pages may be moved off the LRU after we have
@@ -245,52 +241,6 @@ out:
 	pte_unmap_unlock(ptep, ptl);
 }
 
-/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
- */
-static int swap_page(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-
-	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
-			goto unlock_retry;
-
-	if (PageDirty(page)) {
-		/* Page is dirty, try to write it out here */
-		switch(pageout(page, mapping)) {
-		case PAGE_KEEP:
-		case PAGE_ACTIVATE:
-			goto unlock_retry;
-
-		case PAGE_SUCCESS:
-			goto retry;
-
-		case PAGE_CLEAN:
-			; /* try to free the page below */
-		}
-	}
-
-	if (PagePrivate(page)) {
-		if (!try_to_release_page(page, GFP_KERNEL) ||
-		    (!mapping && page_count(page) == 1))
-			goto unlock_retry;
-	}
-
-	if (remove_mapping(mapping, page)) {
-		/* Success */
-		unlock_page(page);
-		return 0;
-	}
-
-unlock_retry:
-	unlock_page(page);
-
-retry:
-	return -EAGAIN;
-}
-
 /*
  * Replace the page in the mapping.
  *
@@ -517,8 +467,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  * Two lists are passed to this function. The first list
  * contains the pages isolated from the LRU to be migrated.
  * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
+ * can be moved to.
  *
  * The function returns after 10 attempts or if no pages
  * are movable anymore because to has become empty
@@ -574,29 +523,11 @@ redo:
 		 * Only wait on writeback if we have already done a pass where
 		 * we we may have triggered writeouts for lots of pages.
 		 */
-		if (pass > 0) {
+		if (pass > 0)
 			wait_on_page_writeback(page);
-		} else {
+		else
 			if (PageWriteback(page))
 				goto unlock_page;
-		}
-
-		/*
-		 * Anonymous pages must have swap cache references otherwise
-		 * the information contained in the page maps cannot be
-		 * preserved.
-		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page, GFP_KERNEL)) {
-				rc = -ENOMEM;
-				goto unlock_page;
-			}
-		}
-
-		if (!to) {
-			rc = swap_page(page);
-			goto next;
-		}
 
 		/*
 		 * Establish swap ptes for anonymous pages or destroy pte
diff --git a/mm/rmap.c b/mm/rmap.c
index 3b8ce86daa3..a53a10b93ec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -205,44 +205,6 @@ out:
 	return anon_vma;
 }
 
-#ifdef CONFIG_MIGRATION
-/*
- * Remove an anonymous page from swap replacing the swap pte's
- * through real pte's pointing to valid pages and then releasing
- * the page from the swap cache.
- *
- * Must hold page lock on page and mmap_sem of one vma that contains
- * the page.
- */
-void remove_from_swap(struct page *page)
-{
-	struct anon_vma *anon_vma;
-	struct vm_area_struct *vma;
-	unsigned long mapping;
-
-	if (!PageSwapCache(page))
-		return;
-
-	mapping = (unsigned long)page->mapping;
-
-	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
-		return;
-
-	/*
-	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
-	 */
-	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
-	spin_lock(&anon_vma->lock);
-
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-		remove_vma_swap(vma, page);
-
-	spin_unlock(&anon_vma->lock);
-	delete_from_swap_cache(page);
-}
-EXPORT_SYMBOL(remove_from_swap);
-#endif
-
 /*
  * At what user virtual address is page expected in vma?
  */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e3b1362372c..fbceed67a07 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -618,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm,
 	return 0;
 }
 
-#ifdef CONFIG_MIGRATION
-int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
-{
-	swp_entry_t entry = { .val = page_private(page) };
-
-	return unuse_vma(vma, entry, page);
-}
-#endif
-
 /*
  * Scan swap_map from current position to next entry still in use.
  * Recycle to start on reaching the end, returning 0 when empty.
-- 
cgit v1.2.3-70-g09d2


From 6c5240ae7f48c83fcaa8e24fa63e7eb09aba5651 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:37 -0700
Subject: [PATCH] Swapless page migration: modify core logic

Use the migration entries for page migration

This modifies the migration code to use the new migration entries.  It now
becomes possible to migrate anonymous pages without having to add a swap
entry.

We add a couple of new functions to replace migration entries with the proper
ptes.

We cannot take the tree_lock for migrating anonymous pages anymore.  However,
we know that we hold the only remaining reference to the page when the page
count reaches 1.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/Kconfig   |  4 ++--
 mm/migrate.c | 53 +++++++++++++++++++++--------------------------------
 2 files changed, 23 insertions(+), 34 deletions(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 332f5c29b53..66e65ab3942 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS
 #
 config MIGRATION
 	bool "Page migration"
-	def_bool y if NUMA
-	depends on SWAP && NUMA
+	def_bool y
+	depends on NUMA
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful for
diff --git a/mm/migrate.c b/mm/migrate.c
index 81721a061d5..8f91463eab4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -254,14 +254,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 {
 	struct page **radix_pointer;
 
+	if (!mapping) {
+		/* Anonymous page */
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
 	write_lock_irq(&mapping->tree_lock);
 
 	radix_pointer = (struct page **)radix_tree_lookup_slot(
 						&mapping->page_tree,
 						page_index(page));
 
-	if (!page_mapping(page) ||
-			page_count(page) != 2 + !!PagePrivate(page) ||
+	if (page_count(page) != 2 + !!PagePrivate(page) ||
 			*radix_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
@@ -271,10 +277,12 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 * Now we know that no one else is looking at the page.
 	 */
 	get_page(newpage);
+#ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
 		set_page_private(newpage, page_private(page));
 	}
+#endif
 
 	*radix_pointer = newpage;
 	__put_page(page);
@@ -308,7 +316,9 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 		set_page_dirty(newpage);
  	}
 
+#ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
+#endif
 	ClearPageActive(page);
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
@@ -353,16 +363,6 @@ int migrate_page(struct address_space *mapping,
 		return rc;
 
 	migrate_page_copy(newpage, page);
-
-	/*
-	 * Remove auxiliary swap entries and replace
-	 * them with real ptes.
-	 *
-	 * Note that a real pte entry will allow processes that are not
-	 * waiting on the page lock to use the new page via the page tables
-	 * before the new page is unlocked.
-	 */
-	remove_from_swap(newpage);
 	return 0;
 }
 EXPORT_SYMBOL(migrate_page);
@@ -530,23 +530,7 @@ redo:
 				goto unlock_page;
 
 		/*
-		 * Establish swap ptes for anonymous pages or destroy pte
-		 * maps for files.
-		 *
-		 * In order to reestablish file backed mappings the fault handlers
-		 * will take the radix tree_lock which may then be used to stop
-	  	 * processses from accessing this page until the new page is ready.
-		 *
-		 * A process accessing via a swap pte (an anonymous page) will take a
-		 * page_lock on the old page which will block the process until the
-		 * migration attempt is complete. At that time the PageSwapCache bit
-		 * will be examined. If the page was migrated then the PageSwapCache
-		 * bit will be clear and the operation to retrieve the page will be
-		 * retried which will find the new page in the radix tree. Then a new
-		 * direct mapping may be generated based on the radix tree contents.
-		 *
-		 * If the page was not migrated then the PageSwapCache bit
-		 * is still set and the operation may continue.
+		 * Establish migration ptes or remove ptes
 		 */
 		rc = -EPERM;
 		if (try_to_unmap(page, 1) == SWAP_FAIL)
@@ -569,9 +553,9 @@ redo:
 		 */
 		mapping = page_mapping(page);
 		if (!mapping)
-			goto unlock_both;
+			rc = migrate_page(mapping, newpage, page);
 
-		if (mapping->a_ops->migratepage)
+		else if (mapping->a_ops->migratepage)
 			/*
 			 * Most pages have a mapping and most filesystems
 			 * should provide a migration function. Anonymous
@@ -584,10 +568,15 @@ redo:
 		else
 			rc = fallback_migrate_page(mapping, newpage, page);
 
-unlock_both:
+		if (!rc)
+			remove_migration_ptes(page, newpage);
+
 		unlock_page(newpage);
 
 unlock_page:
+		if (rc)
+			remove_migration_ptes(page, page);
+
 		unlock_page(page);
 
 next:
-- 
cgit v1.2.3-70-g09d2


From 442c9137de8d769053e81d325709dca72f0b5e44 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:38 -0700
Subject: [PATCH] More page migration: do not inc/dec rss counters

If we install a migration entry then the rss not really decreases since the
page is just moved somewhere else.  We can save ourselves the work of
decrementing and later incrementing which will just eventually cause cacheline
bouncing.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 1 -
 mm/rmap.c    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 8f91463eab4..96b9546e69e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -164,7 +164,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
 	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 		goto out;
 
-	inc_mm_counter(mm, anon_rss);
 	get_page(new);
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
diff --git a/mm/rmap.c b/mm/rmap.c
index a53a10b93ec..05d6d73a692 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -595,6 +595,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 					list_add(&mm->mmlist, &init_mm.mmlist);
 				spin_unlock(&mmlist_lock);
 			}
+			dec_mm_counter(mm, anon_rss);
 		} else {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -606,7 +607,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
-		dec_mm_counter(mm, anon_rss);
 	} else
 		dec_mm_counter(mm, file_rss);
 
-- 
cgit v1.2.3-70-g09d2


From 04e62a29bf157ce1edd168f2b71b533c80d13628 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:38 -0700
Subject: [PATCH] More page migration: use migration entries for file pages

This implements the use of migration entries to preserve ptes of file backed
pages during migration.  Processes can therefore be migrated back and forth
without loosing their connection to pagecache pages.

Note that we implement the migration entries only for linear mappings.
Nonlinear mappings still require the unmapping of the ptes for migration.

And another writepage() ugliness shows up.  writepage() can drop the page
lock.  Therefore we have to remove migration ptes before calling writepages()
in order to avoid having migration entries point to unlocked pages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  15 ------
 mm/migrate.c         | 127 ++++++++++++++++++++++++++++++++++++++++-----------
 mm/rmap.c            |  11 +++++
 mm/vmscan.c          |  14 +++++-
 4 files changed, 124 insertions(+), 43 deletions(-)

(limited to 'mm')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7cee73ef4f1..1cf234e8df5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -186,20 +186,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 
-/* possible outcome of pageout() */
-typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
-	/* move page to the active list, page is locked */
-	PAGE_ACTIVATE,
-	/* page has been sent to the disk successfully, page is unlocked */
-	PAGE_SUCCESS,
-	/* page is clean and locked */
-	PAGE_CLEAN,
-} pageout_t;
-
-extern pageout_t pageout(struct page *page, struct address_space *mapping);
-
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
 extern int zone_reclaim_interval;
@@ -259,7 +245,6 @@ extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
 
 extern spinlock_t swap_lock;
-extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
 
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
diff --git a/mm/migrate.c b/mm/migrate.c
index 96b9546e69e..b5000d46389 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -24,6 +24,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/writeback.h>
 
 #include "internal.h"
 
@@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
+static void remove_migration_pte(struct vm_area_struct *vma,
 		struct page *old, struct page *new)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
  	pmd_t *pmd;
 	pte_t *ptep, pte;
  	spinlock_t *ptl;
+	unsigned long addr = page_address_in_vma(new, vma);
+
+	if (addr == -EFAULT)
+		return;
 
  	pgd = pgd_offset(mm, addr);
 	if (!pgd_present(*pgd))
@@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
 	set_pte_at(mm, addr, ptep, pte);
-	page_add_anon_rmap(new, vma, addr);
+
+	if (PageAnon(new))
+		page_add_anon_rmap(new, vma, addr);
+	else
+		page_add_file_rmap(new);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, addr, pte);
+	lazy_mmu_prot_update(pte);
+
 out:
 	pte_unmap_unlock(ptep, ptl);
 }
 
 /*
- * Get rid of all migration entries and replace them by
- * references to the indicated page.
- *
+ * Note that remove_file_migration_ptes will only work on regular mappings,
+ * Nonlinear mappings do not use migration entries.
+ */
+static void remove_file_migration_ptes(struct page *old, struct page *new)
+{
+	struct vm_area_struct *vma;
+	struct address_space *mapping = page_mapping(new);
+	struct prio_tree_iter iter;
+	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+	if (!mapping)
+		return;
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+		remove_migration_pte(vma, old, new);
+
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
  * Must hold mmap_sem lock on at least one of the vmas containing
  * the page so that the anon_vma cannot vanish.
  */
-static void remove_migration_ptes(struct page *old, struct page *new)
+static void remove_anon_migration_ptes(struct page *old, struct page *new)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
@@ -199,12 +232,23 @@ static void remove_migration_ptes(struct page *old, struct page *new)
 	spin_lock(&anon_vma->lock);
 
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-		remove_migration_pte(vma, page_address_in_vma(new, vma),
-					old, new);
+		remove_migration_pte(vma, old, new);
 
 	spin_unlock(&anon_vma->lock);
 }
 
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+	if (PageAnon(new))
+		remove_anon_migration_ptes(old, new);
+	else
+		remove_file_migration_ptes(old, new);
+}
+
 /*
  * Something used the pte of a page under migration. We need to
  * get to the page and wait until migration is finished.
@@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(buffer_migrate_page);
 
-static int fallback_migrate_page(struct address_space *mapping,
-	struct page *newpage, struct page *page)
+/*
+ * Writeback a page to clean the dirty state
+ */
+static int writeout(struct address_space *mapping, struct page *page)
 {
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = 1,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+		.nonblocking = 1,
+		.for_reclaim = 1
+	};
+	int rc;
+
+	if (!mapping->a_ops->writepage)
+		/* No write method for the address space */
+		return -EINVAL;
+
+	if (!clear_page_dirty_for_io(page))
+		/* Someone else already triggered a write */
+		return -EAGAIN;
+
 	/*
-	 * Default handling if a filesystem does not provide
-	 * a migration function. We can only migrate clean
-	 * pages so try to write out any dirty pages first.
+	 * A dirty page may imply that the underlying filesystem has
+	 * the page on some queue. So the page must be clean for
+	 * migration. Writeout may mean we loose the lock and the
+	 * page state is no longer what we checked for earlier.
+	 * At this point we know that the migration attempt cannot
+	 * be successful.
 	 */
-	if (PageDirty(page)) {
-		switch (pageout(page, mapping)) {
-		case PAGE_KEEP:
-		case PAGE_ACTIVATE:
-			return -EAGAIN;
+	remove_migration_ptes(page, page);
 
-		case PAGE_SUCCESS:
-			/* Relock since we lost the lock */
-			lock_page(page);
-			/* Must retry since page state may have changed */
-			return -EAGAIN;
+	rc = mapping->a_ops->writepage(page, &wbc);
+	if (rc < 0)
+		/* I/O Error writing */
+		return -EIO;
 
-		case PAGE_CLEAN:
-			; /* try to migrate the page below */
-		}
-	}
+	if (rc != AOP_WRITEPAGE_ACTIVATE)
+		/* unlocked. Relock */
+		lock_page(page);
+
+	return -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+	struct page *newpage, struct page *page)
+{
+	if (PageDirty(page))
+		return writeout(mapping, page);
 
 	/*
 	 * Buffers may be managed in a filesystem specific way.
diff --git a/mm/rmap.c b/mm/rmap.c
index 05d6d73a692..882a85826bb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+#ifdef CONFIG_MIGRATION
 		} else {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			 */
 			BUG_ON(!migration);
 			entry = make_migration_entry(page, pte_write(pteval));
+#endif
 		}
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
 	} else
+#ifdef CONFIG_MIGRATION
+	if (migration) {
+		/* Establish migration entry for a file page */
+		swp_entry_t entry;
+		entry = make_migration_entry(page, pte_write(pteval));
+		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+	} else
+#endif
 		dec_mm_counter(mm, file_rss);
 
+
 	page_remove_rmap(page);
 	page_cache_release(page);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc5d4f43036..71a02e29503 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
 	unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
-- 
cgit v1.2.3-70-g09d2


From ddc2e812d592457747c4367fb73edcaa8e1e49ff Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:03:40 -0700
Subject: [PATCH] slab: verify pointers before free

Passing an invalid pointer to kfree() and kmem_cache_free() is likely to
cause bad memory corruption or even take down the whole system because the
bad pointer is likely reused immediately due to the per-CPU caches.  Until
now, we don't do any verification for this if CONFIG_DEBUG_SLAB is
disabled.

As suggested by Linus, add PageSlab check to page_to_cache() and
page_to_slab() to verify pointers passed to kfree().  Also, move the
stronger check from cache_free_debugcheck() to kmem_cache_free() to ensure
the passed pointer actually belongs to the cache we're about to free the
object.

For page_to_cache() and page_to_slab(), the assertions should have
virtually no extra cost (two instructions, no data cache pressure) and for
kmem_cache_free() the overhead should be minimal.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 2046da2aa46..e3260db04b9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -592,6 +592,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
+	BUG_ON(!PageSlab(page));
 	return (struct kmem_cache *)page->lru.next;
 }
 
@@ -604,6 +605,7 @@ static inline struct slab *page_get_slab(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
+	BUG_ON(!PageSlab(page));
 	return (struct slab *)page->lru.prev;
 }
 
@@ -2669,15 +2671,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	kfree_debugcheck(objp);
 	page = virt_to_page(objp);
 
-	if (page_get_cache(page) != cachep) {
-		printk(KERN_ERR "mismatch in kmem_cache_free: expected "
-				"cache %p, got %p\n",
-		       page_get_cache(page), cachep);
-		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
-		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
-		       page_get_cache(page)->name);
-		WARN_ON(1);
-	}
 	slabp = page_get_slab(page);
 
 	if (cachep->flags & SLAB_RED_ZONE) {
@@ -3393,6 +3386,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 {
 	unsigned long flags;
 
+	BUG_ON(virt_to_cache(objp) != cachep);
+
 	local_irq_save(flags);
 	__cache_free(cachep, objp);
 	local_irq_restore(flags);
-- 
cgit v1.2.3-70-g09d2


From 30c253e6da655d73eb8bfe2adca9b8f4d82fb81e Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 23 Jun 2006 02:03:41 -0700
Subject: [PATCH] sparsemem: record nid during memory present

Record the node id as we mark sections for instantiation.  Use this nid
during instantiation to direct allocations.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Cc: Mike Kravetz <kravetz@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Bob Picco <bob.picco@hp.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Martin Bligh <mbligh@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h |  5 +++++
 mm/sparse.c            | 22 ++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e82fc1a52cd..d6120fa6911 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -509,6 +509,10 @@ struct mem_section {
 	 * pages.  However, it is stored with some other magic.
 	 * (see sparse.c::sparse_init_one_section())
 	 *
+	 * Additionally during early boot we encode node id of
+	 * the location of the section here to guide allocation.
+	 * (see sparse.c::memory_present())
+	 *
 	 * Making it a UL at least makes someone do a cast
 	 * before using it wrong.
 	 */
@@ -548,6 +552,7 @@ extern int __section_nr(struct mem_section* ms);
 #define SECTION_HAS_MEM_MAP	(1UL<<1)
 #define SECTION_MAP_LAST_BIT	(1UL<<2)
 #define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
+#define SECTION_NID_SHIFT	2
 
 static inline struct page *__section_mem_map_addr(struct mem_section *section)
 {
diff --git a/mm/sparse.c b/mm/sparse.c
index 100040c0dfb..e0a3fe48aa3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms)
 	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
 
+/*
+ * During early boot, before section_mem_map is used for an actual
+ * mem_map, we use section_mem_map to store the section's NUMA
+ * node.  This keeps us from having to use another data structure.  The
+ * node information is cleared just before we store the real mem_map.
+ */
+static inline unsigned long sparse_encode_early_nid(int nid)
+{
+	return (nid << SECTION_NID_SHIFT);
+}
+
+static inline int sparse_early_nid(struct mem_section *section)
+{
+	return (section->section_mem_map >> SECTION_NID_SHIFT);
+}
+
 /* Record a memory area against a node. */
 void memory_present(int nid, unsigned long start, unsigned long end)
 {
@@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 
 		ms = __nr_to_section(section);
 		if (!ms->section_mem_map)
-			ms->section_mem_map = SECTION_MARKED_PRESENT;
+			ms->section_mem_map = sparse_encode_early_nid(nid) |
+							SECTION_MARKED_PRESENT;
 	}
 }
 
@@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms,
 	if (!valid_section(ms))
 		return -EINVAL;
 
+	ms->section_mem_map &= ~SECTION_MAP_MASK;
 	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
 
 	return 1;
@@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms,
 static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 {
 	struct page *map;
-	int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
 	struct mem_section *ms = __nr_to_section(pnum);
+	int nid = sparse_early_nid(ms);
 
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
-- 
cgit v1.2.3-70-g09d2


From 9637a5efd4fbe36164c5ce7f6a0ee68b2bf22b7f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:03:43 -0700
Subject: [PATCH] add page_mkwrite() vm_operations method

Add a new VMA operation to notify a filesystem or other driver about the
MMU generating a fault because userspace attempted to write to a page
mapped through a read-only PTE.

This facility permits the filesystem or driver to:

 (*) Implement storage allocation/reservation on attempted write, and so to
     deal with problems such as ENOSPC more gracefully (perhaps by generating
     SIGBUS).

 (*) Delay making the page writable until the contents have been written to a
     backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS.
     It permits the filesystem to have some guarantee about the state of the
     cache.

 (*) Account and limit number of dirty pages. This is one piece of the puzzle
     needed to make shared writable mapping work safely in FUSE.

Needed by cachefs (Or is it cachefiles?  Or fscache? <head spins>).

At least four other groups have stated an interest in it or a desire to use
the functionality it provides: FUSE, OCFS2, NTFS and JFFS2.  Also, things like
EXT3 really ought to use it to deal with the case of shared-writable mmap
encountering ENOSPC before we permit the page to be dirtied.

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

  get_user_pages(.write=1, .force=1) can generate COW hits on read-only
  shared mappings, this patch traps those as mkpage_write candidates and fails
  to handle them the old way.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |   4 +++
 mm/memory.c        | 100 ++++++++++++++++++++++++++++++++++++++++-------------
 mm/mmap.c          |  12 +++++--
 mm/mprotect.c      |  11 ++++--
 4 files changed, 99 insertions(+), 28 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 697c6bf248c..3b09444121d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -199,6 +199,10 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+
+	/* notification that a previously read-only page is about to become
+	 * writable, if an error is returned it will cause a SIGBUS */
+	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
 #ifdef CONFIG_NUMA
 	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 11673c5d2c2..247b5c312b9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1457,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
+	int reuse, ret = VM_FAULT_MINOR;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-		int reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address, pte_pfn(orig_pte));
-			entry = pte_mkyoung(orig_pte);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			ptep_set_access_flags(vma, address, page_table, entry, 1);
-			update_mmu_cache(vma, address, entry);
-			lazy_mmu_prot_update(entry);
-			ret |= VM_FAULT_WRITE;
-			goto unlock;
+	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
+				(VM_SHARED|VM_WRITE))) {
+		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+			/*
+			 * Notify the address space that the page is about to
+			 * become writable so that it can prohibit this or wait
+			 * for the page to get into an appropriate state.
+			 *
+			 * We do this without the lock held, so that it can
+			 * sleep if it needs to.
+			 */
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+
+			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+				goto unwritable_page;
+
+			page_cache_release(old_page);
+
+			/*
+			 * Since we dropped the lock we need to revalidate
+			 * the PTE as someone else may have changed it.  If
+			 * they did, we just return, as we can count on the
+			 * MMU to tell us if they didn't also make it writable.
+			 */
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte))
+				goto unlock;
 		}
+
+		reuse = 1;
+	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
+		reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
+	} else {
+		reuse = 0;
+	}
+
+	if (reuse) {
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = pte_mkyoung(orig_pte);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ptep_set_access_flags(vma, address, page_table, entry, 1);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+		ret |= VM_FAULT_WRITE;
+		goto unlock;
 	}
 
 	/*
@@ -1535,6 +1570,10 @@ oom:
 	if (old_page)
 		page_cache_release(old_page);
 	return VM_FAULT_OOM;
+
+unwritable_page:
+	page_cache_release(old_page);
+	return VM_FAULT_SIGBUS;
 }
 
 /*
@@ -2083,18 +2122,31 @@ retry:
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
-		struct page *page;
+	if (write_access) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			struct page *page;
 
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-		if (!page)
-			goto oom;
-		copy_user_highpage(page, new_page, address);
-		page_cache_release(new_page);
-		new_page = page;
-		anon = 1;
+			if (unlikely(anon_vma_prepare(vma)))
+				goto oom;
+			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			if (!page)
+				goto oom;
+			copy_user_highpage(page, new_page, address);
+			page_cache_release(new_page);
+			new_page = page;
+			anon = 1;
+
+		} else {
+			/* if the page will be shareable, see if the backing
+			 * address space wants to know that the page is about
+			 * to become writable */
+			if (vma->vm_ops->page_mkwrite &&
+			    vma->vm_ops->page_mkwrite(vma, new_page) < 0
+			    ) {
+				page_cache_release(new_page);
+				return VM_FAULT_SIGBUS;
+			}
+		}
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee12344b1..6446c6134b0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,8 @@ munmap_back:
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_flags = vm_flags;
-	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+	vma->vm_page_prot = protection_map[vm_flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
 			goto free_vma;
 	}
 
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		vma->vm_page_prot =
+			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	vma->vm_end = addr + len;
 	vma->vm_pgoff = pgoff;
 	vma->vm_flags = flags;
-	vma->vm_page_prot = protection_map[flags & 0x0f];
+	vma->vm_page_prot = protection_map[flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 out:
 	mm->total_vm += len >> PAGE_SHIFT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 14f93e62270..638edabaff7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -123,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	unsigned int mask;
 	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
@@ -149,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 		}
 	}
 
-	newprot = protection_map[newflags & 0xf];
-
 	/*
 	 * First try to merge with previous and/or next vma.
 	 */
@@ -177,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		mask &= ~VM_SHARED;
+
+	newprot = protection_map[newflags & mask];
+
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
-- 
cgit v1.2.3-70-g09d2


From 70af7c5c6492ef6ad137dbff6c4568c73edbcaf0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Fri, 23 Jun 2006 02:03:44 -0700
Subject: [PATCH] swapoff: use atomic_inc_not_zero() on mm_users

Now that we have atomic_inc_not_zero, it's more elegant for try_to_unuse to
use that on mm_users: doesn't actually matter at present, but safer to be
sure that once mm_users has gone to 0, nothing raises it for an instant.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swapfile.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index fbceed67a07..f2824c3c31b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -772,10 +772,8 @@ static int try_to_unuse(unsigned int type)
 			while (*swap_map > 1 && !retval &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
-				if (atomic_inc_return(&mm->mm_users) == 1) {
-					atomic_dec(&mm->mm_users);
+				if (!atomic_inc_not_zero(&mm->mm_users))
 					continue;
-				}
 				spin_unlock(&mmlist_lock);
 				mmput(prev_mm);
 				prev_mm = mm;
-- 
cgit v1.2.3-70-g09d2


From 668e0d8f1a02fd75f1c1e8142a6b08455914242c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Fri, 23 Jun 2006 02:03:45 -0700
Subject: [PATCH] fix update_mmu_cache in fremap.c

There are two calls to update_mmu_cache in fremap.c, both defective.
The one in install_page needs to be accompanied by lazy_mmu_prot_update
(some other cleanup time, move that into ia64 update_mmu_cache itself); and
the one in install_file_pte should be removed since the pte is not present.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/fremap.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/fremap.c b/mm/fremap.c
index 9f381e58bf4..21b7d0cbc98 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_add_file_rmap(page);
 	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
+	lazy_mmu_prot_update(pte_val);
 	err = 0;
 unlock:
 	pte_unmap_unlock(pte, ptl);
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	pte_val = *pte;
-	update_mmu_cache(vma, addr, pte_val);
+	/*
+	 * We don't need to run update_mmu_cache() here because the "file pte"
+	 * being installed by install_file_pte() is not a real pte - it's a
+	 * non-present entry (like a swap entry), noting what file offset should
+	 * be mapped there when there's a fault (in a non-linear vma where
+	 * that's not obvious).
+	 */
 	pte_unmap_unlock(pte, ptl);
 	err = 0;
 out:
-- 
cgit v1.2.3-70-g09d2


From e0a42726794f71336ff4b26084d453dd597471ce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 23 Jun 2006 02:03:46 -0700
Subject: [PATCH] mm/slab.c: fix early init assumption

The SLAB bootstrap code assumes that the first two kmalloc caches created
(the INDEX_AC and INDEX_L3 kmalloc caches) wont be off-slab.  But due to AC
and L3 structure size increase in lockdep, one of them ended up being
off-slab, and subsequently crashing with:

Unable to handle kernel NULL pointer dereference at 0000000000000000 RIP:
 [<ffffffff80267478>] kmem_cache_alloc+0x26/0x7d

The fix is to introduce a bootstrap flag and to use it to prevent off-slab
caches being created so early during bootup.

(The calculation for off-slab caches is quite complex so i didnt want to
complicate things with introducing yet another INDEX_ calculation, the flag
approach is simpler and smaller.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index e3260db04b9..664c3a10acf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -331,6 +331,8 @@ static __always_inline int index_of(const size_t size)
 	return 0;
 }
 
+static int slab_early_init = 1;
+
 #define INDEX_AC index_of(sizeof(struct arraycache_init))
 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
 
@@ -1376,6 +1378,8 @@ void __init kmem_cache_init(void)
 				NULL, NULL);
 	}
 
+	slab_early_init = 0;
+
 	while (sizes->cs_size != ULONG_MAX) {
 		/*
 		 * For performance, all the general caches are L1 aligned.
@@ -2106,8 +2110,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 #endif
 #endif
 
-	/* Determine if the slab management is 'on' or 'off' slab. */
-	if (size >= (PAGE_SIZE >> 3))
+	/*
+	 * Determine if the slab management is 'on' or 'off' slab.
+	 * (bootstrapping cannot cope with offslab caches so don't do
+	 * it too early on.)
+	 */
+	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
 		/*
 		 * Size is large, assume best to place the slab management obj
 		 * off-slab (should allow better packing of objs).
-- 
cgit v1.2.3-70-g09d2


From bd1e22b8e0a90f9a91e4c27db14ca15773659bf7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 23 Jun 2006 02:03:47 -0700
Subject: [PATCH] initialise total_memory() earlier

Initialise total_memory earlier in boot.  Because if for some reason we run
page reclaim early in boot, we don't want total_memory to be zero when we use
it as a divisor.

And rename total_memory to vm_total_pages to avoid naming clashes with
architectures.

Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Martin Bligh <mbligh@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 1 +
 mm/page_alloc.c      | 6 +++---
 mm/vmscan.c          | 5 ++---
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f1a827a972e..dc3f3aa0c83 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -185,6 +185,7 @@ extern unsigned long try_to_free_pages(struct zone **, gfp_t);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
+extern long vm_total_pages;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5af33186a25..71a0b2a23f5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1725,9 +1725,9 @@ void __meminit build_all_zonelists(void)
 		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
 		/* cpuset refresh routine should be here */
 	}
-
-	printk("Built %i zonelists\n", num_online_nodes());
-
+	vm_total_pages = nr_free_pagecache_pages();
+	printk("Built %i zonelists.  Total pages: %ld\n",
+			num_online_nodes(), vm_total_pages);
 }
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71a02e29503..72babac71de 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -110,7 +110,7 @@ struct shrinker {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -743,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * how much memory
 		 * is mapped.
 		 */
-		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+		mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
 
 		/*
 		 * Now decide how much we really want to unmap some pages.  The
@@ -1482,7 +1482,6 @@ static int __init kswapd_init(void)
 		pgdat->kswapd = find_task_by_pid(pid);
 		read_unlock(&tasklist_lock);
 	}
-	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 5a4d43615921575b0c8299a5407ce4836e4138fd Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:47 -0700
Subject: [PATCH] update vm_total_pages at memory hotadd

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1b1ac3db218..841a077d5ae 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -160,6 +160,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 
 	if (need_zonelists_rebuild)
 		build_all_zonelists();
-
+	vm_total_pages = nr_free_pagecache_pages();
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 800590f523bf3bde9fa6c8e4d6763e4bf6a2c8ec Mon Sep 17 00:00:00 2001
From: Paul Drynoff <pauldrynoff@gmail.com>
Date: Fri, 23 Jun 2006 02:03:48 -0700
Subject: [PATCH] slab: kmalloc, kzalloc comments cleanup and fix

- Move comments for kmalloc to right place, currently it near __do_kmalloc

- Comments for kzalloc

- More detailed comments for kmalloc

- Appearance of "kmalloc" and "kzalloc" man pages after "make mandocs"

[rdunlap@xenotime.net: simplification]
Signed-off-by: Paul Drynoff <pauldrynoff@gmail.com>
Acked-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/DocBook/kernel-api.tmpl |  1 +
 include/linux/slab.h                  | 50 +++++++++++++++++++++++++++++++++++
 mm/slab.c                             | 20 ++------------
 3 files changed, 53 insertions(+), 18 deletions(-)

(limited to 'mm')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index ca02e04a906..6dab3dd3699 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -117,6 +117,7 @@ X!Ilib/string.c
   <chapter id="mm">
      <title>Memory Management in Linux</title>
      <sect1><title>The Slab Cache</title>
+!Iinclude/linux/slab.h
 !Emm/slab.c
      </sect1>
      <sect1><title>User Space Memory Access</title>
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9dc93163e06..45ad55b70d1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -86,6 +86,51 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
     __kmalloc_track_caller(size, flags, __builtin_return_address(0))
 #endif
 
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * kmalloc is the normal method of allocating memory
+ * in the kernel.
+ *
+ * The @flags argument may be one of:
+ *
+ * %GFP_USER - Allocate memory on behalf of user.  May sleep.
+ *
+ * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
+ *
+ * %GFP_ATOMIC - Allocation will not sleep.
+ *   For example, use this inside interrupt handlers.
+ *
+ * %GFP_HIGHUSER - Allocate pages from high memory.
+ *
+ * %GFP_NOIO - Do not do any I/O at all while trying to get memory.
+ *
+ * %GFP_NOFS - Do not make any fs calls while trying to get memory.
+ *
+ * Also it is possible to set different flags by OR'ing
+ * in one or more of the following additional @flags:
+ *
+ * %__GFP_COLD - Request cache-cold pages instead of
+ *   trying to return cache-warm pages.
+ *
+ * %__GFP_DMA - Request memory from the DMA-capable zone.
+ *
+ * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
+ *
+ * %__GFP_HIGHMEM - Allocated memory may be from highmem.
+ *
+ * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
+ *   (think twice before using).
+ *
+ * %__GFP_NORETRY - If memory is not immediately available,
+ *   then give up at once.
+ *
+ * %__GFP_NOWARN - If allocation fails, don't issue any warnings.
+ *
+ * %__GFP_REPEAT - If allocation fails initially, try once more before failing.
+ */
 static inline void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
@@ -111,6 +156,11 @@ found:
 
 extern void *__kzalloc(size_t, gfp_t);
 
+/**
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
 static inline void *kzalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
diff --git a/mm/slab.c b/mm/slab.c
index 664c3a10acf..98ac20bc0de 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3277,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node);
 #endif
 
 /**
- * kmalloc - allocate memory
+ * __do_kmalloc - allocate memory
  * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
+ * @flags: the type of memory to allocate (see kmalloc).
  * @caller: function caller for debug tracking of the caller
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
- *
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user.  May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
- *
- * Additionally, the %GFP_DMA flag may be set to indicate the memory
- * must be suitable for DMA.  This can mean different things on different
- * platforms.  For example, on i386, it means that the memory must come
- * from the first 16MB.
  */
 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 					  void *caller)
-- 
cgit v1.2.3-70-g09d2


From 485bb99b49a173a22a0bbf4e189465414947ecac Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Fri, 23 Jun 2006 02:03:49 -0700
Subject: [PATCH] kernel-doc for mm/filemap.c

mm/filemap.c:
- add lots of kernel-doc;
- fix some typos and kernel-doc errors;
- drop some blank lines between function close and EXPORT_SYMBOL();

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 173 +++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 120 insertions(+), 53 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 3342067ca43..368678c2d53 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -171,15 +171,17 @@ static int sync_page(void *word)
 }
 
 /**
- * filemap_fdatawrite_range - start writeback against all of a mapping's
- * dirty pages that lie within the byte offsets <start, end>
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
  * @start:	offset in bytes where the range starts
  * @end:	offset in bytes where the range ends (inclusive)
  * @sync_mode:	enable synchronous operation
  *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory * cleansing writeback.  The difference between
+ * opposed to a regular memory cleansing writeback.  The difference between
  * these two operations is that if a dirty page/buffer is encountered, it must
  * be waited upon, and not just skipped over.
  */
@@ -219,7 +221,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
 
-/*
+/**
+ * filemap_flush - mostly a non-blocking flush
+ * @mapping:	target address_space
+ *
  * This is a mostly non-blocking flush.  Not suitable for data-integrity
  * purposes - I/O may not be started against all dirty pages.
  */
@@ -229,7 +234,12 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
-/*
+/**
+ * wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping:	target address_space
+ * @start:	beginning page index
+ * @end:	ending page index
+ *
  * Wait for writeback to complete against pages indexed by start->end
  * inclusive
  */
@@ -276,7 +286,13 @@ int wait_on_page_writeback_range(struct address_space *mapping,
 	return ret;
 }
 
-/*
+/**
+ * sync_page_range - write and wait on all pages in the passed range
+ * @inode:	target inode
+ * @mapping:	target address_space
+ * @pos:	beginning offset in pages to write
+ * @count:	number of bytes to write
+ *
  * Write and wait upon all the pages in the passed range.  This is a "data
  * integrity" operation.  It waits upon in-flight writeout before starting and
  * waiting upon new writeout.  If there was an IO error, return it.
@@ -305,7 +321,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 }
 EXPORT_SYMBOL(sync_page_range);
 
-/*
+/**
+ * sync_page_range_nolock
+ * @inode:	target inode
+ * @mapping:	target address_space
+ * @pos:	beginning offset in pages to write
+ * @count:	number of bytes to write
+ *
  * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
  * as it forces O_SYNC writers to different parts of the same file
  * to be serialised right until io completion.
@@ -329,10 +351,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 EXPORT_SYMBOL(sync_page_range_nolock);
 
 /**
- * filemap_fdatawait - walk the list of under-writeback pages of the given
- *     address space and wait for all of them.
- *
+ * filemap_fdatawait - wait for all under-writeback pages to complete
  * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.
  */
 int filemap_fdatawait(struct address_space *mapping)
 {
@@ -368,7 +391,12 @@ int filemap_write_and_wait(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_write_and_wait);
 
-/*
+/**
+ * filemap_write_and_wait_range - write out & wait on a file range
+ * @mapping:	the address_space for the pages
+ * @lstart:	offset in bytes where the range starts
+ * @lend:	offset in bytes where the range ends (inclusive)
+ *
  * Write out and wait upon file offsets lstart->lend, inclusive.
  *
  * Note that `lend' is inclusive (describes the last byte to be written) so
@@ -394,8 +422,14 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 	return err;
 }
 
-/*
- * This function is used to add newly allocated pagecache pages:
+/**
+ * add_to_page_cache - add newly allocated pagecache pages
+ * @page:	page to add
+ * @mapping:	the page's address_space
+ * @offset:	page index
+ * @gfp_mask:	page allocation mode
+ *
+ * This function is used to add newly allocated pagecache pages;
  * the page is new, so we can just run SetPageLocked() against it.
  * The other page state flags were set by rmqueue().
  *
@@ -422,7 +456,6 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 	}
 	return error;
 }
-
 EXPORT_SYMBOL(add_to_page_cache);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
@@ -489,8 +522,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr)
 EXPORT_SYMBOL(wait_on_page_bit);
 
 /**
- * unlock_page() - unlock a locked page
- *
+ * unlock_page - unlock a locked page
  * @page: the page
  *
  * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
@@ -513,8 +545,9 @@ void fastcall unlock_page(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page);
 
-/*
- * End writeback against a page.
+/**
+ * end_page_writeback - end writeback against a page
+ * @page: the page
  */
 void end_page_writeback(struct page *page)
 {
@@ -527,10 +560,11 @@ void end_page_writeback(struct page *page)
 }
 EXPORT_SYMBOL(end_page_writeback);
 
-/*
- * Get a lock on the page, assuming we need to sleep to get it.
+/**
+ * __lock_page - get a lock on the page, assuming we need to sleep to get it
+ * @page: the page to lock
  *
- * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
+ * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
  * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
  * chances are that on the second loop, the block layer's plug list is empty,
  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
@@ -544,8 +578,12 @@ void fastcall __lock_page(struct page *page)
 }
 EXPORT_SYMBOL(__lock_page);
 
-/*
- * a rather lightweight function, finding and getting a reference to a
+/**
+ * find_get_page - find and get a page reference
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * A rather lightweight function, finding and getting a reference to a
  * hashed page atomically.
  */
 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
@@ -559,11 +597,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 	read_unlock_irq(&mapping->tree_lock);
 	return page;
 }
-
 EXPORT_SYMBOL(find_get_page);
 
-/*
- * Same as above, but trylock it instead of incrementing the count.
+/**
+ * find_trylock_page - find and lock a page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ *
+ * Same as find_get_page(), but trylock it instead of incrementing the count.
  */
 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
 {
@@ -576,12 +617,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
 	read_unlock_irq(&mapping->tree_lock);
 	return page;
 }
-
 EXPORT_SYMBOL(find_trylock_page);
 
 /**
  * find_lock_page - locate, pin and lock a pagecache page
- *
  * @mapping: the address_space to search
  * @offset: the page index
  *
@@ -617,12 +656,10 @@ repeat:
 	read_unlock_irq(&mapping->tree_lock);
 	return page;
 }
-
 EXPORT_SYMBOL(find_lock_page);
 
 /**
  * find_or_create_page - locate or add a pagecache page
- *
  * @mapping: the page's address_space
  * @index: the page's index into the mapping
  * @gfp_mask: page allocation mode
@@ -663,7 +700,6 @@ repeat:
 		page_cache_release(cached_page);
 	return page;
 }
-
 EXPORT_SYMBOL(find_or_create_page);
 
 /**
@@ -729,9 +765,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 	return i;
 }
 
-/*
+/**
+ * find_get_pages_tag - find and return pages that match @tag
+ * @mapping:	the address_space to search
+ * @index:	the starting page index
+ * @tag:	the tag index
+ * @nr_pages:	the maximum number of pages
+ * @pages:	where the resulting pages are placed
+ *
  * Like find_get_pages, except we only return pages which are tagged with
- * `tag'.   We update *index to index the next page for the traversal.
+ * @tag.   We update @index to index the next page for the traversal.
  */
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages)
@@ -750,7 +793,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 	return ret;
 }
 
-/*
+/**
+ * grab_cache_page_nowait - returns locked page at given index in given cache
+ * @mapping: target address_space
+ * @index: the page index
+ *
  * Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed.  This routine should
@@ -779,19 +826,25 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 	}
 	return page;
 }
-
 EXPORT_SYMBOL(grab_cache_page_nowait);
 
-/*
+/**
+ * do_generic_mapping_read - generic file read routine
+ * @mapping:	address_space to be read
+ * @_ra:	file's readahead state
+ * @filp:	the file to read
+ * @ppos:	current file position
+ * @desc:	read_descriptor
+ * @actor:	read method
+ *
  * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level
- * stuff.
+ * mapping->a_ops->readpage() function for the actual low-level stuff.
  *
  * This is really ugly. But the goto's actually try to clarify some
  * of the logic when it comes to error handling etc.
  *
- * Note the struct file* is only passed for the use of readpage.  It may be
- * NULL.
+ * Note the struct file* is only passed for the use of readpage.
+ * It may be NULL.
  */
 void do_generic_mapping_read(struct address_space *mapping,
 			     struct file_ra_state *_ra,
@@ -1004,7 +1057,6 @@ out:
 	if (filp)
 		file_accessed(filp);
 }
-
 EXPORT_SYMBOL(do_generic_mapping_read);
 
 int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -1045,7 +1097,13 @@ success:
 	return size;
 }
 
-/*
+/**
+ * __generic_file_aio_read - generic filesystem read routine
+ * @iocb:	kernel I/O control block
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec
+ * @ppos:	current file position
+ *
  * This is the "read()" routine for all filesystems
  * that can use the page cache directly.
  */
@@ -1124,7 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 out:
 	return retval;
 }
-
 EXPORT_SYMBOL(__generic_file_aio_read);
 
 ssize_t
@@ -1135,7 +1192,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t
 	BUG_ON(iocb->ki_pos != pos);
 	return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
 }
-
 EXPORT_SYMBOL(generic_file_aio_read);
 
 ssize_t
@@ -1151,7 +1207,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo
 		ret = wait_on_sync_kiocb(&kiocb);
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_read);
 
 int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
@@ -1192,7 +1247,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
 		return desc.written;
 	return desc.error;
 }
-
 EXPORT_SYMBOL(generic_file_sendfile);
 
 static ssize_t
@@ -1228,11 +1282,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
 }
 
 #ifdef CONFIG_MMU
-/*
+static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
+/**
+ * page_cache_read - adds requested page to the page cache if not already there
+ * @file:	file to read
+ * @offset:	page index
+ *
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
 static int fastcall page_cache_read(struct file * file, unsigned long offset)
 {
 	struct address_space *mapping = file->f_mapping;
@@ -1259,7 +1317,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
 
 #define MMAP_LOTSAMISS  (100)
 
-/*
+/**
+ * filemap_nopage - read in file data for page fault handling
+ * @area:	the applicable vm_area
+ * @address:	target address to read in
+ * @type:	returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
+ *
  * filemap_nopage() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
@@ -1462,7 +1525,6 @@ page_not_uptodate:
 	page_cache_release(page);
 	return NULL;
 }
-
 EXPORT_SYMBOL(filemap_nopage);
 
 static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
@@ -1716,7 +1778,13 @@ repeat:
 	return page;
 }
 
-/*
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @filler:	function to perform the read
+ * @data:	destination for read data
+ *
  * Read into the page cache. If a page already exists,
  * and PageUptodate() is not set, try to fill the page.
  */
@@ -1754,7 +1822,6 @@ retry:
  out:
 	return page;
 }
-
 EXPORT_SYMBOL(read_cache_page);
 
 /*
@@ -1854,7 +1921,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
 /*
  * Performs necessary checks before doing a write
  *
- * Can adjust writing position aor amount of bytes to write.
+ * Can adjust writing position or amount of bytes to write.
  * Returns appropriate error code that caller should return or
  * zero in case that write should be allowed.
  */
-- 
cgit v1.2.3-70-g09d2


From 8f9de51a4a98ba32f839903b7d009788bc2c295d Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Fri, 23 Jun 2006 02:03:50 -0700
Subject: [PATCH] printk() should not be called under zone->lock

This patch fixes printk() under zone->lock in show_free_areas().  It can be
unsafe to call printk() under this lock, since caller can try to
allocate/free some memory and selfdeadlock on this lock.  I found
allocations/freeing mem both in netconsole and serial console.

This issue was faced in reallity when meminfo was periodically printed for
debug purposes and netconsole was used.

Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 71a0b2a23f5..423db0db7c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1491,7 +1491,7 @@ void show_free_areas(void)
 	}
 
 	for_each_zone(zone) {
- 		unsigned long nr, flags, order, total = 0;
+ 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
 		show_node(zone);
 		printk("%s: ", zone->name);
@@ -1502,11 +1502,12 @@ void show_free_areas(void)
 
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < MAX_ORDER; order++) {
-			nr = zone->free_area[order].nr_free;
-			total += nr << order;
-			printk("%lu*%lukB ", nr, K(1UL) << order);
+			nr[order] = zone->free_area[order].nr_free;
+			total += nr[order] << order;
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++)
+			printk("%lu*%lukB ", nr[order], K(1UL) << order);
 		printk("= %lukB\n", K(total));
 	}
 
-- 
cgit v1.2.3-70-g09d2


From e24f0b8f76cc3dd96f36f5b6a9f020f6c3fce198 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:51 -0700
Subject: [PATCH] page migration: simplify migrate_pages()

Currently migrate_pages() is mess with lots of goto.  Extract two functions
from migrate_pages() and get rid of the gotos.

Plus we can just unconditionally set the locked bit on the new page since we
are the only one holding a reference.  Locking is to stop others from
accessing the page once we establish references to the new page.

Remove the list_del from move_to_lru in order to have finer control over list
processing.

[akpm@osdl.org: add debug check]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 218 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 115 insertions(+), 103 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index b5000d46389..09038163bfe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -84,7 +84,6 @@ int migrate_prep(void)
 
 static inline void move_to_lru(struct page *page)
 {
-	list_del(&page->lru);
 	if (PageActive(page)) {
 		/*
 		 * lru_cache_add_active checks that
@@ -110,6 +109,7 @@ int putback_lru_pages(struct list_head *l)
 	int count = 0;
 
 	list_for_each_entry_safe(page, page2, l, lru) {
+		list_del(&page->lru);
 		move_to_lru(page);
 		count++;
 	}
@@ -533,12 +533,109 @@ static int fallback_migrate_page(struct address_space *mapping,
 	return migrate_page(mapping, newpage, page);
 }
 
+/*
+ * Move a page to a newly allocated page
+ * The page is locked and all ptes have been successfully removed.
+ *
+ * The new page will have replaced the old page if this function
+ * is successful.
+ */
+static int move_to_new_page(struct page *newpage, struct page *page)
+{
+	struct address_space *mapping;
+	int rc;
+
+	/*
+	 * Block others from accessing the page when we get around to
+	 * establishing additional references. We are the only one
+	 * holding a reference to the new page at this point.
+	 */
+	if (TestSetPageLocked(newpage))
+		BUG();
+
+	/* Prepare mapping for the new page.*/
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+
+	mapping = page_mapping(page);
+	if (!mapping)
+		rc = migrate_page(mapping, newpage, page);
+	else if (mapping->a_ops->migratepage)
+		/*
+		 * Most pages have a mapping and most filesystems
+		 * should provide a migration function. Anonymous
+		 * pages are part of swap space which also has its
+		 * own migration function. This is the most common
+		 * path for page migration.
+		 */
+		rc = mapping->a_ops->migratepage(mapping,
+						newpage, page);
+	else
+		rc = fallback_migrate_page(mapping, newpage, page);
+
+	if (!rc)
+		remove_migration_ptes(page, newpage);
+	else
+		newpage->mapping = NULL;
+
+	unlock_page(newpage);
+
+	return rc;
+}
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(struct page *newpage, struct page *page, int force)
+{
+	int rc = 0;
+
+	if (page_count(page) == 1)
+		/* page was freed from under us. So we are done. */
+		goto ret;
+
+	rc = -EAGAIN;
+	if (TestSetPageLocked(page)) {
+		if (!force)
+			goto ret;
+		lock_page(page);
+	}
+
+	if (PageWriteback(page)) {
+		if (!force)
+			goto unlock;
+		wait_on_page_writeback(page);
+	}
+
+	/*
+	 * Establish migration ptes or remove ptes
+	 */
+	if (try_to_unmap(page, 1) != SWAP_FAIL) {
+		if (!page_mapped(page))
+			rc = move_to_new_page(newpage, page);
+	} else
+		/* A vma has VM_LOCKED set -> permanent failure */
+		rc = -EPERM;
+
+	if (rc)
+		remove_migration_ptes(page, page);
+unlock:
+	unlock_page(page);
+ret:
+	if (rc != -EAGAIN) {
+		list_del(&newpage->lru);
+		move_to_lru(newpage);
+	}
+	return rc;
+}
+
 /*
  * migrate_pages
  *
  * Two lists are passed to this function. The first list
  * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the pages isolated
+ * The second list contains new pages that the isolated pages
  * can be moved to.
  *
  * The function returns after 10 attempts or if no pages
@@ -550,7 +647,7 @@ static int fallback_migrate_page(struct address_space *mapping,
 int migrate_pages(struct list_head *from, struct list_head *to,
 		  struct list_head *moved, struct list_head *failed)
 {
-	int retry;
+	int retry = 1;
 	int nr_failed = 0;
 	int pass = 0;
 	struct page *page;
@@ -561,118 +658,33 @@ int migrate_pages(struct list_head *from, struct list_head *to,
 	if (!swapwrite)
 		current->flags |= PF_SWAPWRITE;
 
-redo:
-	retry = 0;
-
-	list_for_each_entry_safe(page, page2, from, lru) {
-		struct page *newpage = NULL;
-		struct address_space *mapping;
-
-		cond_resched();
-
-		rc = 0;
-		if (page_count(page) == 1)
-			/* page was freed from under us. So we are done. */
-			goto next;
-
-		if (to && list_empty(to))
-			break;
-
-		/*
-		 * Skip locked pages during the first two passes to give the
-		 * functions holding the lock time to release the page. Later we
-		 * use lock_page() to have a higher chance of acquiring the
-		 * lock.
-		 */
-		rc = -EAGAIN;
-		if (pass > 2)
-			lock_page(page);
-		else
-			if (TestSetPageLocked(page))
-				goto next;
-
-		/*
-		 * Only wait on writeback if we have already done a pass where
-		 * we we may have triggered writeouts for lots of pages.
-		 */
-		if (pass > 0)
-			wait_on_page_writeback(page);
-		else
-			if (PageWriteback(page))
-				goto unlock_page;
-
-		/*
-		 * Establish migration ptes or remove ptes
-		 */
-		rc = -EPERM;
-		if (try_to_unmap(page, 1) == SWAP_FAIL)
-			/* A vma has VM_LOCKED set -> permanent failure */
-			goto unlock_page;
+	for(pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
 
-		rc = -EAGAIN;
-		if (page_mapped(page))
-			goto unlock_page;
+		list_for_each_entry_safe(page, page2, from, lru) {
 
-		newpage = lru_to_page(to);
-		lock_page(newpage);
-		/* Prepare mapping for the new page.*/
-		newpage->index = page->index;
-		newpage->mapping = page->mapping;
+			if (list_empty(to))
+				break;
 
-		/*
-		 * Pages are properly locked and writeback is complete.
-		 * Try to migrate the page.
-		 */
-		mapping = page_mapping(page);
-		if (!mapping)
-			rc = migrate_page(mapping, newpage, page);
+			cond_resched();
 
-		else if (mapping->a_ops->migratepage)
-			/*
-			 * Most pages have a mapping and most filesystems
-			 * should provide a migration function. Anonymous
-			 * pages are part of swap space which also has its
-			 * own migration function. This is the most common
-			 * path for page migration.
-			 */
-			rc = mapping->a_ops->migratepage(mapping,
-							newpage, page);
-		else
-			rc = fallback_migrate_page(mapping, newpage, page);
-
-		if (!rc)
-			remove_migration_ptes(page, newpage);
-
-		unlock_page(newpage);
-
-unlock_page:
-		if (rc)
-			remove_migration_ptes(page, page);
+			rc = unmap_and_move(lru_to_page(to), page, pass > 2);
 
-		unlock_page(page);
-
-next:
-		if (rc) {
-			if (newpage)
-				newpage->mapping = NULL;
-
-			if (rc == -EAGAIN)
+			switch(rc) {
+			case -EAGAIN:
 				retry++;
-			else {
+				break;
+			case 0:
+				list_move(&page->lru, moved);
+				break;
+			default:
 				/* Permanent failure */
 				list_move(&page->lru, failed);
 				nr_failed++;
+				break;
 			}
-		} else {
-			if (newpage) {
-				/* Successful migration. Return page to LRU */
-				move_to_lru(newpage);
-			}
-			list_move(&page->lru, moved);
 		}
 	}
-	if (retry && pass++ < 10)
-		goto redo;
 
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
-- 
cgit v1.2.3-70-g09d2


From aaa994b300a172afafab47938804836b923e5ef7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:52 -0700
Subject: [PATCH] page migration: handle freeing of pages in migrate_pages()

Do not leave pages on the lists passed to migrate_pages().  Seems that we will
not need any postprocessing of pages.  This will simplify the handling of
pages by the callers of migrate_pages().

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h |  7 +++----
 mm/mempolicy.c          |  8 +-------
 mm/migrate.c            | 48 +++++++++++++++++++++++-------------------------
 3 files changed, 27 insertions(+), 36 deletions(-)

(limited to 'mm')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 287c47b5e5d..83af25949fa 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -8,8 +8,7 @@ extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
-extern int migrate_pages(struct list_head *l, struct list_head *t,
-		struct list_head *moved, struct list_head *failed);
+extern int migrate_pages(struct list_head *l, struct list_head *t);
 extern int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest);
 extern int fail_migrate_page(struct address_space *,
@@ -22,8 +21,8 @@ extern int migrate_prep(void);
 static inline int isolate_lru_page(struct page *p, struct list_head *list)
 					{ return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
-static inline int migrate_pages(struct list_head *l, struct list_head *t,
-	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
+static inline int migrate_pages(struct list_head *l, struct list_head *t)
+					{ return -ENOSYS; }
 
 static inline int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest) { return 0; }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8778f58880c..244f3f130e4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -603,11 +603,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
-	if (!list_empty(&pagelist)) {
+	if (!list_empty(&pagelist))
 		err = migrate_pages_to(&pagelist, NULL, dest);
-		if (!list_empty(&pagelist))
-			putback_lru_pages(&pagelist);
-	}
 	return err;
 }
 
@@ -773,9 +770,6 @@ long do_mbind(unsigned long start, unsigned long len,
 			err = -EIO;
 	}
 
-	if (!list_empty(&pagelist))
-		putback_lru_pages(&pagelist);
-
 	up_write(&mm->mmap_sem);
 	mpol_free(new);
 	return err;
diff --git a/mm/migrate.c b/mm/migrate.c
index 09038163bfe..d3a1810a4c9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -624,6 +624,15 @@ unlock:
 	unlock_page(page);
 ret:
 	if (rc != -EAGAIN) {
+ 		/*
+ 		 * A page that has been migrated has all references
+ 		 * removed and will be freed. A page that has not been
+ 		 * migrated will have kepts its references and be
+ 		 * restored.
+ 		 */
+ 		list_del(&page->lru);
+ 		move_to_lru(page);
+
 		list_del(&newpage->lru);
 		move_to_lru(newpage);
 	}
@@ -640,12 +649,12 @@ ret:
  *
  * The function returns after 10 attempts or if no pages
  * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
+ * or no retryable pages exist anymore. All pages will be
+ * retruned to the LRU or freed.
  *
- * Return: Number of pages not migrated when "to" ran empty.
+ * Return: Number of pages not migrated.
  */
-int migrate_pages(struct list_head *from, struct list_head *to,
-		  struct list_head *moved, struct list_head *failed)
+int migrate_pages(struct list_head *from, struct list_head *to)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -675,11 +684,9 @@ int migrate_pages(struct list_head *from, struct list_head *to,
 				retry++;
 				break;
 			case 0:
-				list_move(&page->lru, moved);
 				break;
 			default:
 				/* Permanent failure */
-				list_move(&page->lru, failed);
 				nr_failed++;
 				break;
 			}
@@ -689,6 +696,7 @@ int migrate_pages(struct list_head *from, struct list_head *to,
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
+	putback_lru_pages(from);
 	return nr_failed + retry;
 }
 
@@ -702,11 +710,10 @@ int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest)
 {
 	LIST_HEAD(newlist);
-	LIST_HEAD(moved);
-	LIST_HEAD(failed);
 	int err = 0;
 	unsigned long offset = 0;
 	int nr_pages;
+	int nr_failed = 0;
 	struct page *page;
 	struct list_head *p;
 
@@ -740,26 +747,17 @@ redo:
 		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
 	}
-	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+	err = migrate_pages(pagelist, &newlist);
 
-	putback_lru_pages(&moved);	/* Call release pages instead ?? */
-
-	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-		goto redo;
-out:
-	/* Return leftover allocated pages */
-	while (!list_empty(&newlist)) {
-		page = list_entry(newlist.next, struct page, lru);
-		list_del(&page->lru);
-		__free_page(page);
+	if (err >= 0) {
+		nr_failed += err;
+		if (list_empty(&newlist) && !list_empty(pagelist))
+			goto redo;
 	}
-	list_splice(&failed, pagelist);
-	if (err < 0)
-		return err;
+out:
 
 	/* Calculate number of leftover pages */
-	nr_pages = 0;
 	list_for_each(p, pagelist)
-		nr_pages++;
-	return nr_pages;
+		nr_failed++;
+	return nr_failed;
 }
-- 
cgit v1.2.3-70-g09d2


From 95a402c3847cc16f4ba03013cd01404fa0f14c2e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:53 -0700
Subject: [PATCH] page migration: use allocator function for migrate_pages()

Instead of passing a list of new pages, pass a function to allocate a new
page.  This allows the correct placement of MPOL_INTERLEAVE pages during page
migration.  It also further simplifies the callers of migrate pages.
migrate_pages() becomes similar to migrate_pages_to() so drop
migrate_pages_to().  The batching of new page allocations becomes unnecessary.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h |  11 ++---
 mm/mempolicy.c          |  23 +++++++++-
 mm/migrate.c            | 115 ++++++++++++++----------------------------------
 3 files changed, 59 insertions(+), 90 deletions(-)

(limited to 'mm')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 83af25949fa..5b95d6568dc 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -3,14 +3,15 @@
 
 #include <linux/mm.h>
 
+typedef struct page *new_page_t(struct page *, unsigned long private);
+
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
-extern int migrate_pages(struct list_head *l, struct list_head *t);
-extern int migrate_pages_to(struct list_head *pagelist,
-			struct vm_area_struct *vma, int dest);
+extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long);
+
 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
 
@@ -21,8 +22,8 @@ extern int migrate_prep(void);
 static inline int isolate_lru_page(struct page *p, struct list_head *list)
 					{ return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
-static inline int migrate_pages(struct list_head *l, struct list_head *t)
-					{ return -ENOSYS; }
+static inline int migrate_pages(struct list_head *l, new_page_t x,
+		unsigned long private) { return -ENOSYS; }
 
 static inline int migrate_pages_to(struct list_head *pagelist,
 			struct vm_area_struct *vma, int dest) { return 0; }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 244f3f130e4..f432642e9e6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -87,6 +87,7 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/migrate.h>
+#include <linux/rmap.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -587,6 +588,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 		isolate_lru_page(page, pagelist);
 }
 
+static struct page *new_node_page(struct page *page, unsigned long node)
+{
+	return alloc_pages_node(node, GFP_HIGHUSER, 0);
+}
+
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -604,7 +610,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist))
-		err = migrate_pages_to(&pagelist, NULL, dest);
+		err = migrate_pages(&pagelist, new_node_page, dest);
+
 	return err;
 }
 
@@ -691,6 +698,12 @@ int do_migrate_pages(struct mm_struct *mm,
 
 }
 
+static struct page *new_vma_page(struct page *page, unsigned long private)
+{
+	struct vm_area_struct *vma = (struct vm_area_struct *)private;
+
+	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+}
 #else
 
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -703,6 +716,11 @@ int do_migrate_pages(struct mm_struct *mm,
 {
 	return -ENOSYS;
 }
+
+static struct page *new_vma_page(struct page *page, unsigned long private)
+{
+	return NULL;
+}
 #endif
 
 long do_mbind(unsigned long start, unsigned long len,
@@ -764,7 +782,8 @@ long do_mbind(unsigned long start, unsigned long len,
 		err = mbind_range(vma, start, end, new);
 
 		if (!list_empty(&pagelist))
-			nr_failed = migrate_pages_to(&pagelist, vma, -1);
+			nr_failed = migrate_pages(&pagelist, new_vma_page,
+						(unsigned long)vma);
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
diff --git a/mm/migrate.c b/mm/migrate.c
index d3a1810a4c9..251a8d15825 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -28,9 +28,6 @@
 
 #include "internal.h"
 
-/* The maximum number of pages to take off the LRU for migration */
-#define MIGRATE_CHUNK_SIZE 256
-
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 /*
@@ -587,18 +584,23 @@ static int move_to_new_page(struct page *newpage, struct page *page)
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
  */
-static int unmap_and_move(struct page *newpage, struct page *page, int force)
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+			struct page *page, int force)
 {
 	int rc = 0;
+	struct page *newpage = get_new_page(page, private);
+
+	if (!newpage)
+		return -ENOMEM;
 
 	if (page_count(page) == 1)
 		/* page was freed from under us. So we are done. */
-		goto ret;
+		goto move_newpage;
 
 	rc = -EAGAIN;
 	if (TestSetPageLocked(page)) {
 		if (!force)
-			goto ret;
+			goto move_newpage;
 		lock_page(page);
 	}
 
@@ -622,7 +624,7 @@ static int unmap_and_move(struct page *newpage, struct page *page, int force)
 		remove_migration_ptes(page, page);
 unlock:
 	unlock_page(page);
-ret:
+
 	if (rc != -EAGAIN) {
  		/*
  		 * A page that has been migrated has all references
@@ -632,29 +634,33 @@ ret:
  		 */
  		list_del(&page->lru);
  		move_to_lru(page);
-
-		list_del(&newpage->lru);
-		move_to_lru(newpage);
 	}
+
+move_newpage:
+	/*
+	 * Move the new page to the LRU. If migration was not successful
+	 * then this will free the page.
+	 */
+	move_to_lru(newpage);
 	return rc;
 }
 
 /*
  * migrate_pages
  *
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the isolated pages
- * can be moved to.
+ * The function takes one list of pages to migrate and a function
+ * that determines from the page to be migrated and the private data
+ * the target of the move and allocates the page.
  *
  * The function returns after 10 attempts or if no pages
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore. All pages will be
  * retruned to the LRU or freed.
  *
- * Return: Number of pages not migrated.
+ * Return: Number of pages not migrated or error code.
  */
-int migrate_pages(struct list_head *from, struct list_head *to)
+int migrate_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -671,15 +677,14 @@ int migrate_pages(struct list_head *from, struct list_head *to)
 		retry = 0;
 
 		list_for_each_entry_safe(page, page2, from, lru) {
-
-			if (list_empty(to))
-				break;
-
 			cond_resched();
 
-			rc = unmap_and_move(lru_to_page(to), page, pass > 2);
+			rc = unmap_and_move(get_new_page, private,
+						page, pass > 2);
 
 			switch(rc) {
+			case -ENOMEM:
+				goto out;
 			case -EAGAIN:
 				retry++;
 				break;
@@ -692,72 +697,16 @@ int migrate_pages(struct list_head *from, struct list_head *to)
 			}
 		}
 	}
-
+	rc = 0;
+out:
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
 	putback_lru_pages(from);
-	return nr_failed + retry;
-}
 
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-int migrate_pages_to(struct list_head *pagelist,
-			struct vm_area_struct *vma, int dest)
-{
-	LIST_HEAD(newlist);
-	int err = 0;
-	unsigned long offset = 0;
-	int nr_pages;
-	int nr_failed = 0;
-	struct page *page;
-	struct list_head *p;
-
-redo:
-	nr_pages = 0;
-	list_for_each(p, pagelist) {
-		if (vma) {
-			/*
-			 * The address passed to alloc_page_vma is used to
-			 * generate the proper interleave behavior. We fake
-			 * the address here by an increasing offset in order
-			 * to get the proper distribution of pages.
-			 *
-			 * No decision has been made as to which page
-			 * a certain old page is moved to so we cannot
-			 * specify the correct address.
-			 */
-			page = alloc_page_vma(GFP_HIGHUSER, vma,
-					offset + vma->vm_start);
-			offset += PAGE_SIZE;
-		}
-		else
-			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
-		if (!page) {
-			err = -ENOMEM;
-			goto out;
-		}
-		list_add_tail(&page->lru, &newlist);
-		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE)
-			break;
-	}
-	err = migrate_pages(pagelist, &newlist);
-
-	if (err >= 0) {
-		nr_failed += err;
-		if (list_empty(&newlist) && !list_empty(pagelist))
-			goto redo;
-	}
-out:
+	if (rc)
+		return rc;
 
-	/* Calculate number of leftover pages */
-	list_for_each(p, pagelist)
-		nr_failed++;
-	return nr_failed;
+	return nr_failed + retry;
 }
+
-- 
cgit v1.2.3-70-g09d2


From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 23 Jun 2006 02:03:55 -0700
Subject: [PATCH] page migration: sys_move_pages(): support moving of
 individual pages

move_pages() is used to move individual pages of a process. The function can
be used to determine the location of pages and to move them onto the desired
node. move_pages() returns status information for each page.

long move_pages(pid, number_of_pages_to_move,
		addresses_of_pages[],
		nodes[] or NULL,
		status[],
		flags);

The addresses of pages is an array of void * pointing to the
pages to be moved.

The nodes array contains the node numbers that the pages should be moved
to. If a NULL is passed instead of an array then no pages are moved but
the status array is updated. The status request may be used to determine
the page state before issuing another move_pages() to move pages.

The status array will contain the state of all individual page migration
attempts when the function terminates. The status array is only valid if
move_pages() completed successfullly.

Possible page states in status[]:

0..MAX_NUMNODES	The page is now on the indicated node.

-ENOENT		Page is not present

-EACCES		Page is mapped by multiple processes and can only
		be moved if MPOL_MF_MOVE_ALL is specified.

-EPERM		The page has been mlocked by a process/driver and
		cannot be moved.

-EBUSY		Page is busy and cannot be moved. Try again later.

-EFAULT		Invalid address (no VMA or zero page).

-ENOMEM		Unable to allocate memory on target node.

-EIO		Unable to write back page. The page must be written
		back in order to move it since the page is dirty and the
		filesystem does not provide a migration function that
		would allow the moving of dirty pages.

-EINVAL		A dirty page cannot be moved. The filesystem does not provide
		a migration function and has no ability to write back pages.

The flags parameter indicates what types of pages to move:

MPOL_MF_MOVE	Move pages that are only mapped by the process.

MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes.
		Requires sufficient capabilities.

Possible return codes from move_pages()

-ENOENT		No pages found that would require moving. All pages
		are either already on the target node, not present, had an
		invalid address or could not be moved because they were
		mapped by multiple processes.

-EINVAL		Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt
		to migrate pages in a kernel thread.

-EPERM		MPOL_MF_MOVE_ALL specified without sufficient priviledges.
		or an attempt to move a process belonging to another user.

-EACCES		One of the target nodes is not allowed by the current cpuset.

-ENODEV		One of the target nodes is not online.

-ESRCH		Process does not exist.

-E2BIG		Too many pages to move.

-ENOMEM		Not enough memory to allocate control array.

-EFAULT		Parameters could not be accessed.

A test program for move_pages() may be found with the patches
on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3

From: Christoph Lameter <clameter@sgi.com>

  Detailed results for sys_move_pages()

  Pass a pointer to an integer to get_new_page() that may be used to
  indicate where the completion status of a migration operation should be
  placed.  This allows sys_move_pags() to report back exactly what happened to
  each page.

  Wish there would be a better way to do this. Looks a bit hacky.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Jes Sorensen <jes@trained-monkey.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/vm/page_migration |  29 ++---
 arch/ia64/kernel/entry.S        |   2 +-
 include/asm-ia64/unistd.h       |   2 +-
 include/linux/migrate.h         |   2 +-
 include/linux/syscalls.h        |   5 +
 kernel/sys_ni.c                 |   1 +
 mm/mempolicy.c                  |   4 +-
 mm/migrate.c                    | 268 +++++++++++++++++++++++++++++++++++++++-
 8 files changed, 288 insertions(+), 25 deletions(-)

(limited to 'mm')

diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index 0a5d5fb1885..99f89aa1016 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package.
 Manual migration is useful if for example the scheduler has relocated
 a process to a processor on a distant node. A batch scheduler or an
 administrator may detect the situation and move the pages of the process
-nearer to the new processor. At some point in the future we may have
-some mechanism in the scheduler that will automatically move the pages.
+nearer to the new processor. The kernel itself does only provide
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+A NUMA profiler may f.e. obtain a log showing frequent off node
+accesses and may use the result to move pages to more advantageous
+locations.
 
 Larger installations usually partition the system using cpusets into
 sections of nodes. Paul Jackson has equipped cpusets with the ability to
@@ -62,22 +67,14 @@ A. In kernel use of migrate_pages()
    It also prevents the swapper or other scans to encounter
    the page.
 
-2. Generate a list of newly allocates pages. These pages will contain the
-   contents of the pages from the first list after page migration is
-   complete.
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
 
 3. The migrate_pages() function is called which attempts
-   to do the migration. It returns the moved pages in the
-   list specified as the third parameter and the failed
-   migrations in the fourth parameter. When the function
-   returns the first list will contain the pages that could still be retried.
-
-4. The leftover pages of various types are returned
-   to the LRU using putback_to_lru_pages() or otherwise
-   disposed of. The pages will still have the refcount as
-   increased by isolate_lru_pages() if putback_to_lru_pages() is not
-   used! The kernel may want to handle the various cases of failures in
-   different ways.
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
 
 B. How migrate_pages() works
 ----------------------------
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index bcb80ca5cf4..32c999f58d1 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1584,7 +1584,7 @@ sys_call_table:
 	data8 sys_keyctl
 	data8 sys_ioprio_set
 	data8 sys_ioprio_get			// 1275
-	data8 sys_ni_syscall
+	data8 sys_move_pages
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 632f2eedf72..bb0eb727dcd 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -265,7 +265,7 @@
 #define __NR_keyctl			1273
 #define __NR_ioprio_set			1274
 #define __NR_ioprio_get			1275
-/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */
+#define __NR_move_pages			1276
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 5b95d6568dc..5dba23a1c0d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -3,7 +3,7 @@
 
 #include <linux/mm.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private);
+typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bd67a4413df..7e3f2349091 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 				const unsigned long __user *from,
 				const unsigned long __user *to);
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+				const void __user * __user *pages,
+				const int __user *nodes,
+				int __user *status,
+				int flags);
 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 				unsigned long mode,
 				unsigned long __user *nmask,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f..597229749de 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
 cond_syscall(sys_migrate_pages);
+cond_syscall(sys_move_pages);
 cond_syscall(sys_chown16);
 cond_syscall(sys_fchown16);
 cond_syscall(sys_getegid16);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f432642e9e6..05b84acf0bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 		isolate_lru_page(page, pagelist);
 }
 
-static struct page *new_node_page(struct page *page, unsigned long node)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 {
 	return alloc_pages_node(node, GFP_HIGHUSER, 0);
 }
@@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm,
 
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 {
 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 251a8d15825..033a12f4c94 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -25,6 +25,8 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/writeback.h>
+#include <linux/mempolicy.h>
+#include <linux/vmalloc.h>
 
 #include "internal.h"
 
@@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
 }
 
 /*
- * migrate_prep() needs to be called after we have compiled the list of pages
- * to be migrated using isolate_lru_page() but before we begin a series of calls
- * to migrate_pages().
+ * migrate_prep() needs to be called before we start compiling a list of pages
+ * to be migrated using isolate_lru_page().
  */
 int migrate_prep(void)
 {
@@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 			struct page *page, int force)
 {
 	int rc = 0;
-	struct page *newpage = get_new_page(page, private);
+	int *result = NULL;
+	struct page *newpage = get_new_page(page, private, &result);
 
 	if (!newpage)
 		return -ENOMEM;
@@ -642,6 +644,12 @@ move_newpage:
 	 * then this will free the page.
 	 */
 	move_to_lru(newpage);
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(newpage);
+	}
 	return rc;
 }
 
@@ -710,3 +718,255 @@ out:
 	return nr_failed + retry;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Move a list of individual pages
+ */
+struct page_to_node {
+	unsigned long addr;
+	struct page *page;
+	int node;
+	int status;
+};
+
+static struct page *new_page_node(struct page *p, unsigned long private,
+		int **result)
+{
+	struct page_to_node *pm = (struct page_to_node *)private;
+
+	while (pm->node != MAX_NUMNODES && pm->page != p)
+		pm++;
+
+	if (pm->node == MAX_NUMNODES)
+		return NULL;
+
+	*result = &pm->status;
+
+	return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+}
+
+/*
+ * Move a set of pages as indicated in the pm array. The addr
+ * field must be set to the virtual address of the page to be moved
+ * and the node number must contain a valid target node.
+ */
+static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
+				int migrate_all)
+{
+	int err;
+	struct page_to_node *pp;
+	LIST_HEAD(pagelist);
+
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * Build a list of pages to migrate
+	 */
+	migrate_prep();
+	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+
+		/*
+		 * A valid page pointer that will not match any of the
+		 * pages that will be moved.
+		 */
+		pp->page = ZERO_PAGE(0);
+
+		err = -EFAULT;
+		vma = find_vma(mm, pp->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pp->addr, FOLL_GET);
+		err = -ENOENT;
+		if (!page)
+			goto set_status;
+
+		if (PageReserved(page))		/* Check for zero page */
+			goto put_and_set;
+
+		pp->page = page;
+		err = page_to_nid(page);
+
+		if (err == pp->node)
+			/*
+			 * Node already in the right place
+			 */
+			goto put_and_set;
+
+		err = -EACCES;
+		if (page_mapcount(page) > 1 &&
+				!migrate_all)
+			goto put_and_set;
+
+		err = isolate_lru_page(page, &pagelist);
+put_and_set:
+		/*
+		 * Either remove the duplicate refcount from
+		 * isolate_lru_page() or drop the page ref if it was
+		 * not isolated.
+		 */
+		put_page(page);
+set_status:
+		pp->status = err;
+	}
+
+	if (!list_empty(&pagelist))
+		err = migrate_pages(&pagelist, new_page_node,
+				(unsigned long)pm);
+	else
+		err = -ENOENT;
+
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Determine the nodes of a list of pages. The addr in the pm array
+ * must have been set to the virtual address of which we want to determine
+ * the node number.
+ */
+static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
+{
+	down_read(&mm->mmap_sem);
+
+	for ( ; pm->node != MAX_NUMNODES; pm++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+		int err;
+
+		err = -EFAULT;
+		vma = find_vma(mm, pm->addr);
+		if (!vma)
+			goto set_status;
+
+		page = follow_page(vma, pm->addr, 0);
+		err = -ENOENT;
+		/* Use PageReserved to check for zero page */
+		if (!page || PageReserved(page))
+			goto set_status;
+
+		err = page_to_nid(page);
+set_status:
+		pm->status = err;
+	}
+
+	up_read(&mm->mmap_sem);
+	return 0;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
+			const void __user * __user *pages,
+			const int __user *nodes,
+			int __user *status, int flags)
+{
+	int err = 0;
+	int i;
+	struct task_struct *task;
+	nodemask_t task_nodes;
+	struct mm_struct *mm;
+	struct page_to_node *pm = NULL;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		err = -EPERM;
+		goto out2;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+
+	/* Limit nr_pages so that the multiplication may not overflow */
+	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
+		err = -E2BIG;
+		goto out2;
+	}
+
+	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
+	if (!pm) {
+		err = -ENOMEM;
+		goto out2;
+	}
+
+	/*
+	 * Get parameters from user space and initialize the pm
+	 * array. Return various errors if the user did something wrong.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		const void *p;
+
+		err = -EFAULT;
+		if (get_user(p, pages + i))
+			goto out;
+
+		pm[i].addr = (unsigned long)p;
+		if (nodes) {
+			int node;
+
+			if (get_user(node, nodes + i))
+				goto out;
+
+			err = -ENODEV;
+			if (!node_online(node))
+				goto out;
+
+			err = -EACCES;
+			if (!node_isset(node, task_nodes))
+				goto out;
+
+			pm[i].node = node;
+		}
+	}
+	/* End marker */
+	pm[nr_pages].node = MAX_NUMNODES;
+
+	if (nodes)
+		err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
+	else
+		err = do_pages_stat(mm, pm);
+
+	if (err >= 0)
+		/* Return status information */
+		for (i = 0; i < nr_pages; i++)
+			if (put_user(pm[i].status, status + i))
+				err = -EFAULT;
+
+out:
+	vfree(pm);
+out2:
+	mmput(mm);
+	return err;
+}
+#endif
+
-- 
cgit v1.2.3-70-g09d2


From 86c3a7645c05a7d06b72653aa4b2bea4e7229d1b Mon Sep 17 00:00:00 2001
From: David Quigley <dpquigl@tycho.nsa.gov>
Date: Fri, 23 Jun 2006 02:04:02 -0700
Subject: [PATCH] SELinux: add security_task_movememory calls to mm code

This patch inserts security_task_movememory hook calls into memory management
code to enable security modules to mediate this operation between tasks.

Since the last posting, the hook has been renamed following feedback from
Christoph Lameter.

Signed-off-by: David Quigley <dpquigl@tycho.nsa.gov>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
Cc: Andi Kleen <ak@muc.de>
Acked-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 5 +++++
 mm/migrate.c   | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 05b84acf0bb..ec4a1a950df 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -88,6 +88,7 @@
 #include <linux/proc_fs.h>
 #include <linux/migrate.h>
 #include <linux/rmap.h>
+#include <linux/security.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -942,6 +943,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		goto out;
 	}
 
+	err = security_task_movememory(task);
+	if (err)
+		goto out;
+
 	err = do_migrate_pages(mm, &old, &new,
 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
diff --git a/mm/migrate.c b/mm/migrate.c
index 033a12f4c94..1c2a71aa05c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -905,6 +906,11 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 		goto out2;
 	}
 
+ 	err = security_task_movememory(task);
+ 	if (err)
+ 		goto out2;
+
+
 	task_nodes = cpuset_mems_allowed(task);
 
 	/* Limit nr_pages so that the multiplication may not overflow */
-- 
cgit v1.2.3-70-g09d2


From c22ce143d15eb288543fe9873e1c5ac1c01b69a1 Mon Sep 17 00:00:00 2001
From: Hiro Yoshioka <hyoshiok@miraclelinux.com>
Date: Fri, 23 Jun 2006 02:04:16 -0700
Subject: [PATCH] x86: cache pollution aware __copy_from_user_ll()

Use the x86 cache-bypassing copy instructions for copy_from_user().

Some performance data are

Total of GLOBAL_POWER_EVENTS (CPU cycle samples)

2.6.12.4.orig    1921587
2.6.12.4.nt      1599424
1599424/1921587=83.23% (16.77% reduction)

BSQ_CACHE_REFERENCE (L3 cache miss)
2.6.12.4.orig      57427
2.6.12.4.nt        20858
20858/57427=36.32% (63.7% reduction)

L3 cache miss reduction of __copy_from_user_ll
samples  %
37408    65.1412  vmlinux                  __copy_from_user_ll
23        0.1103  vmlinux                  __copy_user_zeroing_intel_nocache
23/37408=0.061% (99.94% reduction)

Top 5 of 2.6.12.4.nt
Counted GLOBAL_POWER_EVENTS events (time during which processor is not stopped) with a unit mask of 0x01 (mandatory) count 100000
samples  %        app name                 symbol name
128392    8.0274  vmlinux                  __copy_user_zeroing_intel_nocache
64206     4.0143  vmlinux                  journal_add_journal_head
59746     3.7355  vmlinux                  do_get_write_access
47674     2.9807  vmlinux                  journal_put_journal_head
46021     2.8774  vmlinux                  journal_dirty_metadata
pattern9-0-cpu4-0-09011728/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x3f (multiple flags) count 3000
samples  %        app name                 symbol name
69755     4.2861  vmlinux                  __copy_user_zeroing_intel_nocache
55685     3.4215  vmlinux                  journal_add_journal_head
52371     3.2179  vmlinux                  __find_get_block
45504     2.7960  vmlinux                  journal_put_journal_head
36005     2.2123  vmlinux                  journal_stop
pattern9-0-cpu4-0-09011744/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x200 (read 3rd level cache miss) count 3000
samples  %        app name                 symbol name
1147      5.4994  vmlinux                  journal_add_journal_head
881       4.2240  vmlinux                  journal_dirty_data
872       4.1809  vmlinux                  blk_rq_map_sg
734       3.5192  vmlinux                  journal_commit_transaction
617       2.9582  vmlinux                  radix_tree_delete
pattern9-0-cpu4-0-09011731/summary.out

iozone results are

original 2.6.12.4 CPU time = 207.768 sec
cache aware       CPU time = 184.783 sec
(three times run)
184.783/207.768=88.94% (11.06% reduction)

original:
pattern9-0-cpu4-0-08191720/iozone.out:  CPU Utilization: Wall time   45.997    CPU time   64.527    CPU utilization 140.28 %
pattern9-0-cpu4-0-08191741/iozone.out:  CPU Utilization: Wall time   46.878    CPU time   71.933    CPU utilization 153.45 %
pattern9-0-cpu4-0-08191743/iozone.out:  CPU Utilization: Wall time   45.152    CPU time   71.308    CPU utilization 157.93 %

cache awre:
pattern9-0-cpu4-0-09011728/iozone.out:  CPU Utilization: Wall time   44.842    CPU time   62.465    CPU utilization 139.30 %
pattern9-0-cpu4-0-09011731/iozone.out:  CPU Utilization: Wall time   44.718    CPU time   59.273    CPU utilization 132.55 %
pattern9-0-cpu4-0-09011744/iozone.out:  CPU Utilization: Wall time   44.367    CPU time   63.045    CPU utilization 142.10 %

Signed-off-by: Hiro Yoshioka <hyoshiok@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/lib/usercopy.c   | 137 ++++++++++++++++++++++++++++++++++++++++++---
 include/asm-i386/uaccess.h |  33 +++++++++++
 include/linux/uaccess.h    |  22 ++++++++
 mm/filemap.c               |   4 +-
 mm/filemap.h               |   6 +-
 5 files changed, 189 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/uaccess.h

(limited to 'mm')

diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index 4cf981d70f4..6979297ce27 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -425,15 +425,121 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
 		       : "eax", "edx", "memory");
 	return size;
 }
+
+/*
+ * Non Temporal Hint version of __copy_user_zeroing_intel.  It is cache aware.
+ * hyoshiok@miraclelinux.com
+ */
+
+static unsigned long __copy_user_zeroing_intel_nocache(void *to,
+				const void __user *from, unsigned long size)
+{
+        int d0, d1;
+
+	__asm__ __volatile__(
+	       "        .align 2,0x90\n"
+	       "0:      movl 32(%4), %%eax\n"
+	       "        cmpl $67, %0\n"
+	       "        jbe 2f\n"
+	       "1:      movl 64(%4), %%eax\n"
+	       "        .align 2,0x90\n"
+	       "2:      movl 0(%4), %%eax\n"
+	       "21:     movl 4(%4), %%edx\n"
+	       "        movnti %%eax, 0(%3)\n"
+	       "        movnti %%edx, 4(%3)\n"
+	       "3:      movl 8(%4), %%eax\n"
+	       "31:     movl 12(%4),%%edx\n"
+	       "        movnti %%eax, 8(%3)\n"
+	       "        movnti %%edx, 12(%3)\n"
+	       "4:      movl 16(%4), %%eax\n"
+	       "41:     movl 20(%4), %%edx\n"
+	       "        movnti %%eax, 16(%3)\n"
+	       "        movnti %%edx, 20(%3)\n"
+	       "10:     movl 24(%4), %%eax\n"
+	       "51:     movl 28(%4), %%edx\n"
+	       "        movnti %%eax, 24(%3)\n"
+	       "        movnti %%edx, 28(%3)\n"
+	       "11:     movl 32(%4), %%eax\n"
+	       "61:     movl 36(%4), %%edx\n"
+	       "        movnti %%eax, 32(%3)\n"
+	       "        movnti %%edx, 36(%3)\n"
+	       "12:     movl 40(%4), %%eax\n"
+	       "71:     movl 44(%4), %%edx\n"
+	       "        movnti %%eax, 40(%3)\n"
+	       "        movnti %%edx, 44(%3)\n"
+	       "13:     movl 48(%4), %%eax\n"
+	       "81:     movl 52(%4), %%edx\n"
+	       "        movnti %%eax, 48(%3)\n"
+	       "        movnti %%edx, 52(%3)\n"
+	       "14:     movl 56(%4), %%eax\n"
+	       "91:     movl 60(%4), %%edx\n"
+	       "        movnti %%eax, 56(%3)\n"
+	       "        movnti %%edx, 60(%3)\n"
+	       "        addl $-64, %0\n"
+	       "        addl $64, %4\n"
+	       "        addl $64, %3\n"
+	       "        cmpl $63, %0\n"
+	       "        ja  0b\n"
+	       "        sfence \n"
+	       "5:      movl  %0, %%eax\n"
+	       "        shrl  $2, %0\n"
+	       "        andl $3, %%eax\n"
+	       "        cld\n"
+	       "6:      rep; movsl\n"
+	       "        movl %%eax,%0\n"
+	       "7:      rep; movsb\n"
+	       "8:\n"
+	       ".section .fixup,\"ax\"\n"
+	       "9:      lea 0(%%eax,%0,4),%0\n"
+	       "16:     pushl %0\n"
+	       "        pushl %%eax\n"
+	       "        xorl %%eax,%%eax\n"
+	       "        rep; stosb\n"
+	       "        popl %%eax\n"
+	       "        popl %0\n"
+	       "        jmp 8b\n"
+	       ".previous\n"
+	       ".section __ex_table,\"a\"\n"
+	       "	.align 4\n"
+	       "	.long 0b,16b\n"
+	       "	.long 1b,16b\n"
+	       "	.long 2b,16b\n"
+	       "	.long 21b,16b\n"
+	       "	.long 3b,16b\n"
+	       "	.long 31b,16b\n"
+	       "	.long 4b,16b\n"
+	       "	.long 41b,16b\n"
+	       "	.long 10b,16b\n"
+	       "	.long 51b,16b\n"
+	       "	.long 11b,16b\n"
+	       "	.long 61b,16b\n"
+	       "	.long 12b,16b\n"
+	       "	.long 71b,16b\n"
+	       "	.long 13b,16b\n"
+	       "	.long 81b,16b\n"
+	       "	.long 14b,16b\n"
+	       "	.long 91b,16b\n"
+	       "	.long 6b,9b\n"
+	       "        .long 7b,16b\n"
+	       ".previous"
+	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
+	       :  "1"(to), "2"(from), "0"(size)
+	       : "eax", "edx", "memory");
+	return size;
+}
+
 #else
+
 /*
  * Leave these declared but undefined.  They should not be any references to
  * them
  */
-unsigned long
-__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size);
-unsigned long
-__copy_user_intel(void __user *to, const void *from, unsigned long size);
+unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
+					unsigned long size);
+unsigned long __copy_user_intel(void __user *to, const void *from,
+					unsigned long size);
+unsigned long __copy_user_zeroing_intel_nocache(void *to,
+				const void __user *from, unsigned long size);
 #endif /* CONFIG_X86_INTEL_USERCOPY */
 
 /* Generic arbitrary sized copy.  */
@@ -515,8 +621,8 @@ do {									\
 		: "memory");						\
 } while (0)
 
-
-unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n)
+unsigned long __copy_to_user_ll(void __user *to, const void *from,
+				unsigned long n)
 {
 	BUG_ON((long) n < 0);
 #ifndef CONFIG_X86_WP_WORKS_OK
@@ -576,8 +682,8 @@ survive:
 }
 EXPORT_SYMBOL(__copy_to_user_ll);
 
-unsigned long
-__copy_from_user_ll(void *to, const void __user *from, unsigned long n)
+unsigned long __copy_from_user_ll(void *to, const void __user *from,
+					unsigned long n)
 {
 	BUG_ON((long)n < 0);
 	if (movsl_is_ok(to, from, n))
@@ -588,6 +694,21 @@ __copy_from_user_ll(void *to, const void __user *from, unsigned long n)
 }
 EXPORT_SYMBOL(__copy_from_user_ll);
 
+unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
+					unsigned long n)
+{
+	BUG_ON((long)n < 0);
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if ( n > 64 && cpu_has_xmm2)
+                n = __copy_user_zeroing_intel_nocache(to, from, n);
+	else
+		__copy_user_zeroing(to, from, n);
+#else
+        __copy_user_zeroing(to, from, n);
+#endif
+	return n;
+}
+
 /**
  * copy_to_user: - Copy a block of data into user space.
  * @to:   Destination address, in user space.
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 1ec65523ea5..82af28a943a 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -390,6 +390,8 @@ unsigned long __must_check __copy_to_user_ll(void __user *to,
 				const void *from, unsigned long n);
 unsigned long __must_check __copy_from_user_ll(void *to,
 				const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nocache(void *to,
+				const void __user *from, unsigned long n);
 
 /*
  * Here we special-case 1, 2 and 4-byte copy_*_user invocations.  On a fault
@@ -478,12 +480,43 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 	return __copy_from_user_ll(to, from, n);
 }
 
+#define ARCH_HAS_NOCACHE_UACCESS
+
+static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to,
+				const void __user *from, unsigned long n)
+{
+	if (__builtin_constant_p(n)) {
+		unsigned long ret;
+
+		switch (n) {
+		case 1:
+			__get_user_size(*(u8 *)to, from, 1, ret, 1);
+			return ret;
+		case 2:
+			__get_user_size(*(u16 *)to, from, 2, ret, 2);
+			return ret;
+		case 4:
+			__get_user_size(*(u32 *)to, from, 4, ret, 4);
+			return ret;
+		}
+	}
+	return __copy_from_user_ll_nocache(to, from, n);
+}
+
 static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        might_sleep();
        return __copy_from_user_inatomic(to, from, n);
 }
+
+static __always_inline unsigned long
+__copy_from_user_nocache(void *to, const void __user *from, unsigned long n)
+{
+       might_sleep();
+       return __copy_from_user_inatomic_nocache(to, from, n);
+}
+
 unsigned long __must_check copy_to_user(void __user *to,
 				const void *from, unsigned long n);
 unsigned long __must_check copy_from_user(void *to,
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
new file mode 100644
index 00000000000..391e7ed1eb3
--- /dev/null
+++ b/include/linux/uaccess.h
@@ -0,0 +1,22 @@
+#ifndef __LINUX_UACCESS_H__
+#define __LINUX_UACCESS_H__
+
+#include <asm/uaccess.h>
+
+#ifndef ARCH_HAS_NOCACHE_UACCESS
+
+static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
+				const void __user *from, unsigned long n)
+{
+	return __copy_from_user_inatomic(to, from, n);
+}
+
+static inline unsigned long __copy_from_user_nocache(void *to,
+				const void __user *from, unsigned long n)
+{
+	return __copy_from_user(to, from, n);
+}
+
+#endif		/* ARCH_HAS_NOCACHE_UACCESS */
+
+#endif		/* __LINUX_UACCESS_H__ */
diff --git a/mm/filemap.c b/mm/filemap.c
index 368678c2d53..807a463fd5e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
+#include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
@@ -38,7 +39,6 @@
  */
 #include <linux/buffer_head.h> /* for generic_osync_inode */
 
-#include <asm/uaccess.h>
 #include <asm/mman.h>
 
 static ssize_t
@@ -1902,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
 		int copy = min(bytes, iov->iov_len - base);
 
 		base = 0;
-		left = __copy_from_user_inatomic(vaddr, buf, copy);
+		left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
 		copied += copy;
 		bytes -= copy;
 		vaddr += copy;
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce1..5683cde2205 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,7 +13,7 @@
 #include <linux/highmem.h>
 #include <linux/uio.h>
 #include <linux/config.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 size_t
 __filemap_copy_from_user_iovec(char *vaddr,
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
 	int left;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+	left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
 
 	if (left != 0) {
 		/* Do it the slow way */
 		kaddr = kmap(page);
-		left = __copy_from_user(kaddr + offset, buf, bytes);
+		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
 		kunmap(page);
 	}
 	return bytes - left;
-- 
cgit v1.2.3-70-g09d2


From 090d2b185d8680fc26a2eaf4245d4171dcf4baf1 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:05:08 -0700
Subject: [PATCH] read_mapping_page for address space

Add read_mapping_page() which is used for callers that pass
mapping->a_ops->readpage as the filler for read_cache_page.  This removes
some duplication from filesystem code.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/afs/dir.c            |  4 +---
 fs/afs/mntpt.c          | 11 ++---------
 fs/cramfs/inode.c       |  4 +---
 fs/ext2/dir.c           |  3 +--
 fs/freevxfs/vxfs_subr.c |  3 +--
 fs/hfs/bnode.c          |  2 +-
 fs/hfs/btree.c          |  2 +-
 fs/hfsplus/bitmap.c     | 15 +++++++--------
 fs/hfsplus/bnode.c      |  2 +-
 fs/hfsplus/btree.c      |  2 +-
 fs/jfs/jfs_metapage.c   |  5 ++---
 fs/minix/dir.c          |  3 +--
 fs/namei.c              |  3 +--
 fs/ntfs/aops.h          |  3 +--
 fs/ntfs/attrib.c        |  6 ++----
 fs/ntfs/file.c          |  3 +--
 fs/ocfs2/symlink.c      |  3 +--
 fs/partitions/check.c   |  4 ++--
 fs/reiserfs/xattr.c     |  3 +--
 fs/sysv/dir.c           |  3 +--
 include/linux/pagemap.h |  7 +++++++
 mm/swapfile.c           |  3 +--
 22 files changed, 38 insertions(+), 56 deletions(-)

(limited to 'mm')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a6dff6a4f20..2fc99877cb0 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -185,9 +185,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
 
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_cache_page(dir->i_mapping,index,
-			       (filler_t *) dir->i_mapping->a_ops->readpage,
-			       NULL);
+	page = read_mapping_page(dir->i_mapping, index, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83..b5cf9e1205a 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -63,7 +63,6 @@ unsigned long afs_mntpt_expiry_timeout = 20;
 int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 {
 	struct page *page;
-	filler_t *filler;
 	size_t size;
 	char *buf;
 	int ret;
@@ -71,10 +70,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 	_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	filler = (filler_t *) AFS_VNODE_TO_I(vnode)->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
-			       filler, NULL);
+	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
@@ -160,7 +156,6 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	struct page *page = NULL;
 	size_t size;
 	char *buf, *devname = NULL, *options = NULL;
-	filler_t *filler;
 	int ret;
 
 	kenter("{%s}", mntpt->d_name.name);
@@ -182,9 +177,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 		goto error;
 
 	/* read the contents of the AFS special symlink */
-	filler = (filler_t *)mntpt->d_inode->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(mntpt->d_inode->i_mapping, 0, filler, NULL);
+	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto error;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 8a9d5d3b326..c45d7386080 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -181,9 +181,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_cache_page(mapping, blocknr + i,
-				(filler_t *)mapping->a_ops->readpage,
-				NULL);
+			page = read_mapping_page(mapping, blocknr + i, NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d672aa9f406..3c1c9aaaca6 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -159,8 +159,7 @@ fail:
 static struct page * ext2_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 50aae77651b..c1be118fc06 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -71,8 +71,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
 {
 	struct page *			pp;
 
-	pp = read_cache_page(mapping, n,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	pp = read_mapping_page(mapping, n, NULL);
 
 	if (!IS_ERR(pp)) {
 		wait_on_page_locked(pp);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 1e44dcfe49c..13231dd5ce6 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -280,7 +280,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; i++) {
-		page = read_cache_page(mapping, block++, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block++, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index d20131ce4b9..40035799431 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -59,7 +59,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	unlock_new_inode(tree->inode);
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 9fb51632303..d128a25b74d 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -31,8 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
-	page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	i = offset % 32;
@@ -72,8 +71,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		offset += PAGE_CACHE_BITS;
 		if (offset >= size)
 			break;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		curr = pptr = kmap(page);
 		if ((size ^ offset) / PAGE_CACHE_BITS)
 			end = pptr + PAGE_CACHE_BITS / 32;
@@ -119,8 +118,8 @@ found:
 		set_page_dirty(page);
 		kunmap(page);
 		offset += PAGE_CACHE_BITS;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
@@ -167,7 +166,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
-	page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	end = pptr + PAGE_CACHE_BITS / 32;
@@ -199,7 +198,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 			break;
 		set_page_dirty(page);
 		kunmap(page);
-		page = read_cache_page(mapping, ++pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, ++pnr, NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 746abc9ecf7..77bf434da67 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -440,7 +440,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; block++, i++) {
-		page = read_cache_page(mapping, block, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index effa8991999..cfc852fdd1b 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -38,7 +38,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2b220dd6b4e..7f6e8803970 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -632,10 +632,9 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 		}
 		SetPageUptodate(page);
 	} else {
-		page = read_cache_page(mapping, page_index,
-			    (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, page_index, NULL);
 		if (IS_ERR(page) || !PageUptodate(page)) {
-			jfs_err("read_cache_page failed!");
+			jfs_err("read_mapping_page failed!");
 			return NULL;
 		}
 		lock_page(page);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 69224d1fe04..2b0a389d198 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -60,8 +60,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/namei.c b/fs/namei.c
index 184fe4acf82..bb4a3e40e43 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2577,8 +2577,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage,
-				NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 3b74e66ca2f..325ce261a10 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -86,8 +86,7 @@ static inline void ntfs_unmap_page(struct page *page)
 static inline struct page *ntfs_map_page(struct address_space *mapping,
 		unsigned long index)
 {
-	struct page *page = read_cache_page(mapping, index,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, index, NULL);
 
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1663f5c3c6a..6708e1d68a9 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2529,8 +2529,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	end >>= PAGE_CACHE_SHIFT;
 	/* If there is a first partial page, need to do it the slow way. */
 	if (start_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read first partial "
 					"page (sync error, index 0x%lx).", idx);
@@ -2600,8 +2599,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	}
 	/* If there is a last partial page, need to do it the slow way. */
 	if (end_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read last partial page "
 					"(sync error, index 0x%lx).", idx);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 36e1e136bb0..88292f9e4b9 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -231,8 +231,7 @@ do_non_resident_extend:
 		 * Read the page.  If the page is not present, this will zero
 		 * the uninitialized regions for us.
 		 */
-		page = read_cache_page(mapping, index,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
 			goto init_err_out;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f6986bd79e7..0c8a1294ec9 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -64,8 +64,7 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8851b81e7c5..cd885b23cb5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -499,8 +499,8 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 	struct page *page;
 
-	page = read_cache_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-			(filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+				 NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		if (!PageUptodate(page))
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ffb79c48c5b..39fedaa88a0 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -452,8 +452,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 	/* We can deadlock if we try to free dentries,
 	   and an unlink/rmdir has just occured - GFP_NOFS avoids this */
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_cache_page(mapping, n,
-			       (filler_t *) mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index d7074341ee8..f2bef962d30 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -53,8 +53,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7a1af574ded..1245df7141a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -99,6 +99,13 @@ extern struct page * read_cache_page(struct address_space *mapping,
 extern int read_cache_pages(struct address_space *mapping,
 		struct list_head *pages, filler_t *filler, void *data);
 
+static inline struct page *read_mapping_page(struct address_space *mapping,
+					     unsigned long index, void *data)
+{
+	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+	return read_cache_page(mapping, index, filler, data);
+}
+
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 				unsigned long index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f2824c3c31b..cc367f7e75d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1477,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = -EINVAL;
 		goto bad_swap;
 	}
-	page = read_cache_page(mapping, 0,
-			(filler_t *)mapping->a_ops->readpage, swap_file);
+	page = read_mapping_page(mapping, 0, swap_file);
 	if (IS_ERR(page)) {
 		error = PTR_ERR(page);
 		goto bad_swap;
-- 
cgit v1.2.3-70-g09d2


From 3cbc564024d8f174202f023e8a2991782f6a9431 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Fri, 23 Jun 2006 02:05:40 -0700
Subject: [PATCH] percpu_counters: create lib/percpu_counter.c

- Move percpu_counter routines from mm/swap.c to lib/percpu_counter.c

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 lib/Makefile         |  1 +
 lib/percpu_counter.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/swap.c            | 42 ------------------------------------------
 3 files changed, 47 insertions(+), 42 deletions(-)
 create mode 100644 lib/percpu_counter.c

(limited to 'mm')

diff --git a/lib/Makefile b/lib/Makefile
index b830c9a1554..79358ad1f11 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
+obj-$(CONFIG_SMP) += percpu_counter.o
 
 obj-$(CONFIG_SWIOTLB) += swiotlb.o
 
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
new file mode 100644
index 00000000000..7a87003f8e8
--- /dev/null
+++ b/lib/percpu_counter.c
@@ -0,0 +1,46 @@
+/*
+ * Fast batching percpu counters.
+ */
+
+#include <linux/percpu_counter.h>
+#include <linux/module.h>
+
+void percpu_counter_mod(struct percpu_counter *fbc, long amount)
+{
+	long count;
+	long *pcount;
+	int cpu = get_cpu();
+
+	pcount = per_cpu_ptr(fbc->counters, cpu);
+	count = *pcount + amount;
+	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+		spin_lock(&fbc->lock);
+		fbc->count += count;
+		*pcount = 0;
+		spin_unlock(&fbc->lock);
+	} else {
+		*pcount = count;
+	}
+	put_cpu();
+}
+EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result.  This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+	long ret;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	ret = fbc->count;
+	for_each_possible_cpu(cpu) {
+		long *pcount = per_cpu_ptr(fbc->counters, cpu);
+		ret += *pcount;
+	}
+	spin_unlock(&fbc->lock);
+	return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
diff --git a/mm/swap.c b/mm/swap.c
index 88895c249bc..03ae2076f92 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -480,48 +480,6 @@ static int cpu_swap_callback(struct notifier_block *nfb,
 #endif /* CONFIG_HOTPLUG_CPU */
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_SMP
-void percpu_counter_mod(struct percpu_counter *fbc, long amount)
-{
-	long count;
-	long *pcount;
-	int cpu = get_cpu();
-
-	pcount = per_cpu_ptr(fbc->counters, cpu);
-	count = *pcount + amount;
-	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
-		spin_lock(&fbc->lock);
-		fbc->count += count;
-		*pcount = 0;
-		spin_unlock(&fbc->lock);
-	} else {
-		*pcount = count;
-	}
-	put_cpu();
-}
-EXPORT_SYMBOL(percpu_counter_mod);
-
-/*
- * Add up all the per-cpu counts, return the result.  This is a more accurate
- * but much slower version of percpu_counter_read_positive()
- */
-long percpu_counter_sum(struct percpu_counter *fbc)
-{
-	long ret;
-	int cpu;
-
-	spin_lock(&fbc->lock);
-	ret = fbc->count;
-	for_each_possible_cpu(cpu) {
-		long *pcount = per_cpu_ptr(fbc->counters, cpu);
-		ret += *pcount;
-	}
-	spin_unlock(&fbc->lock);
-	return ret < 0 ? 0 : ret;
-}
-EXPORT_SYMBOL(percpu_counter_sum);
-#endif
-
 /*
  * Perform any setup for the swap system
  */
-- 
cgit v1.2.3-70-g09d2


From e0f23603fb2607315ce52432cc4225df410828cf Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 23 Jun 2006 02:05:48 -0700
Subject: [PATCH] Remove semi-softlockup from invalidate_mapping_pages

If invalidate_mapping_pages is called to invalidate a very large mapping
(e.g.  a very large block device) and if the only active page in that
device is near the end (or at least, at a very large index), such as, say,
the superblock of an md array, and if that page happens to be locked when
invalidate_mapping_pages is called, then

  pagevec_lookup will return this page and
  as it is locked, 'next' will be incremented and pagevec_lookup
  will be called again. and again. and again.
  while we count from 0 upto a very large number.

We should really always set 'next' to 'page->index+1' before going around
the loop again, not just if the page isn't locked.

Cc: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/truncate.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/truncate.c b/mm/truncate.c
index 6cb3fff25f6..cf1b015df4a 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
+			pgoff_t index;
+			int lock_failed;
 
-			if (TestSetPageLocked(page)) {
-				next++;
-				continue;
-			}
-			if (page->index > next)
-				next = page->index;
+			lock_failed = TestSetPageLocked(page);
+
+			/*
+			 * We really shouldn't be looking at the ->index of an
+			 * unlocked page.  But we're not allowed to lock these
+			 * pages.  So we rely upon nobody altering the ->index
+			 * of this (pinned-by-us) page.
+			 */
+			index = page->index;
+			if (index > next)
+				next = index;
 			next++;
+			if (lock_failed)
+				continue;
+
 			if (PageDirty(page) || PageWriteback(page))
 				goto unlock;
 			if (page_mapped(page))
-- 
cgit v1.2.3-70-g09d2


From 125e18745f16685f69a34fd6130d47598fc4bf54 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Fri, 23 Jun 2006 02:06:06 -0700
Subject: [PATCH] More BUG_ON conversion

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: "Salyzyn, Mark" <mark_salyzyn@adaptec.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/setup.c        |  2 +-
 drivers/ide/ide-cd.c           |  3 +--
 drivers/ide/ide-dma.c          |  6 ++----
 drivers/ide/ide-floppy.c       | 12 ++++--------
 drivers/ide/ide-iops.c         | 12 ++++--------
 drivers/ide/ide-lib.c          |  3 +--
 drivers/ide/ide-taskfile.c     |  3 +--
 drivers/ide/ide.c              |  3 +--
 drivers/ide/pci/trm290.c       |  3 +--
 drivers/scsi/53c700.h          |  6 ++----
 drivers/scsi/aacraid/aachba.c  | 15 +++++----------
 drivers/scsi/aacraid/commsup.c |  6 ++----
 drivers/scsi/arm/queue.c       |  6 ++----
 drivers/scsi/ibmmca.c          |  3 +--
 drivers/scsi/ide-scsi.c        |  6 ++----
 drivers/scsi/megaraid.c        |  2 +-
 kernel/exit.c                  |  5 ++---
 mm/pdflush.c                   |  3 +--
 18 files changed, 34 insertions(+), 65 deletions(-)

(limited to 'mm')

diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c
index 5908deae960..1f7d65f29e7 100644
--- a/arch/frv/kernel/setup.c
+++ b/arch/frv/kernel/setup.c
@@ -814,7 +814,7 @@ void __init setup_arch(char **cmdline_p)
 	 * - by now the stack is part of the init task */
 	printk("Memory %08lx-%08lx\n", memory_start, memory_end);
 
-	if (memory_start == memory_end) BUG();
+	BUG_ON(memory_start == memory_end);
 
 	init_mm.start_code = (unsigned long) &_stext;
 	init_mm.end_code = (unsigned long) &_etext;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index abb83d95df5..6de3cd3d6e8 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1725,8 +1725,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		}
 	}
 
-	if (HWGROUP(drive)->handler != NULL)
-		BUG();
+	BUG_ON(HWGROUP(drive)->handler != NULL);
 
 	ide_set_handler(drive, cdrom_newpc_intr, rq->timeout, NULL);
 	return ide_started;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index c481be8b807..783a2475ee8 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -206,8 +206,7 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq)
 	ide_hwif_t *hwif = HWIF(drive);
 	struct scatterlist *sg = hwif->sg_table;
 
-	if ((rq->flags & REQ_DRIVE_TASKFILE) && rq->nr_sectors > 256)
-		BUG();
+	BUG_ON((rq->flags & REQ_DRIVE_TASKFILE) && rq->nr_sectors > 256);
 
 	ide_map_sg(drive, rq);
 
@@ -947,8 +946,7 @@ void ide_setup_dma (ide_hwif_t *hwif, unsigned long dma_base, unsigned int num_p
 	}
 	printk("\n");
 
-	if (!(hwif->dma_master))
-		BUG();
+	BUG_ON(!hwif->dma_master);
 }
 
 EXPORT_SYMBOL_GPL(ide_setup_dma);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index a53e3ce4a14..a1179e92496 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -898,8 +898,7 @@ static ide_startstop_t idefloppy_pc_intr (ide_drive_t *drive)
 					"to send us more data than expected "
 					"- discarding data\n");
 				idefloppy_discard_data(drive,bcount.all);
-				if (HWGROUP(drive)->handler != NULL)
-					BUG();
+				BUG_ON(HWGROUP(drive)->handler != NULL);
 				ide_set_handler(drive,
 						&idefloppy_pc_intr,
 						IDEFLOPPY_WAIT_CMD,
@@ -932,8 +931,7 @@ static ide_startstop_t idefloppy_pc_intr (ide_drive_t *drive)
 	pc->actually_transferred += bcount.all;
 	pc->current_position += bcount.all;
 
-	if (HWGROUP(drive)->handler != NULL)
-		BUG();
+	BUG_ON(HWGROUP(drive)->handler != NULL);
 	ide_set_handler(drive, &idefloppy_pc_intr, IDEFLOPPY_WAIT_CMD, NULL);		/* And set the interrupt handler again */
 	return ide_started;
 }
@@ -960,8 +958,7 @@ static ide_startstop_t idefloppy_transfer_pc (ide_drive_t *drive)
 				"issuing a packet command\n");
 		return ide_do_reset(drive);
 	}
-	if (HWGROUP(drive)->handler != NULL)
-		BUG();
+	BUG_ON(HWGROUP(drive)->handler != NULL);
 	/* Set the interrupt routine */
 	ide_set_handler(drive, &idefloppy_pc_intr, IDEFLOPPY_WAIT_CMD, NULL);
 	/* Send the actual packet */
@@ -1017,8 +1014,7 @@ static ide_startstop_t idefloppy_transfer_pc1 (ide_drive_t *drive)
 	 * 40 and 50msec work well. idefloppy_pc_intr will not be actually
 	 * used until after the packet is moved in about 50 msec.
 	 */
-	if (HWGROUP(drive)->handler != NULL)
-		BUG();
+	BUG_ON(HWGROUP(drive)->handler != NULL);
 	ide_set_handler(drive, 
 	  &idefloppy_pc_intr, 		/* service routine for packet command */
 	  floppy->ticks,		/* wait this long before "failing" */
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index b72dde70840..97a49e77a8f 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -939,8 +939,7 @@ void ide_execute_command(ide_drive_t *drive, task_ioreg_t cmd, ide_handler_t *ha
 	
 	spin_lock_irqsave(&ide_lock, flags);
 	
-	if(hwgroup->handler)
-		BUG();
+	BUG_ON(hwgroup->handler);
 	hwgroup->handler	= handler;
 	hwgroup->expiry		= expiry;
 	hwgroup->timer.expires	= jiffies + timeout;
@@ -981,8 +980,7 @@ static ide_startstop_t atapi_reset_pollfunc (ide_drive_t *drive)
 		printk("%s: ATAPI reset complete\n", drive->name);
 	} else {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
-			if (HWGROUP(drive)->handler != NULL)
-				BUG();
+			BUG_ON(HWGROUP(drive)->handler != NULL);
 			ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL);
 			/* continue polling */
 			return ide_started;
@@ -1021,8 +1019,7 @@ static ide_startstop_t reset_pollfunc (ide_drive_t *drive)
 
 	if (!OK_STAT(tmp = hwif->INB(IDE_STATUS_REG), 0, BUSY_STAT)) {
 		if (time_before(jiffies, hwgroup->poll_timeout)) {
-			if (HWGROUP(drive)->handler != NULL)
-				BUG();
+			BUG_ON(HWGROUP(drive)->handler != NULL);
 			ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL);
 			/* continue polling */
 			return ide_started;
@@ -1138,8 +1135,7 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi)
 	hwgroup = HWGROUP(drive);
 
 	/* We must not reset with running handlers */
-	if(hwgroup->handler != NULL)
-		BUG();
+	BUG_ON(hwgroup->handler != NULL);
 
 	/* For an ATAPI device, first try an ATAPI SRST. */
 	if (drive->media != ide_disk && !do_not_try_atapi) {
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 41d46dbe6c2..16a143133f9 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -164,8 +164,7 @@ u8 ide_rate_filter (u8 mode, u8 speed)
 //	printk("%s: mode 0x%02x, speed 0x%02x\n", __FUNCTION__, mode, speed);
 
 	/* So that we remember to update this if new modes appear */
-	if (mode > 4)
-		BUG();
+	BUG_ON(mode > 4);
 	return min(speed, speed_max[mode]);
 #else /* !CONFIG_BLK_DEV_IDEDMA */
 	return min(speed, (u8)XFER_PIO_4);
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 9233b8109a0..a839b2a8f6f 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -196,8 +196,7 @@ ide_startstop_t set_geometry_intr (ide_drive_t *drive)
 	if (stat & (ERR_STAT|DRQ_STAT))
 		return ide_error(drive, "set_geometry_intr", stat);
 
-	if (HWGROUP(drive)->handler != NULL)
-		BUG();
+	BUG_ON(HWGROUP(drive)->handler != NULL);
 	ide_set_handler(drive, &set_geometry_intr, WAIT_WORSTCASE, NULL);
 	return ide_started;
 }
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index f5a60912c27..9799aed772e 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -1367,8 +1367,7 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 
 			ide_abort(drive, "drive reset");
 
-			if(HWGROUP(drive)->handler)
-				BUG();
+			BUG_ON(HWGROUP(drive)->handler);
 				
 			/* Ensure nothing gets queued after we
 			   drop the lock. Reset will clear the busy */
diff --git a/drivers/ide/pci/trm290.c b/drivers/ide/pci/trm290.c
index c26c8ca90dd..fe80295974e 100644
--- a/drivers/ide/pci/trm290.c
+++ b/drivers/ide/pci/trm290.c
@@ -183,8 +183,7 @@ static void trm290_ide_dma_exec_cmd(ide_drive_t *drive, u8 command)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
 
-	if (HWGROUP(drive)->handler != NULL)	/* paranoia check */
-		BUG();
+	BUG_ON(HWGROUP(drive)->handler != NULL);	/* paranoia check */
 	ide_set_handler(drive, &ide_dma_intr, WAIT_CMD, NULL);
 	/* issue cmd to drive */
 	hwif->OUTB(command, IDE_COMMAND_REG);
diff --git a/drivers/scsi/53c700.h b/drivers/scsi/53c700.h
index d8041952b1b..7f22a06fe5e 100644
--- a/drivers/scsi/53c700.h
+++ b/drivers/scsi/53c700.h
@@ -474,8 +474,7 @@ NCR_700_readl(struct Scsi_Host *host, __u32 reg)
 		ioread32(hostdata->base + reg);
 #if 1
 	/* sanity check the register */
-	if((reg & 0x3) != 0)
-		BUG();
+	BUG_ON((reg & 0x3) != 0);
 #endif
 
 	return value;
@@ -498,8 +497,7 @@ NCR_700_writel(__u32 value, struct Scsi_Host *host, __u32 reg)
 
 #if 1
 	/* sanity check the register */
-	if((reg & 0x3) != 0)
-		BUG();
+	BUG_ON((reg & 0x3) != 0);
 #endif
 
 	bEBus ? iowrite32be(value, hostdata->base + reg): 
diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index ea9e038813e..83b5c7d085f 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -396,8 +396,7 @@ static void get_container_name_callback(void *context, struct fib * fibptr)
 	scsicmd->SCp.phase = AAC_OWNER_MIDLEVEL;
 
 	dprintk((KERN_DEBUG "get_container_name_callback[cpu %d]: t = %ld.\n", smp_processor_id(), jiffies));
-	if (fibptr == NULL)
-		BUG();
+	BUG_ON(fibptr == NULL);
 
 	get_name_reply = (struct aac_get_name_resp *) fib_data(fibptr);
 	/* Failure is irrelevant, using default value instead */
@@ -956,8 +955,7 @@ static void io_callback(void *context, struct fib * fibptr)
 		  smp_processor_id(), (unsigned long long)lba, jiffies);
 	}
 
-	if (fibptr == NULL)
-		BUG();
+	BUG_ON(fibptr == NULL);
 		
 	if(scsicmd->use_sg)
 		pci_unmap_sg(dev->pdev, 
@@ -1092,8 +1090,7 @@ static int aac_read(struct scsi_cmnd * scsicmd, int cid)
 		
 		aac_build_sgraw(scsicmd, &readcmd->sg);
 		fibsize = sizeof(struct aac_raw_io) + ((le32_to_cpu(readcmd->sg.count) - 1) * sizeof (struct sgentryraw));
-		if (fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr)))
-			BUG();
+		BUG_ON(fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr)));
 		/*
 		 *	Now send the Fib to the adapter
 		 */
@@ -1261,8 +1258,7 @@ static int aac_write(struct scsi_cmnd * scsicmd, int cid)
 		
 		aac_build_sgraw(scsicmd, &writecmd->sg);
 		fibsize = sizeof(struct aac_raw_io) + ((le32_to_cpu(writecmd->sg.count) - 1) * sizeof (struct sgentryraw));
-		if (fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr)))
-			BUG();
+		BUG_ON(fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr)));
 		/*
 		 *	Now send the Fib to the adapter
 		 */
@@ -1904,8 +1900,7 @@ static void aac_srb_callback(void *context, struct fib * fibptr)
 	scsicmd->SCp.phase = AAC_OWNER_MIDLEVEL;
 	dev = (struct aac_dev *)scsicmd->device->host->hostdata;
 
-	if (fibptr == NULL)
-		BUG();
+	BUG_ON(fibptr == NULL);
 
 	srbreply = (struct aac_srb_reply *) fib_data(fibptr);
 
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index d2ef17ea44f..3f27419c66a 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -229,8 +229,7 @@ void aac_fib_init(struct fib *fibptr)
 static void fib_dealloc(struct fib * fibptr)
 {
 	struct hw_fib *hw_fib = fibptr->hw_fib;
-	if(hw_fib->header.StructType != FIB_MAGIC) 
-		BUG();
+	BUG_ON(hw_fib->header.StructType != FIB_MAGIC);
 	hw_fib->header.XferState = 0;        
 }
 
@@ -530,8 +529,7 @@ int aac_fib_send(u16 command, struct fib *fibptr, unsigned long size,
 			}
 		} else
 			down(&fibptr->event_wait);
-		if(fibptr->done == 0)
-			BUG();
+		BUG_ON(fibptr->done == 0);
 			
 		if((fibptr->flags & FIB_CONTEXT_FLAG_TIMED_OUT)){
 			return -ETIMEDOUT;
diff --git a/drivers/scsi/arm/queue.c b/drivers/scsi/arm/queue.c
index b10750bb5c0..8caa5903ce3 100644
--- a/drivers/scsi/arm/queue.c
+++ b/drivers/scsi/arm/queue.c
@@ -118,8 +118,7 @@ int __queue_add(Queue_t *queue, Scsi_Cmnd *SCpnt, int head)
 	list_del(l);
 
 	q = list_entry(l, QE_t, list);
-	if (BAD_MAGIC(q, QUEUE_MAGIC_FREE))
-		BUG();
+	BUG_ON(BAD_MAGIC(q, QUEUE_MAGIC_FREE));
 
 	SET_MAGIC(q, QUEUE_MAGIC_USED);
 	q->SCpnt = SCpnt;
@@ -144,8 +143,7 @@ static Scsi_Cmnd *__queue_remove(Queue_t *queue, struct list_head *ent)
 	 */
 	list_del(ent);
 	q = list_entry(ent, QE_t, list);
-	if (BAD_MAGIC(q, QUEUE_MAGIC_USED))
-		BUG();
+	BUG_ON(BAD_MAGIC(q, QUEUE_MAGIC_USED));
 
 	SET_MAGIC(q, QUEUE_MAGIC_FREE);
 	list_add(ent, &queue->free);
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index 0c6dc31bb14..115f55471ed 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -2241,8 +2241,7 @@ static int __ibmmca_host_reset(Scsi_Cmnd * cmd)
 	int host_index;
 	unsigned long imm_command;
 
-	if (cmd == NULL)
-		BUG();
+	BUG_ON(cmd == NULL);
 
 	ticks = IM_RESET_DELAY * HZ;
 	shpnt = cmd->device->host;
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 39b760a2424..988e6f7af01 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -600,8 +600,7 @@ static ide_startstop_t idescsi_transfer_pc(ide_drive_t *drive)
 				"issuing a packet command\n");
 		return ide_do_reset (drive);
 	}
-	if (HWGROUP(drive)->handler != NULL)
-		BUG();
+	BUG_ON(HWGROUP(drive)->handler != NULL);
 	/* Set the interrupt routine */
 	ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry);
 	/* Send the actual packet */
@@ -691,8 +690,7 @@ static ide_startstop_t idescsi_issue_pc (ide_drive_t *drive, idescsi_pc_t *pc)
 		set_bit(PC_DMA_OK, &pc->flags);
 
 	if (test_bit(IDESCSI_DRQ_INTERRUPT, &scsi->flags)) {
-		if (HWGROUP(drive)->handler != NULL)
-			BUG();
+		BUG_ON(HWGROUP(drive)->handler != NULL);
 		ide_set_handler(drive, &idescsi_transfer_pc,
 				get_timeout(pc), idescsi_expiry);
 		/* Issue the packet command */
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index c33857e7b33..5d2cefb5e52 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -1828,7 +1828,7 @@ mega_build_sglist(adapter_t *adapter, scb_t *scb, u32 *buf, u32 *len)
 
 	scb->dma_type = MEGA_SGLIST;
 
-	if( sgcnt > adapter->sglen ) BUG();
+	BUG_ON(sgcnt > adapter->sglen);
 
 	*len = 0;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index e06d0c10a24..a3baf92462b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -579,7 +579,7 @@ static void exit_mm(struct task_struct * tsk)
 		down_read(&mm->mmap_sem);
 	}
 	atomic_inc(&mm->mm_count);
-	if (mm != tsk->active_mm) BUG();
+	BUG_ON(mm != tsk->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(tsk);
 	tsk->mm = NULL;
@@ -1530,8 +1530,7 @@ check_continued:
 		if (options & __WNOTHREAD)
 			break;
 		tsk = next_thread(tsk);
-		if (tsk->signal != current->signal)
-			BUG();
+		BUG_ON(tsk->signal != current->signal);
 	} while (tsk != current);
 
 	read_unlock(&tasklist_lock);
diff --git a/mm/pdflush.c b/mm/pdflush.c
index c4b6d0afd73..df7e50b8f70 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
 	unsigned long flags;
 	int ret = 0;
 
-	if (fn == NULL)
-		BUG();		/* Hard to diagnose if it's deferred */
+	BUG_ON(fn == NULL);	/* Hard to diagnose if it's deferred */
 
 	spin_lock_irqsave(&pdflush_lock, flags);
 	if (list_empty(&pdflush_list)) {
-- 
cgit v1.2.3-70-g09d2


From b31dc66a54ad986b6b73bdc49c8efc17cbad1833 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 13 Jun 2006 08:26:10 +0200
Subject: [PATCH] Kill PF_SYNCWRITE flag

A process flag to indicate whether we are doing sync io is incredibly
ugly. It also causes performance problems when one does a lot of async
io and then proceeds to sync it. Part of the io will go out as async,
and the other part as sync. This causes a disconnect between the
previously submitted io and the synced io. For io schedulers such as CFQ,
this will cause us lost merges and suboptimal behaviour in scheduling.

Remove PF_SYNCWRITE completely from the fsync/msync paths, and let
the O_DIRECT path just directly indicate that the writes are sync
by using WRITE_SYNC instead.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/as-iosched.c                |  2 +-
 block/cfq-iosched.c               |  4 +---
 block/ll_rw_blk.c                 |  3 +++
 drivers/usb/gadget/file_storage.c |  2 --
 fs/buffer.c                       |  2 --
 fs/direct-io.c                    | 18 ++++++++----------
 fs/fs-writeback.c                 |  2 --
 include/linux/blkdev.h            |  2 ++
 include/linux/sched.h             | 11 +++++------
 mm/msync.c                        |  3 ---
 10 files changed, 20 insertions(+), 29 deletions(-)

(limited to 'mm')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 9b13d72ffef..56c99fa037d 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1339,7 +1339,7 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 	arq->state = AS_RQ_NEW;
 
 	if (rq_data_dir(arq->request) == READ
-			|| current->flags&PF_SYNCWRITE)
+			|| (arq->request->flags & REQ_RW_SYNC))
 		arq->is_sync = 1;
 	else
 		arq->is_sync = 0;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c88f161d3fb..4c4e9cc3ae2 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -277,8 +277,6 @@ static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsi
 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 
-#define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
-
 /*
  * lots of deadline iosched dupes, can be abstracted later...
  */
@@ -334,7 +332,7 @@ static int cfq_queue_empty(request_queue_t *q)
 
 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
 {
-	if (rw == READ || process_sync(task))
+	if (rw == READ || rw == WRITE_SYNC)
 		return task->pid;
 
 	return CFQ_KEY_ASYNC;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 17c42ddd31d..2270bb45138 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2827,6 +2827,9 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
 	if (unlikely(bio_barrier(bio)))
 		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
+	if (bio_sync(bio))
+		req->flags |= REQ_RW_SYNC;
+
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
 	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 6f887478b14..a43dc908ac5 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -1906,7 +1906,6 @@ static int fsync_sub(struct lun *curlun)
 
 	inode = filp->f_dentry->d_inode;
 	mutex_lock(&inode->i_mutex);
-	current->flags |= PF_SYNCWRITE;
 	rc = filemap_fdatawrite(inode->i_mapping);
 	err = filp->f_op->fsync(filp, filp->f_dentry, 1);
 	if (!rc)
@@ -1914,7 +1913,6 @@ static int fsync_sub(struct lun *curlun)
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!rc)
 		rc = err;
-	current->flags &= ~PF_SYNCWRITE;
 	mutex_unlock(&inode->i_mutex);
 	VLDBG(curlun, "fdatasync -> %d\n", rc);
 	return rc;
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f1f3a6807..373bb6292bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync)
 		goto out;
 	}
 
-	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(mapping);
 
 	/*
@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync)
 	err = filemap_fdatawait(mapping);
 	if (!ret)
 		ret = err;
-	current->flags &= ~PF_SYNCWRITE;
 out:
 	return ret;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b05d1b21877..538fb0418fb 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 		NULL);				/* vmas */
 	up_read(&current->mm->mmap_sem);
 
-	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
-		create = dio->rw == WRITE;
+		create = dio->rw & WRITE;
 		if (dio->lock_type == DIO_LOCKING) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
@@ -867,7 +867,7 @@ do_holes:
 				loff_t i_size_aligned;
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
-				if (dio->rw == WRITE) {
+				if (dio->rw & WRITE) {
 					page_cache_release(page);
 					return -ENOTBLK;
 				}
@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && rw == WRITE) {
+	if (ret == -ENOTBLK && (rw & WRITE)) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->is_async) {
 		int should_wait = 0;
 
-		if (dio->result < dio->size && rw == WRITE) {
+		if (dio->result < dio->size && (rw & WRITE)) {
 			dio->waiter = current;
 			should_wait = 1;
 		}
@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 			ret = transferred;
 
 		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && rw == WRITE &&
+		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
 		    ret >= 0 && dio->result == dio->size)
 			/*
 			 * For AIO writes where we have completed the
@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		current->flags |= PF_SYNCWRITE;
+		rw = WRITE_SYNC;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * even for AIO, we need to wait for i/o to complete before
 	 * returning in this case.
 	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
@@ -1284,8 +1284,6 @@ out:
 		mutex_unlock(&inode->i_mutex);
 	else if (acquire_i_mutex)
 		mutex_lock(&inode->i_mutex);
-	if (rw & WRITE)
-		current->flags &= ~PF_SYNCWRITE;
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6db95cf3aaa..031b27a4bc9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 	int need_write_inode_now = 0;
 	int err2;
 
-	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 		if (!err)
 			err = err2;
 	}
-	current->flags &= ~PF_SYNCWRITE;
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3457e7b9736..482a21d6762 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -241,6 +241,7 @@ enum rq_flag_bits {
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
+	__REQ_RW_SYNC,		/* request is sync (O_DIRECT) */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -270,6 +271,7 @@ enum rq_flag_bits {
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
 #define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
+#define REQ_RW_SYNC	(1 << __REQ_RW_SYNC)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a9d23c7d1b2..38b4791e6a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -941,12 +941,11 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
-#define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
-#define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
-#define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
-#define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
-#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
-#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_BORROWED_MM	0x00200000	/* I am a kthread doing use_mm */
+#define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
diff --git a/mm/msync.c b/mm/msync.c
index bc6c9537636..d083544df21 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
 	down_read(&current->mm->mmap_sem);
-	if (flags & MS_SYNC)
-		current->flags |= PF_SYNCWRITE;
 	vma = find_vma(current->mm, start);
 	if (!vma) {
 		error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 		}
 	} while (vma && !done);
 out_unlock:
-	current->flags &= ~PF_SYNCWRITE;
 	up_read(&current->mm->mmap_sem);
 out:
 	return error;
-- 
cgit v1.2.3-70-g09d2


From 9f1a3cfcffaed2fbb3206179295c79ca8289f5c3 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Sun, 25 Jun 2006 05:46:46 -0700
Subject: [PATCH] AOP_TRUNCATED_PAGE victims in read_pages() belong in the LRU

AOP_TRUNCATED_PAGE victims in read_pages() belong in the LRU

Nick Piggin rightly pointed out that the introduction of AOP_TRUNCATED_PAGE
to read_pages() was wrong to leave A_T_P victim pages in the page cache but
not put them in the LRU.  Failing to do so hid them from the VM.

A_T_P just means that the aop method unlocked the page rather than
performing IO.  It would be very rare that the page was truncated between
the unlock and testing A_T_P.  So we leave the pages in the LRU for likely
reuse soon rather than backing them back out of the page cache.  We do this
by matching the behaviour before the A_T_P introduction which added pages
to the LRU regardless of what ->readpage() did.

This doesn't include the unrelated cleanup in Nick's initial fix which
changed read_pages() to return void to match its only caller's behaviour of
ignoring errors.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/readahead.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index 0f142a40984..4ee52cadab9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -182,14 +182,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		list_del(&page->lru);
 		if (!add_to_page_cache(page, mapping,
 					page->index, GFP_KERNEL)) {
-			ret = mapping->a_ops->readpage(filp, page);
-			if (ret != AOP_TRUNCATED_PAGE) {
-				if (!pagevec_add(&lru_pvec, page))
-					__pagevec_lru_add(&lru_pvec);
-				continue;
-			} /* else fall through to release */
-		}
-		page_cache_release(page);
+			mapping->a_ops->readpage(filp, page);
+			if (!pagevec_add(&lru_pvec, page))
+				__pagevec_lru_add(&lru_pvec);
+		} else
+			page_cache_release(page);
 	}
 	pagevec_lru_add(&lru_pvec);
 	ret = 0;
-- 
cgit v1.2.3-70-g09d2


From 7b2259b3e53f128c10a9fded0965e69d4a949847 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 25 Jun 2006 05:46:48 -0700
Subject: [PATCH] page migration: Support a vma migration function

Hooks for calling vma specific migration functions

With this patch a vma may define a vma->vm_ops->migrate function.  That
function may perform page migration on its own (some vmas may not contain page
structs and therefore cannot be handled by regular page migration.  Pages in a
vma may require special preparatory treatment before migration is possible
etc) .  Only mmap_sem is held when the migration function is called.  The
migrate() function gets passed two sets of nodemasks describing the source and
the target of the migration.  The flags parameter either contains

MPOL_MF_MOVE	which means that only pages used exclusively by
		the specified mm should be moved

or

MPOL_MF_MOVE_ALL which means that pages shared with other processes
		should also be moved.

The migration function returns 0 on success or an error condition.  An error
condition will prevent regular page migration from occurring.

On its own this patch cannot be included since there are no users for this
functionality.  But it seems that the uncached allocator will need this
functionality at some point.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/migrate.h | 11 ++++++++++-
 include/linux/mm.h      |  2 ++
 mm/mempolicy.c          |  6 +++++-
 mm/migrate.c            | 20 ++++++++++++++++++++
 4 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 5dba23a1c0d..48148e0cdbd 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -16,7 +16,9 @@ extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
 
 extern int migrate_prep(void);
-
+extern int migrate_vmas(struct mm_struct *mm,
+		const nodemask_t *from, const nodemask_t *to,
+		unsigned long flags);
 #else
 
 static inline int isolate_lru_page(struct page *p, struct list_head *list)
@@ -30,6 +32,13 @@ static inline int migrate_pages_to(struct list_head *pagelist,
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 
+static inline int migrate_vmas(struct mm_struct *mm,
+		const nodemask_t *from, const nodemask_t *to,
+		unsigned long flags)
+{
+	return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 71c5d2f667e..a929ea197e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,6 +206,8 @@ struct vm_operations_struct {
 	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
 					unsigned long addr);
+	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
+		const nodemask_t *to, unsigned long flags);
 #endif
 };
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ec4a1a950df..73e0f23b7f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -632,6 +632,10 @@ int do_migrate_pages(struct mm_struct *mm,
 
   	down_read(&mm->mmap_sem);
 
+	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+	if (err)
+		goto out;
+
 /*
  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -691,7 +695,7 @@ int do_migrate_pages(struct mm_struct *mm,
 		if (err < 0)
 			break;
 	}
-
+out:
 	up_read(&mm->mmap_sem);
 	if (err < 0)
 		return err;
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c2a71aa05c..0576c053598 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -976,3 +976,23 @@ out2:
 }
 #endif
 
+/*
+ * Call migration functions in the vma_ops that may prepare
+ * memory in a vm for migration. migration functions may perform
+ * the migration for vmas that do not have an underlying page struct.
+ */
+int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
+	const nodemask_t *from, unsigned long flags)
+{
+ 	struct vm_area_struct *vma;
+ 	int err = 0;
+
+ 	for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+ 		if (vma->vm_ops && vma->vm_ops->migrate) {
+ 			err = vma->vm_ops->migrate(vma, to, from, flags);
+ 			if (err)
+ 				break;
+ 		}
+ 	}
+ 	return err;
+}
-- 
cgit v1.2.3-70-g09d2


From e6a1530d692d6a60cdf15dfbcfea07f5324d7b9f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sun, 25 Jun 2006 05:46:49 -0700
Subject: [PATCH] Allow migration of mlocked pages

Hugh clarified the role of VM_LOCKED.  So we can now implement page
migration for mlocked pages.

Allow the migration of mlocked pages.  This means that try_to_unmap must
unmap mlocked pages in the migration case.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 10 ++++------
 mm/rmap.c    |  9 ++++-----
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 0576c053598..3f1e0c2c942 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -616,15 +616,13 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	/*
 	 * Establish migration ptes or remove ptes
 	 */
-	if (try_to_unmap(page, 1) != SWAP_FAIL) {
-		if (!page_mapped(page))
-			rc = move_to_new_page(newpage, page);
-	} else
-		/* A vma has VM_LOCKED set -> permanent failure */
-		rc = -EPERM;
+	try_to_unmap(page, 1);
+	if (!page_mapped(page))
+		rc = move_to_new_page(newpage, page);
 
 	if (rc)
 		remove_migration_ptes(page, page);
+
 unlock:
 	unlock_page(page);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 882a85826bb..e76909e880c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -562,9 +562,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)
-				&& !migration)) {
+	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
+			(ptep_clear_flush_young(vma, address, pte)))) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
@@ -771,7 +770,7 @@ static int try_to_unmap_file(struct page *page, int migration)
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-		if (vma->vm_flags & VM_LOCKED)
+		if ((vma->vm_flags & VM_LOCKED) && !migration)
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -805,7 +804,7 @@ static int try_to_unmap_file(struct page *page, int migration)
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if (vma->vm_flags & VM_LOCKED)
+			if ((vma->vm_flags & VM_LOCKED) && !migration)
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&
-- 
cgit v1.2.3-70-g09d2


From d616e09ab33aa4d013a93c9b393efd5cebf78521 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 05:47:46 -0700
Subject: [PATCH] pdflush: handle resume wakeups

pdflush is carefully designed to ensure that all wakeups have some
corresponding work to do - if a woken-up pdflush thread discovers that it
hasn't been given any work to do then this is considered an error.

That all broke when swsusp came along - because a timer-delivered wakeup to a
frozen pdflush thread will just get lost.  This causes the pdflush thread to
get lost as well: the writeback timer is supposed to be re-armed by pdflush in
process context, but pdflush doesn't execute the callout which does this.

Fix that up by ignoring the return value from try_to_freeze(): jsut proceed,
see if we have any work pending and only go back to sleep if that is not the
case.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/pdflush.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/pdflush.c b/mm/pdflush.c
index df7e50b8f70..b02102feeb4 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work)
 		list_move(&my_work->list, &pdflush_list);
 		my_work->when_i_went_to_sleep = jiffies;
 		spin_unlock_irq(&pdflush_lock);
-
 		schedule();
-		if (try_to_freeze()) {
-			spin_lock_irq(&pdflush_lock);
-			continue;
-		}
-
+		try_to_freeze();
 		spin_lock_irq(&pdflush_lock);
 		if (!list_empty(&my_work->list)) {
-			printk("pdflush: bogus wakeup!\n");
+			/*
+			 * Someone woke us up, but without removing our control
+			 * structure from the global list.  swsusp will do this
+			 * in try_to_freeze()->refrigerator().  Handle it.
+			 */
 			my_work->fn = NULL;
 			continue;
 		}
 		if (my_work->fn == NULL) {
-			printk("pdflush: NULL work function\n");
+			printk("pdflush: bogus wakeup\n");
 			continue;
 		}
 		spin_unlock_irq(&pdflush_lock);
-- 
cgit v1.2.3-70-g09d2


From 43b0bc00fdbf2f1503a57f0c2c1338438c5d2805 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Sun, 25 Jun 2006 05:47:55 -0700
Subject: [PATCH] cpuset: remove extra cpuset_zone_allowed check in
 __alloc_pages

This is redundant with check in wakeup_kswapd.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 423db0db7c0..6c1174fcf52 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -957,8 +957,7 @@ restart:
 		goto got_pg;
 
 	do {
-		if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL))
-			wakeup_kswapd(*z, order);
+		wakeup_kswapd(*z, order);
 	} while (*(++z));
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From 01408c4939479ec46c15aa7ef6e2406be50eeeca Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sun, 25 Jun 2006 05:47:58 -0700
Subject: [PATCH] Prepare for __copy_from_user_inatomic to not zero missed
 bytes

The problem is that when we write to a file, the copy from userspace to
pagecache is first done with preemption disabled, so if the source address is
not immediately available the copy fails *and* *zeros* *the* *destination*.

This is a problem because a concurrent read (which admittedly is an odd thing
to do) might see zeros rather that was there before the write, or what was
there after, or some mixture of the two (any of these being a reasonable thing
to see).

If the copy did fail, it will immediately be retried with preemption
re-enabled so any transient problem with accessing the source won't cause an
error.

The first copying does not need to zero any uncopied bytes, and doing so
causes the problem.  It uses copy_from_user_atomic rather than copy_from_user
so the simple expedient is to change copy_from_user_atomic to *not* zero out
bytes on failure.

The first of these two patches prepares for the change by fixing two places
which assume copy_from_user_atomic does zero the tail.  The two usages are
very similar pieces of code which copy from a userspace iovec into one or more
page-cache pages.  These are changed to remove the assumption.

The second patch changes __copy_from_user_inatomic* to not zero the tail.
Once these are accepted, I will look at similar patches of other architectures
where this is important (ppc, mips and sparc being the ones I can find).

This patch:

There is a problem with __copy_from_user_inatomic zeroing the tail of the
buffer in the case of an error.  As it is called in atomic context, the error
may be transient, so it results in zeros being written where maybe they
shouldn't be.

In the usage in filemap, this opens a window for a well timed read to see data
(zeros) which is not consistent with any ordering of reads and writes.

Most cases where __copy_from_user_inatomic is called, a failure results in
__copy_from_user being called immediately.  As long as the latter zeros the
tail, the former doesn't need to.  However in *copy_from_user_iovec
implementations (in both filemap and ntfs/file), it is assumed that
copy_from_user_inatomic will zero the tail.

This patch removes that assumption, so that after this patch it will
be safe for copy_from_user_inatomic to not zero the tail.

This patch also adds some commentary to filemap.h and asm-i386/uaccess.h.

After this patch, all architectures that might disable preempt when
kmap_atomic is called need to have their __copy_from_user_inatomic* "fixed".
This includes
 - powerpc
 - i386
 - mips
 - sparc

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ntfs/file.c             | 26 ++++++++++++++------------
 include/asm-i386/uaccess.h |  6 ++++++
 mm/filemap.c               |  8 ++------
 mm/filemap.h               | 26 ++++++++++++++++++--------
 4 files changed, 40 insertions(+), 26 deletions(-)

(limited to 'mm')

diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 88292f9e4b9..2e42c2dcae1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1358,7 +1358,7 @@ err_out:
 	goto out;
 }
 
-static size_t __ntfs_copy_from_user_iovec(char *vaddr,
+static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
 		const struct iovec *iov, size_t iov_ofs, size_t bytes)
 {
 	size_t total = 0;
@@ -1376,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr,
 		bytes -= len;
 		vaddr += len;
 		if (unlikely(left)) {
-			/*
-			 * Zero the rest of the target like __copy_from_user().
-			 */
-			memset(vaddr, 0, bytes);
 			total -= left;
 			break;
 		}
@@ -1420,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
  * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
  * single-segment behaviour.
  *
- * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
- * when not atomic.  This is ok because __ntfs_copy_from_user_iovec() calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep().  And on many
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
+ * when atomic and when not atomic.  This is ok because
+ * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
+ * and it is ok to call this when non-atomic.
+ * Infact, the only difference between __copy_from_user_inatomic() and
+ * __copy_from_user() is that the latter calls might_sleep() and the former
+ * should not zero the tail of the buffer on error.  And on many
  * architectures __copy_from_user_inatomic() is just defined to
  * __copy_from_user() so it makes no difference at all on those architectures.
  */
@@ -1441,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
 		if (len > bytes)
 			len = bytes;
 		kaddr = kmap_atomic(*pages, KM_USER0);
-		copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+		copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 				*iov, *iov_ofs, len);
 		kunmap_atomic(kaddr, KM_USER0);
 		if (unlikely(copied != len)) {
 			/* Do it the slow way. */
 			kaddr = kmap(*pages);
-			copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+			copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 					*iov, *iov_ofs, len);
+			/*
+			 * Zero the rest of the target like __copy_from_user().
+			 */
+			memset(kaddr + ofs + copied, 0, len - copied);
 			kunmap(*pages);
 			if (unlikely(copied != len))
 				goto err_out;
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 8462f8e0e65..d0d253277be 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -458,6 +458,12 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
  *
  * If some data could not be copied, this function will pad the copied
  * data to the requested size using zero bytes.
+ *
+ * An alternate version - __copy_from_user_inatomic() - may be called from
+ * atomic context and will fail rather than sleep.  In this case the
+ * uncopied bytes will *NOT* be padded with zeros.  See fs/filemap.h
+ * for explanation of why this is needed.
+ * FIXME this isn't implimented yet EMXIF
  */
 static __always_inline unsigned long
 __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
diff --git a/mm/filemap.c b/mm/filemap.c
index 807a463fd5e..1ed4be2a765 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1892,7 +1892,7 @@ int remove_suid(struct dentry *dentry)
 EXPORT_SYMBOL(remove_suid);
 
 size_t
-__filemap_copy_from_user_iovec(char *vaddr, 
+__filemap_copy_from_user_iovec_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
 	size_t copied = 0, left = 0;
@@ -1908,12 +1908,8 @@ __filemap_copy_from_user_iovec(char *vaddr,
 		vaddr += copy;
 		iov++;
 
-		if (unlikely(left)) {
-			/* zero the rest of the target like __copy_from_user */
-			if (bytes)
-				memset(vaddr, 0, bytes);
+		if (unlikely(left))
 			break;
-		}
 	}
 	return copied - left;
 }
diff --git a/mm/filemap.h b/mm/filemap.h
index 5683cde2205..536979fb4ba 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -16,15 +16,23 @@
 #include <linux/uaccess.h>
 
 size_t
-__filemap_copy_from_user_iovec(char *vaddr,
-			       const struct iovec *iov,
-			       size_t base,
-			       size_t bytes);
+__filemap_copy_from_user_iovec_inatomic(char *vaddr,
+					const struct iovec *iov,
+					size_t base,
+					size_t bytes);
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
  * were sucessfully copied.  If a fault is encountered then clear the page
  * out to (offset+bytes) and return the number of bytes which were copied.
+ *
+ * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
+ * to *NOT* zero any tail of the buffer that it failed to copy.  If it does,
+ * and if the following non-atomic copy succeeds, then there is a small window
+ * where the target page contains neither the data before the write, nor the
+ * data after the write (it contains zero).  A read at this time will see
+ * data that is inconsistent with any ordering of the read and the write.
+ * (This has been detected in practice).
  */
 static inline size_t
 filemap_copy_from_user(struct page *page, unsigned long offset,
@@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
 	size_t copied;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-						base, bytes);
+	copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
+							 base, bytes);
 	kunmap_atomic(kaddr, KM_USER0);
 	if (copied != bytes) {
 		kaddr = kmap(page);
-		copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-							base, bytes);
+		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
+								 base, bytes);
+		if (bytes - copied)
+			memset(kaddr + offset + copied, 0, bytes - copied);
 		kunmap(page);
 	}
 	return copied;
-- 
cgit v1.2.3-70-g09d2


From bd40cddae2211950c81c41f25a818189f80fd0b5 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sun, 25 Jun 2006 05:48:08 -0700
Subject: [PATCH] kernel-doc: mm/readhead fixup

Put short function description for read_cache_pages() on one line as needed
by kernel-doc.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/readahead.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index 4ee52cadab9..e39e416860d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -118,8 +118,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
 /**
- * read_cache_pages - populate an address space with some pages, and
- * 			start reads against them.
+ * read_cache_pages - populate an address space with some pages & start reads against them
  * @mapping: the address_space
  * @pages: The address of a list_head which contains the target pages.  These
  *   pages have their ->index populated and are otherwise uninitialised.
-- 
cgit v1.2.3-70-g09d2


From 76d42bd96984832c4ea8bc8cbd74e496ac31409e Mon Sep 17 00:00:00 2001
From: Wu Fengguang <wfg@mail.ustc.edu.cn>
Date: Sun, 25 Jun 2006 05:48:43 -0700
Subject: [PATCH] readahead: backoff on I/O error

Backoff readahead size exponentially on I/O error.

Michael Tokarev <mjt@tls.msk.ru> described the problem as:

[QUOTE]
Suppose there's a CD-rom with a scratch/etc, one sector is unreadable.
In order to "fix" it, one have to read it and write to another CD-rom,
or something.. or just ignore the error (if it's just a skip in a video
stream).  Let's assume the unreadable block is number U.

But current behavior is just insane.  An application requests block
number N, which is before U. Kernel tries to read-ahead blocks N..U.
Cdrom drive tries to read it, re-read it.. for some time.  Finally,
when all the N..U-1 blocks are read, kernel returns block number N
(as requested) to an application, successefully.

Now an app requests block number N+1, and kernel tries to read
blocks N+1..U+1.  Retrying again as in previous step.

And so on, up to when an app requests block number U-1.  And when,
finally, it requests block U, it receives read error.

So, kernel currentry tries to re-read the same failing block as
many times as the current readahead value (256 (times?) by default).

This whole process already killed my cdrom drive (I posted about it
to LKML several months ago) - literally, the drive has fried, and
does not work anymore.  Ofcourse that problem was a bug in firmware
(or whatever) of the drive *too*, but.. main problem with that is
current readahead logic as described above.
[/QUOTE]

Which was confirmed by Jens Axboe <axboe@suse.de>:

[QUOTE]
For ide-cd, it tends do only end the first part of the request on a
medium error. So you may see a lot of repeats :/
[/QUOTE]

With this patch, retries are expected to be reduced from, say, 256, to 5.

[akpm@osdl.org: cleanups]
Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 1ed4be2a765..9c7334bafda 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -828,6 +828,32 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 }
 EXPORT_SYMBOL(grab_cache_page_nowait);
 
+/*
+ * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+ * a _large_ part of the i/o request. Imagine the worst scenario:
+ *
+ *      ---R__________________________________________B__________
+ *         ^ reading here                             ^ bad block(assume 4k)
+ *
+ * read(R) => miss => readahead(R...B) => media error => frustrating retries
+ * => failing the whole request => read(R) => read(R+1) =>
+ * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
+ * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
+ * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
+ *
+ * It is going insane. Fix it by quickly scaling down the readahead size.
+ */
+static void shrink_readahead_size_eio(struct file *filp,
+					struct file_ra_state *ra)
+{
+	if (!ra->ra_pages)
+		return;
+
+	ra->ra_pages /= 4;
+	printk(KERN_WARNING "Reducing readahead size to %luK\n",
+			ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
+}
+
 /**
  * do_generic_mapping_read - generic file read routine
  * @mapping:	address_space to be read
@@ -985,6 +1011,7 @@ readpage:
 				}
 				unlock_page(page);
 				error = -EIO;
+				shrink_readahead_size_eio(filp, &ra);
 				goto readpage_error;
 			}
 			unlock_page(page);
@@ -1522,6 +1549,7 @@ page_not_uptodate:
 	 * Things didn't work out. Return zero to tell the
 	 * mm layer so, possibly freeing the page cache page first.
 	 */
+	shrink_readahead_size_eio(file, ra);
 	page_cache_release(page);
 	return NULL;
 }
-- 
cgit v1.2.3-70-g09d2


From d6e05edc59ecd79e8badf440c0d295a979bdfa3e Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@lisas.de>
Date: Mon, 26 Jun 2006 18:35:02 +0200
Subject: spelling fixes

acquired (aquired)
contiguous (contigious)
successful (succesful, succesfull)
surprise (suprise)
whether (weather)
some other misspellings

Signed-off-by: Andreas Mohr <andi@lisas.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 arch/i386/kernel/cpu/cyrix.c               |  2 +-
 arch/i386/kernel/i8259.c                   |  2 +-
 arch/mips/momentum/ocelot_g/gt-irq.c       |  4 ++--
 arch/powerpc/platforms/cell/spufs/switch.c |  2 +-
 arch/powerpc/platforms/pseries/eeh_cache.c |  2 +-
 arch/s390/kernel/vtime.c                   |  2 +-
 arch/um/drivers/ubd_kern.c                 |  2 +-
 arch/x86_64/kernel/i8259.c                 |  2 +-
 block/as-iosched.c                         |  2 +-
 block/ll_rw_blk.c                          |  4 ++--
 drivers/atm/firestream.c                   |  2 +-
 drivers/char/agp/amd-k7-agp.c              |  2 +-
 drivers/char/agp/ati-agp.c                 |  2 +-
 drivers/char/agp/efficeon-agp.c            |  2 +-
 drivers/char/rio/riointr.c                 |  2 +-
 drivers/hwmon/asb100.c                     |  2 +-
 drivers/hwmon/lm78.c                       |  2 +-
 drivers/hwmon/lm80.c                       |  2 +-
 drivers/hwmon/lm87.c                       |  2 +-
 drivers/hwmon/sis5595.c                    |  2 +-
 drivers/hwmon/smsc47m1.c                   |  2 +-
 drivers/hwmon/w83627hf.c                   |  2 +-
 drivers/hwmon/w83781d.c                    |  2 +-
 drivers/hwmon/w83792d.c                    |  2 +-
 drivers/ide/ide-disk.c                     |  2 +-
 drivers/ide/ide-io.c                       |  2 +-
 drivers/ieee1394/hosts.c                   |  2 +-
 drivers/ieee1394/ieee1394_core.h           |  2 +-
 drivers/isdn/divert/isdn_divert.c          |  2 +-
 drivers/media/video/bt8xx/bttv-cards.c     |  2 +-
 drivers/net/3c501.c                        |  4 ++--
 drivers/net/irda/irport.c                  |  2 +-
 drivers/net/pcmcia/smc91c92_cs.c           |  2 +-
 drivers/net/wireless/ipw2100.c             |  2 +-
 drivers/pci/pci.c                          |  2 +-
 drivers/s390/scsi/zfcp_erp.c               | 14 +++++++-------
 drivers/scsi/NCR5380.c                     |  2 +-
 drivers/scsi/advansys.c                    |  2 +-
 drivers/scsi/dc395x.c                      |  2 +-
 drivers/scsi/ibmmca.c                      |  8 ++++----
 drivers/scsi/ips.c                         |  4 ++--
 drivers/scsi/st.c                          |  2 +-
 fs/9p/mux.c                                |  2 +-
 fs/aio.c                                   |  2 +-
 fs/jffs2/summary.c                         |  2 +-
 fs/jfs/jfs_extent.c                        |  8 ++++----
 include/asm-i386/system.h                  |  2 +-
 include/asm-m32r/system.h                  |  2 +-
 include/linux/sunrpc/gss_api.h             |  2 +-
 lib/kernel_lock.c                          |  4 ++--
 mm/page_alloc.c                            |  2 +-
 mm/readahead.c                             |  4 ++--
 net/sunrpc/auth_gss/gss_krb5_mech.c        |  2 +-
 net/sunrpc/auth_gss/gss_spkm3_mech.c       |  2 +-
 sound/core/seq/seq_memory.h                |  2 +-
 sound/oss/sb_ess.c                         | 28 ++++++++++++++--------------
 sound/pci/cs5535audio/cs5535audio_pcm.c    |  2 +-
 57 files changed, 88 insertions(+), 88 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index fc32c8028e2..f03b7f94c30 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -354,7 +354,7 @@ static void __init init_nsc(struct cpuinfo_x86 *c)
 	 * This function only handles the GX processor, and kicks every
 	 * thing else to the Cyrix init function above - that should
 	 * cover any processors that might have been branded differently
-	 * after NSC aquired Cyrix.
+	 * after NSC acquired Cyrix.
 	 *
 	 * If this breaks your GX1 horribly, please e-mail
 	 * info-linux@ldcmail.amd.com to tell us.
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index b7636b96e10..c1a42feba28 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -175,7 +175,7 @@ static void mask_and_ack_8259A(unsigned int irq)
 	 * Lightweight spurious IRQ detection. We do not want
 	 * to overdo spurious IRQ handling - it's usually a sign
 	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecesserily.
+	 * do without slowing down good hardware unnecessarily.
 	 *
 	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
 	 * usually resulting from the 8259A-1|2 PICs) occur
diff --git a/arch/mips/momentum/ocelot_g/gt-irq.c b/arch/mips/momentum/ocelot_g/gt-irq.c
index e5eceed1bef..8bd9b844fa9 100644
--- a/arch/mips/momentum/ocelot_g/gt-irq.c
+++ b/arch/mips/momentum/ocelot_g/gt-irq.c
@@ -59,7 +59,7 @@ void hook_irq_handler(int int_cause, int bit_num, void *isr_ptr)
  * bit_num   - Indicates which bit number in the cause register
  *
  * Outputs :
- * 1 if succesful, 0 if failure
+ * 1 if successful, 0 if failure
  */
 int enable_galileo_irq(int int_cause, int bit_num)
 {
@@ -83,7 +83,7 @@ int enable_galileo_irq(int int_cause, int bit_num)
  * bit_num   - Indicates which bit number in the cause register
  *
  * Outputs :
- * 1 if succesful, 0 if failure
+ * 1 if successful, 0 if failure
  */
 int disable_galileo_irq(int int_cause, int bit_num)
 {
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index b30e55dab83..3068b429b03 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -2100,7 +2100,7 @@ EXPORT_SYMBOL_GPL(spu_save);
  * @spu: pointer to SPU iomem structure.
  *
  * Perform harvest + restore, as we may not be coming
- * from a previous succesful save operation, and the
+ * from a previous successful save operation, and the
  * hardware state is unknown.
  */
 int spu_restore(struct spu_state *new, struct spu *spu)
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
index 98c23aec85b..c37a8497c60 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c
@@ -287,7 +287,7 @@ void pci_addr_cache_remove_device(struct pci_dev *dev)
  * find the pci device that corresponds to a given address.
  * This routine scans all pci busses to build the cache.
  * Must be run late in boot process, after the pci controllers
- * have been scaned for devices (after all device resources are known).
+ * have been scanned for devices (after all device resources are known).
  */
 void __init pci_addr_cache_build(void)
 {
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index dfe6f085661..1f0439dc245 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -356,7 +356,7 @@ static void internal_add_vtimer(struct vtimer_list *timer)
 
 	set_vtimer(event->expires);
 	spin_unlock_irqrestore(&vt_list->lock, flags);
-	/* release CPU aquired in prepare_vtimer or mod_virt_timer() */
+	/* release CPU acquired in prepare_vtimer or mod_virt_timer() */
 	put_cpu();
 }
 
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 0897852b09a..290cec6d69e 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1222,7 +1222,7 @@ int open_ubd_file(char *file, struct openflags *openflags, int shared,
 		}
 	}
 
-	/* Succesful return case! */
+	/* Successful return case! */
 	if(backing_file_out == NULL)
 		return(fd);
 
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 5ecd34ab8c2..573990a6d66 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -278,7 +278,7 @@ static void mask_and_ack_8259A(unsigned int irq)
 	 * Lightweight spurious IRQ detection. We do not want
 	 * to overdo spurious IRQ handling - it's usually a sign
 	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecesserily.
+	 * do without slowing down good hardware unnecessarily.
 	 *
 	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
 	 * usually resulting from the 8259A-1|2 PICs) occur
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 1ec5df46670..3af31ed49a9 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -892,7 +892,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
 }
 
 /*
- * as_can_anticipate indicates weather we should either run arq
+ * as_can_anticipate indicates whether we should either run arq
  * or keep anticipating a better request.
  */
 static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 0603ab2f369..c04422a502d 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2745,7 +2745,7 @@ static int attempt_merge(request_queue_t *q, struct request *req,
 		return 0;
 
 	/*
-	 * not contigious
+	 * not contiguous
 	 */
 	if (req->sector + req->nr_sectors != next->sector)
 		return 0;
@@ -3415,7 +3415,7 @@ static struct notifier_block blk_cpu_notifier = {
  *
  * Description:
  *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completionc callback
+ *     unless the driver actually implements this in its completion callback
  *     through requeueing. Theh actual completion happens out-of-order,
  *     through a softirq handler. The user must have registered a completion
  *     callback through blk_queue_softirq_done().
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 7f7ec288824..f2eeaf9dc56 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -951,7 +951,7 @@ static int fs_open(struct atm_vcc *atm_vcc)
 		   it most likely that the chip will notice it. It also prevents us
 		   from having to wait for completion. On the other hand, we may
 		   need to wait for completion anyway, to see if it completed
-		   succesfully. */
+		   successfully. */
 
 		switch (atm_vcc->qos.aal) {
 		case ATM_AAL2:
diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c
index 1f776651ac6..51d0d562d01 100644
--- a/drivers/char/agp/amd-k7-agp.c
+++ b/drivers/char/agp/amd-k7-agp.c
@@ -118,7 +118,7 @@ static int amd_create_gatt_pages(int nr_tables)
 	return retval;
 }
 
-/* Since we don't need contigious memory we just try
+/* Since we don't need contiguous memory we just try
  * to get the gatt table once
  */
 
diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c
index 06fd10ba0c5..16056434599 100644
--- a/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@ -261,7 +261,7 @@ static int agp_ati_suspend(struct pci_dev *dev, pm_message_t state)
 #endif
 
 /*
- *Since we don't need contigious memory we just try
+ *Since we don't need contiguous memory we just try
  * to get the gatt table once
  */
 
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c
index 86a966b6523..b788b0a3bbf 100644
--- a/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@ -177,7 +177,7 @@ static int efficeon_free_gatt_table(struct agp_bridge_data *bridge)
 
 
 /*
- * Since we don't need contigious memory we just try
+ * Since we don't need contiguous memory we just try
  * to get the gatt table once
  */
 
diff --git a/drivers/char/rio/riointr.c b/drivers/char/rio/riointr.c
index eec1fea0cb9..0bd09040a5c 100644
--- a/drivers/char/rio/riointr.c
+++ b/drivers/char/rio/riointr.c
@@ -546,7 +546,7 @@ static void RIOReceive(struct rio_info *p, struct Port *PortP)
 	 ** run out of space it will be set to the offset of the
 	 ** next byte to copy from the packet data area. The packet
 	 ** length field is decremented by the number of bytes that
-	 ** we succesfully removed from the packet. When this reaches
+	 ** we successfully removed from the packet. When this reaches
 	 ** zero, we reset the offset pointer to be zero, and free
 	 ** the packet from the front of the queue.
 	 */
diff --git a/drivers/hwmon/asb100.c b/drivers/hwmon/asb100.c
index 65b2709f750..facc1ccb833 100644
--- a/drivers/hwmon/asb100.c
+++ b/drivers/hwmon/asb100.c
@@ -341,7 +341,7 @@ static ssize_t set_fan_min(struct device *dev, const char *buf,
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 				size_t count, int nr)
diff --git a/drivers/hwmon/lm78.c b/drivers/hwmon/lm78.c
index 94be3d797e6..a6ce7abf860 100644
--- a/drivers/hwmon/lm78.c
+++ b/drivers/hwmon/lm78.c
@@ -358,7 +358,7 @@ static ssize_t show_fan_div(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 	size_t count, int nr)
diff --git a/drivers/hwmon/lm80.c b/drivers/hwmon/lm80.c
index f72120d08c4..b4ccdfc0120 100644
--- a/drivers/hwmon/lm80.c
+++ b/drivers/hwmon/lm80.c
@@ -253,7 +253,7 @@ set_fan(min2, fan_min[1], LM80_REG_FAN_MIN(2), fan_div[1]);
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 	size_t count, int nr)
diff --git a/drivers/hwmon/lm87.c b/drivers/hwmon/lm87.c
index e229daf666d..e6c1b638c97 100644
--- a/drivers/hwmon/lm87.c
+++ b/drivers/hwmon/lm87.c
@@ -421,7 +421,7 @@ static void set_fan_min(struct device *dev, const char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan clock divider.  This follows the principle
-   of least suprise; the user doesn't expect the fan minimum to change just
+   of least surprise; the user doesn't expect the fan minimum to change just
    because the divider changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 		size_t count, int nr)
diff --git a/drivers/hwmon/sis5595.c b/drivers/hwmon/sis5595.c
index 6f3fda73f70..063f71c5f07 100644
--- a/drivers/hwmon/sis5595.c
+++ b/drivers/hwmon/sis5595.c
@@ -380,7 +380,7 @@ static ssize_t show_fan_div(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 	size_t count, int nr)
diff --git a/drivers/hwmon/smsc47m1.c b/drivers/hwmon/smsc47m1.c
index 7732aec5459..825e8f72698 100644
--- a/drivers/hwmon/smsc47m1.c
+++ b/drivers/hwmon/smsc47m1.c
@@ -207,7 +207,7 @@ static ssize_t set_fan_min(struct device *dev, const char *buf,
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan clock divider.  This follows the principle
-   of least suprise; the user doesn't expect the fan minimum to change just
+   of least surprise; the user doesn't expect the fan minimum to change just
    because the divider changed. */
 static ssize_t set_fan_div(struct device *dev, const char *buf,
 		size_t count, int nr)
diff --git a/drivers/hwmon/w83627hf.c b/drivers/hwmon/w83627hf.c
index 71fb7f1af8f..79368d53c36 100644
--- a/drivers/hwmon/w83627hf.c
+++ b/drivers/hwmon/w83627hf.c
@@ -781,7 +781,7 @@ show_fan_div_reg(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t
 store_fan_div_reg(struct device *dev, const char *buf, size_t count, int nr)
diff --git a/drivers/hwmon/w83781d.c b/drivers/hwmon/w83781d.c
index e4c700356c4..7be469ed0f8 100644
--- a/drivers/hwmon/w83781d.c
+++ b/drivers/hwmon/w83781d.c
@@ -630,7 +630,7 @@ show_fan_div_reg(struct device *dev, char *buf, int nr)
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t
 store_fan_div_reg(struct device *dev, const char *buf, size_t count, int nr)
diff --git a/drivers/hwmon/w83792d.c b/drivers/hwmon/w83792d.c
index 4ef884c216e..e407c74bda3 100644
--- a/drivers/hwmon/w83792d.c
+++ b/drivers/hwmon/w83792d.c
@@ -463,7 +463,7 @@ show_fan_div(struct device *dev, struct device_attribute *attr,
 
 /* Note: we save and restore the fan minimum here, because its value is
    determined in part by the fan divisor.  This follows the principle of
-   least suprise; the user doesn't expect the fan minimum to change just
+   least surprise; the user doesn't expect the fan minimum to change just
    because the divisor changed. */
 static ssize_t
 store_fan_div(struct device *dev, struct device_attribute *attr,
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index a5017de72da..f033d732f38 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -37,7 +37,7 @@
  * Version 1.15		convert all calls to ide_raw_taskfile
  *				since args will return register content.
  * Version 1.16		added suspend-resume-checkpower
- * Version 1.17		do flush on standy, do flush on ATA < ATA6
+ * Version 1.17		do flush on standby, do flush on ATA < ATA6
  *			fix wcache setup.
  */
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 622a55c72f0..05ba8e01e63 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -1665,7 +1665,7 @@ irqreturn_t ide_intr (int irq, void *dev_id, struct pt_regs *regs)
  *	Initialize a request before we fill it in and send it down to
  *	ide_do_drive_cmd. Commands must be set up by this function. Right
  *	now it doesn't do a lot, but if that changes abusers will have a
- *	nasty suprise.
+ *	nasty surprise.
  */
 
 void ide_init_drive_cmd (struct request *rq)
diff --git a/drivers/ieee1394/hosts.c b/drivers/ieee1394/hosts.c
index 2d47b11777a..ad49c040b67 100644
--- a/drivers/ieee1394/hosts.c
+++ b/drivers/ieee1394/hosts.c
@@ -103,7 +103,7 @@ static int alloc_hostnum_cb(struct hpsb_host *host, void *__data)
  * driver specific parts, enable the controller and make it available
  * to the general subsystem using hpsb_add_host().
  *
- * Return Value: a pointer to the &hpsb_host if succesful, %NULL if
+ * Return Value: a pointer to the &hpsb_host if successful, %NULL if
  * no memory was available.
  */
 static DEFINE_MUTEX(host_num_alloc);
diff --git a/drivers/ieee1394/ieee1394_core.h b/drivers/ieee1394/ieee1394_core.h
index e7b55e895f5..0ecbf335c64 100644
--- a/drivers/ieee1394/ieee1394_core.h
+++ b/drivers/ieee1394/ieee1394_core.h
@@ -139,7 +139,7 @@ int hpsb_bus_reset(struct hpsb_host *host);
 
 /*
  * Hand over received selfid packet to the core.  Complement check (second
- * quadlet is complement of first) is expected to be done and succesful.
+ * quadlet is complement of first) is expected to be done and successful.
  */
 void hpsb_selfid_received(struct hpsb_host *host, quadlet_t sid);
 
diff --git a/drivers/isdn/divert/isdn_divert.c b/drivers/isdn/divert/isdn_divert.c
index f1a1f9a9b88..1f5ebe9ee72 100644
--- a/drivers/isdn/divert/isdn_divert.c
+++ b/drivers/isdn/divert/isdn_divert.c
@@ -592,7 +592,7 @@ static int put_address(char *st, u_char *p, int len)
 } /* put_address */
 
 /*************************************/
-/* report a succesfull interrogation */
+/* report a successful interrogation */
 /*************************************/
 static int interrogate_success(isdn_ctrl *ic, struct call_struc *cs)
 { char *src = ic->parm.dss1_io.data;
diff --git a/drivers/media/video/bt8xx/bttv-cards.c b/drivers/media/video/bt8xx/bttv-cards.c
index 3116345c93b..e68a6d2fff2 100644
--- a/drivers/media/video/bt8xx/bttv-cards.c
+++ b/drivers/media/video/bt8xx/bttv-cards.c
@@ -4848,7 +4848,7 @@ static void picolo_tetra_muxsel (struct bttv* btv, unsigned int input)
  *
  * The IVC120G security card has 4 i2c controlled TDA8540 matrix
  * swichers to provide 16 channels to MUX0. The TDA8540's have
- * 4 indepedant outputs and as such the IVC120G also has the
+ * 4 independent outputs and as such the IVC120G also has the
  * optional "Monitor Out" bus. This allows the card to be looking
  * at one input while the monitor is looking at another.
  *
diff --git a/drivers/net/3c501.c b/drivers/net/3c501.c
index bb44509fd40..07136ec423b 100644
--- a/drivers/net/3c501.c
+++ b/drivers/net/3c501.c
@@ -508,11 +508,11 @@ static int el_start_xmit(struct sk_buff *skb, struct net_device *dev)
  * speak of. We simply pull the packet out of its PIO buffer (which is slow)
  * and queue it for the kernel. Then we reset the card for the next packet.
  *
- * We sometimes get suprise interrupts late both because the SMP IRQ delivery
+ * We sometimes get surprise interrupts late both because the SMP IRQ delivery
  * is message passing and because the card sometimes seems to deliver late. I
  * think if it is part way through a receive and the mode is changed it carries
  * on receiving and sends us an interrupt. We have to band aid all these cases
- * to get a sensible 150kbytes/second performance. Even then you want a small
+ * to get a sensible 150kBytes/second performance. Even then you want a small
  * TCP window.
  */
 
diff --git a/drivers/net/irda/irport.c b/drivers/net/irda/irport.c
index 98fa5319e5c..44efd49bf4a 100644
--- a/drivers/net/irda/irport.c
+++ b/drivers/net/irda/irport.c
@@ -386,7 +386,7 @@ static int __irport_change_speed(struct irda_task *task)
 	/* Locking notes : this function may be called from irq context with
 	 * spinlock, via irport_write_wakeup(), or from non-interrupt without
 	 * spinlock (from the task timer). Yuck !
-	 * This is ugly, and unsafe is the spinlock is not already aquired.
+	 * This is ugly, and unsafe is the spinlock is not already acquired.
 	 * This will be fixed when irda-task get rewritten.
 	 * Jean II */
 	if (!spin_is_locked(&self->lock)) {
diff --git a/drivers/net/pcmcia/smc91c92_cs.c b/drivers/net/pcmcia/smc91c92_cs.c
index e74bf5014ef..a73d5455303 100644
--- a/drivers/net/pcmcia/smc91c92_cs.c
+++ b/drivers/net/pcmcia/smc91c92_cs.c
@@ -1883,7 +1883,7 @@ static void smc_reset(struct net_device *dev)
     /* Set the Window 1 control, configuration and station addr registers.
        No point in writing the I/O base register ;-> */
     SMC_SELECT_BANK(1);
-    /* Automatically release succesfully transmitted packets,
+    /* Automatically release successfully transmitted packets,
        Accept link errors, counter and Tx error interrupts. */
     outw(CTL_AUTO_RELEASE | CTL_TE_ENABLE | CTL_CR_ENABLE,
 	 ioaddr + CONTROL);
diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c
index 72335c8eb97..94aeb23a772 100644
--- a/drivers/net/wireless/ipw2100.c
+++ b/drivers/net/wireless/ipw2100.c
@@ -1485,7 +1485,7 @@ static int ipw2100_hw_stop_adapter(struct ipw2100_priv *priv)
 		 *
 		 * Sending the PREPARE_FOR_POWER_DOWN will restrict the
 		 * hardware from going into standby mode and will transition
-		 * out of D0-standy if it is already in that state.
+		 * out of D0-standby if it is already in that state.
 		 *
 		 * STATUS_PREPARE_POWER_DOWN_COMPLETE will be sent by the
 		 * driver upon completion.  Once received, the driver can
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d408a3c3042..23d3b17c8ca 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -369,7 +369,7 @@ pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 
 	/*
 	 * Give firmware a chance to be called, such as ACPI _PRx, _PSx
-	 * Firmware method after natice method ?
+	 * Firmware method after native method ?
 	 */
 	if (platform_pci_set_power_state)
 		platform_pci_set_power_state(dev, state);
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 4682c8b8bd2..909731b99d2 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -167,7 +167,7 @@ zfcp_fsf_scsi_er_timeout_handler(unsigned long data)
  *		initiates adapter recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
@@ -203,7 +203,7 @@ zfcp_erp_adapter_reopen_internal(struct zfcp_adapter *adapter, int clear_mask)
  * purpose:	Wrappper for zfcp_erp_adapter_reopen_internal
  *              used to ensure the correct locking
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
@@ -469,7 +469,7 @@ zfcp_test_link(struct zfcp_port *port)
  *		initiates Forced Reopen recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 static int
@@ -509,7 +509,7 @@ zfcp_erp_port_forced_reopen_internal(struct zfcp_port *port, int clear_mask)
  * purpose:	Wrappper for zfcp_erp_port_forced_reopen_internal
  *              used to ensure the correct locking
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
@@ -536,7 +536,7 @@ zfcp_erp_port_forced_reopen(struct zfcp_port *port, int clear_mask)
  *		initiates Reopen recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 static int
@@ -605,7 +605,7 @@ zfcp_erp_port_reopen(struct zfcp_port *port, int clear_mask)
  *		initiates Reopen recovery which is done
  *		asynchronously
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 static int
@@ -1805,7 +1805,7 @@ zfcp_erp_modify_unit_status(struct zfcp_unit *unit, u32 mask, int set_or_clear)
  * purpose:	Wrappper for zfcp_erp_port_reopen_all_internal
  *              used to ensure the correct locking
  *
- * returns:	0	- initiated action succesfully
+ * returns:	0	- initiated action successfully
  *		<0	- failed to initiate action
  */
 int
diff --git a/drivers/scsi/NCR5380.c b/drivers/scsi/NCR5380.c
index fa57e0b4a5f..75f2f7ae2a8 100644
--- a/drivers/scsi/NCR5380.c
+++ b/drivers/scsi/NCR5380.c
@@ -500,7 +500,7 @@ static void NCR5380_print_phase(struct Scsi_Host *instance)
 /* 
  * Function : int should_disconnect (unsigned char cmd)
  *
- * Purpose : decide weather a command would normally disconnect or 
+ * Purpose : decide whether a command would normally disconnect or 
  *      not, since if it won't disconnect we should go to sleep.
  *
  * Input : cmd - opcode of SCSI command
diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c
index 5ee47555a8a..dd9fb3d9100 100644
--- a/drivers/scsi/advansys.c
+++ b/drivers/scsi/advansys.c
@@ -12374,7 +12374,7 @@ AscInitFromEEP(ASC_DVC_VAR *asc_dvc)
                 ASC_PRINT1(
 "AscInitFromEEP: Failed to re-write EEPROM with %d errors.\n", i);
         } else {
-                ASC_PRINT("AscInitFromEEP: Succesfully re-wrote EEPROM.");
+                ASC_PRINT("AscInitFromEEP: Successfully re-wrote EEPROM.\n");
         }
     }
     return (warn_code);
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index 18324525493..58b0748045e 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -3771,7 +3771,7 @@ static void request_sense(struct AdapterCtlBlk *acb, struct DeviceCtlBlk *dcb,
  * @target: The target for the new device.
  * @lun: The lun for the new device.
  *
- * Return the new device if succesfull or NULL on failure.
+ * Return the new device if successful or NULL on failure.
  **/
 static struct DeviceCtlBlk *device_alloc(struct AdapterCtlBlk *acb,
 		u8 target, u8 lun)
diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c
index 115f55471ed..497f6642b2d 100644
--- a/drivers/scsi/ibmmca.c
+++ b/drivers/scsi/ibmmca.c
@@ -760,7 +760,7 @@ static int device_inquiry(int host_index, int ldn)
 		while (!got_interrupt(host_index))
 			barrier();
 
-		/*if command succesful, break */
+		/*if command successful, break */
 		if ((stat_result(host_index) == IM_SCB_CMD_COMPLETED) || (stat_result(host_index) == IM_SCB_CMD_COMPLETED_WITH_RETRIES))
 			return 1;
 	}
@@ -885,7 +885,7 @@ static int immediate_assign(int host_index, unsigned int pun, unsigned int lun,
 		while (!got_interrupt(host_index))
 			barrier();
 
-		/*if command succesful, break */
+		/*if command successful, break */
 		if (stat_result(host_index) == IM_IMMEDIATE_CMD_COMPLETED)
 			return 1;
 	}
@@ -921,7 +921,7 @@ static int immediate_feature(int host_index, unsigned int speed, unsigned int ti
 			return 2;
 		} else
 			global_command_error_excuse = 0;
-		/*if command succesful, break */
+		/*if command successful, break */
 		if (stat_result(host_index) == IM_IMMEDIATE_CMD_COMPLETED)
 			return 1;
 	}
@@ -959,7 +959,7 @@ static int immediate_reset(int host_index, unsigned int ldn)
 			/* did not work, finish */
 			return 1;
 		}
-		/*if command succesful, break */
+		/*if command successful, break */
 		if (stat_result(host_index) == IM_IMMEDIATE_CMD_COMPLETED)
 			return 1;
 	}
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 5353b28b293..78f2ff736c3 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -6438,7 +6438,7 @@ ips_erase_bios(ips_ha_t * ha)
 		/* VPP failure */
 		return (1);
 
-	/* check for succesful flash */
+	/* check for successful flash */
 	if (status & 0x30)
 		/* sequence error */
 		return (1);
@@ -6550,7 +6550,7 @@ ips_erase_bios_memio(ips_ha_t * ha)
 		/* VPP failure */
 		return (1);
 
-	/* check for succesful flash */
+	/* check for successful flash */
 	if (status & 0x30)
 		/* sequence error */
 		return (1);
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 1272dd249af..b5218fc0ac8 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -2818,7 +2818,7 @@ static int st_int_ioctl(struct scsi_tape *STp, unsigned int cmd_in, unsigned lon
 		    (cmdstatp->sense_hdr.sense_key == NO_SENSE ||
 		     cmdstatp->sense_hdr.sense_key == RECOVERED_ERROR) &&
 		    undone == 0) {
-			ioctl_result = 0;	/* EOF written succesfully at EOM */
+			ioctl_result = 0;	/* EOF written successfully at EOM */
 			if (fileno >= 0)
 				fileno++;
 			STps->drv_file = fileno;
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index f4407eb276c..12e1baa4508 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -712,7 +712,7 @@ static void v9fs_read_work(void *a)
  * v9fs_send_request - send 9P request
  * The function can sleep until the request is scheduled for sending.
  * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent succesfully. Can return errors
+ * a guarantee that the request is sent successfully. Can return errors
  * that can be retrieved by PTR_ERR macros.
  *
  * @m: mux data
diff --git a/fs/aio.c b/fs/aio.c
index 8c34a62df7d..950630187ac 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb)
  *	invoked both for initial i/o submission and
  *	subsequent retries via the aio_kick_handler.
  *	Expects to be invoked with iocb->ki_ctx->lock
- *	already held. The lock is released and reaquired
+ *	already held. The lock is released and reacquired
  *	as needed during processing.
  *
  * Calls the iocb retry method (already setup for the
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d..be1acc3dad9 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
 		return -ENOMEM;
 	}
 
-	dbg_summary("returned succesfully\n");
+	dbg_summary("returned successfully\n");
 
 	return 0;
 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5549378358b..4d52593a5fc 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
 
 	/* allocate the disk blocks for the extent.  initially, extBalloc()
 	 * will try to allocate disk blocks for the requested size (xlen). 
-	 * if this fails (xlen contigious free blocks not avaliable), it'll
+	 * if this fails (xlen contiguous free blocks not avaliable), it'll
 	 * try to allocate a smaller number of blocks (producing a smaller
 	 * extent), with this smaller number of blocks consisting of the
 	 * requested number of blocks rounded down to the next smaller
@@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp)
  *
  *		initially, we will try to allocate disk blocks for the
  *		requested size (nblocks).  if this fails (nblocks 
- *		contigious free blocks not avaliable), we'll try to allocate
+ *		contiguous free blocks not avaliable), we'll try to allocate
  *		a smaller number of blocks (producing a smaller extent), with
  *		this smaller number of blocks consisting of the requested
  *		number of blocks rounded down to the next smaller power of 2
@@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 
 	/* get the number of blocks to initially attempt to allocate.
 	 * we'll first try the number of blocks requested unless this
-	 * number is greater than the maximum number of contigious free
+	 * number is greater than the maximum number of contiguous free
 	 * blocks in the map. in that case, we'll start off with the 
 	 * maximum free.
 	 */
@@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
  *		in place.  if this fails, we'll try to move the extent
  *		to a new set of blocks. if moving the extent, we initially
  *		will try to allocate disk blocks for the requested size
- *		(nnew).  if this fails 	(nnew contigious free blocks not
+ *		(nnew).  if this fails 	(new contiguous free blocks not
  *		avaliable), we'll try  to allocate a smaller number of
  *		blocks (producing a smaller extent), with this smaller
  *		number of blocks consisting of the requested number of
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 0249f912a29..cab0180567f 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -427,7 +427,7 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long l
  * does not enforce ordering, since there is no data dependency between
  * the read of "a" and the read of "b".  Therefore, on some CPUs, such
  * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like thiswhere there are no data dependencies.
+ * in cases like this where there are no data dependencies.
  **/
 
 #define read_barrier_depends()	do { } while(0)
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 33567e8bfe6..66c4742f09e 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -318,7 +318,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
  * does not enforce ordering, since there is no data dependency between
  * the read of "a" and the read of "b".  Therefore, on some CPUs, such
  * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like thiswhere there are no data dependencies.
+ * in cases like this where there are no data dependencies.
  **/
 
 #define read_barrier_depends()	do { } while (0)
diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h
index 9b8bcf125c1..6e112cc5cdd 100644
--- a/include/linux/sunrpc/gss_api.h
+++ b/include/linux/sunrpc/gss_api.h
@@ -126,7 +126,7 @@ struct gss_api_mech *gss_mech_get_by_pseudoflavor(u32);
 /* Just increments the mechanism's reference count and returns its input: */
 struct gss_api_mech * gss_mech_get(struct gss_api_mech *);
 
-/* For every succesful gss_mech_get or gss_mech_get_by_* call there must be a
+/* For every successful gss_mech_get or gss_mech_get_by_* call there must be a
  * corresponding call to gss_mech_put. */
 void gss_mech_put(struct gss_api_mech *);
 
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index cb5490ec00f..e713e86811a 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -14,7 +14,7 @@
  * The 'big kernel semaphore'
  *
  * This mutex is taken and released recursively by lock_kernel()
- * and unlock_kernel().  It is transparently dropped and reaquired
+ * and unlock_kernel().  It is transparently dropped and reacquired
  * over schedule().  It is used to protect legacy code that hasn't
  * been migrated to a proper locking design yet.
  *
@@ -92,7 +92,7 @@ void __lockfunc unlock_kernel(void)
  * The 'big kernel lock'
  *
  * This spinlock is taken and released recursively by lock_kernel()
- * and unlock_kernel().  It is transparently dropped and reaquired
+ * and unlock_kernel().  It is transparently dropped and reacquired
  * over schedule().  It is used to protect legacy code that hasn't
  * been migrated to a proper locking design yet.
  *
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6c1174fcf52..9f86191bb63 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -266,7 +266,7 @@ static inline void rmv_page_order(struct page *page)
  * satisfies the following equation:
  *     P = B & ~(1 << O)
  *
- * Assumption: *_mem_map is contigious at least up to MAX_ORDER
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
 static inline struct page *
 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
diff --git a/mm/readahead.c b/mm/readahead.c
index e39e416860d..aa7ec424656 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -390,8 +390,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
  * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
  * is set wait till the read completes.  Otherwise attempt to read without
  * blocking.
- * Returns 1 meaning 'success' if read is succesfull without switching off
- * readhaead mode. Otherwise return failure.
+ * Returns 1 meaning 'success' if read is successful without switching off
+ * readahead mode. Otherwise return failure.
  */
 static int
 blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 129e2bd36af..b8714a87b34 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -169,7 +169,7 @@ gss_import_sec_context_kerberos(const void *p,
 	}
 
 	ctx_id->internal_ctx_id = ctx;
-	dprintk("RPC:      Succesfully imported new context.\n");
+	dprintk("RPC:      Successfully imported new context.\n");
 	return 0;
 
 out_err_free_key2:
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index 5bf11ccba7c..3d0432aa45c 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -201,7 +201,7 @@ gss_import_sec_context_spkm3(const void *p, size_t len,
 
 	ctx_id->internal_ctx_id = ctx;
 
-	dprintk("Succesfully imported new spkm context.\n");
+	dprintk("Successfully imported new spkm context.\n");
 	return 0;
 
 out_err_free_key2:
diff --git a/sound/core/seq/seq_memory.h b/sound/core/seq/seq_memory.h
index 39c60d9e1ef..63e91431a29 100644
--- a/sound/core/seq/seq_memory.h
+++ b/sound/core/seq/seq_memory.h
@@ -31,7 +31,7 @@ struct snd_seq_event_cell {
 	struct snd_seq_event_cell *next;	/* next cell */
 };
 
-/* design note: the pool is a contigious block of memory, if we dynamicly
+/* design note: the pool is a contiguous block of memory, if we dynamicly
    want to add additional cells to the pool be better store this in another
    pool as we need to know the base address of the pool when releasing
    memory. */
diff --git a/sound/oss/sb_ess.c b/sound/oss/sb_ess.c
index fae05fe3de4..180e95c87e3 100644
--- a/sound/oss/sb_ess.c
+++ b/sound/oss/sb_ess.c
@@ -97,19 +97,19 @@
  *
  * The documentation is an adventure: it's close but not fully accurate. I
  * found out that after a reset some registers are *NOT* reset, though the
- * docs say the would be. Interresting ones are 0x7f, 0x7d and 0x7a. They are
- * related to the Audio 2 channel. I also was suprised about the consequenses
+ * docs say the would be. Interesting ones are 0x7f, 0x7d and 0x7a. They are
+ * related to the Audio 2 channel. I also was surprised about the consequences
  * of writing 0x00 to 0x7f (which should be done by reset): The ES1887 moves
  * into ES1888 mode. This means that it claims IRQ 11, which happens to be my
  * ISDN adapter. Needless to say it no longer worked. I now understand why
  * after rebooting 0x7f already was 0x05, the value of my choice: the BIOS
  * did it.
  *
- * Oh, and this is another trap: in ES1887 docs mixer register 0x70 is decribed
- * as if it's exactly the same as register 0xa1. This is *NOT* true. The
- * description of 0x70 in ES1869 docs is accurate however.
+ * Oh, and this is another trap: in ES1887 docs mixer register 0x70 is
+ * described as if it's exactly the same as register 0xa1. This is *NOT* true.
+ * The description of 0x70 in ES1869 docs is accurate however.
  * Well, the assumption about ES1869 was wrong: register 0x70 is very much
- * like register 0xa1, except that bit 7 is allways 1, whatever you want
+ * like register 0xa1, except that bit 7 is always 1, whatever you want
  * it to be.
  *
  * When using audio 2 mixer register 0x72 seems te be meaningless. Only 0xa2
@@ -117,10 +117,10 @@
  *
  * Software reset not being able to reset all registers is great! Especially
  * the fact that register 0x78 isn't reset is great when you wanna change back
- * to single dma operation (simplex): audio 2 is still operation, and uses the
- * same dma as audio 1: your ess changes into a funny echo machine.
+ * to single dma operation (simplex): audio 2 is still operational, and uses
+ * the same dma as audio 1: your ess changes into a funny echo machine.
  *
- * Received the new that ES1688 is detected as a ES1788. Did some thinking:
+ * Received the news that ES1688 is detected as a ES1788. Did some thinking:
  * the ES1887 detection scheme suggests in step 2 to try if bit 3 of register
  * 0x64 can be changed. This is inaccurate, first I inverted the * check: "If
  * can be modified, it's a 1688", which lead to a correct detection
@@ -135,7 +135,7 @@
  * About recognition of ESS chips
  *
  * The distinction of ES688, ES1688, ES1788, ES1887 and ES1888 is described in
- * a (preliminary ??) datasheet on ES1887. It's aim is to identify ES1887, but
+ * a (preliminary ??) datasheet on ES1887. Its aim is to identify ES1887, but
  * during detection the text claims that "this chip may be ..." when a step
  * fails. This scheme is used to distinct between the above chips.
  * It appears however that some PnP chips like ES1868 are recognized as ES1788
@@ -156,9 +156,9 @@
  *
  * The existing ES1688 support didn't take care of the ES1688+ recording
  * levels very well. Whenever a device was selected (recmask) for recording
- * it's recording level was loud, and it couldn't be changed. The fact that
+ * its recording level was loud, and it couldn't be changed. The fact that
  * internal register 0xb4 could take care of RECLEV, didn't work meaning until
- * it's value was restored every time the chip was reset; this reset the
+ * its value was restored every time the chip was reset; this reset the
  * value of 0xb4 too. I guess that's what 4front also had (have?) trouble with.
  *
  * About ES1887 support:
@@ -169,9 +169,9 @@
  * the latter case the recording volumes are 0.
  * Now recording levels of inputs can be controlled, by changing the playback
  * levels. Futhermore several devices can be recorded together (which is not
- * possible with the ES1688.
+ * possible with the ES1688).
  * Besides the separate recording level control for each input, the common
- * recordig level can also be controlled by RECLEV as described above.
+ * recording level can also be controlled by RECLEV as described above.
  *
  * Not only ES1887 have this recording mixer. I know the following from the
  * documentation:
diff --git a/sound/pci/cs5535audio/cs5535audio_pcm.c b/sound/pci/cs5535audio/cs5535audio_pcm.c
index f0a48693d68..5450a9e8f13 100644
--- a/sound/pci/cs5535audio/cs5535audio_pcm.c
+++ b/sound/pci/cs5535audio/cs5535audio_pcm.c
@@ -143,7 +143,7 @@ static int cs5535audio_build_dma_packets(struct cs5535audio *cs5535au,
 	if (dma->periods == periods && dma->period_bytes == period_bytes)
 		return 0;
 
-	/* the u32 cast is okay because in snd*create we succesfully told
+	/* the u32 cast is okay because in snd*create we successfully told
    	   pci alloc that we're only 32 bit capable so the uppper will be 0 */
 	addr = (u32) substream->runtime->dma_addr;
 	desc_addr = (u32) dma->desc_buf.addr;
-- 
cgit v1.2.3-70-g09d2


From 1bfba4e8ea0e555e3a0296051517d96253660ccc Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Mon, 26 Jun 2006 00:24:40 -0700
Subject: [PATCH] core: use list_move()

This patch converts the combination of list_del(A) and list_add(A, B) to
list_move(A, B).

Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/power/resume.c  |  6 ++----
 drivers/base/power/suspend.c | 13 +++++--------
 fs/dcache.c                  |  3 +--
 fs/libfs.c                   | 10 ++++------
 fs/namespace.c               |  6 ++----
 fs/pnode.c                   |  9 +++------
 fs/sysfs/dir.c               | 10 ++++------
 mm/swap.c                    |  3 +--
 8 files changed, 22 insertions(+), 38 deletions(-)

(limited to 'mm')

diff --git a/drivers/base/power/resume.c b/drivers/base/power/resume.c
index 520679ce53a..826093ef4c7 100644
--- a/drivers/base/power/resume.c
+++ b/drivers/base/power/resume.c
@@ -53,8 +53,7 @@ void dpm_resume(void)
 		struct device * dev = to_device(entry);
 
 		get_device(dev);
-		list_del_init(entry);
-		list_add_tail(entry, &dpm_active);
+		list_move_tail(entry, &dpm_active);
 
 		up(&dpm_list_sem);
 		if (!dev->power.prev_state.event)
@@ -101,8 +100,7 @@ void dpm_power_up(void)
 		struct device * dev = to_device(entry);
 
 		get_device(dev);
-		list_del_init(entry);
-		list_add_tail(entry, &dpm_active);
+		list_move_tail(entry, &dpm_active);
 		resume_device(dev);
 		put_device(dev);
 	}
diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index 1a1fe43a305..69509e02f70 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -116,12 +116,10 @@ int device_suspend(pm_message_t state)
 		/* Check if the device got removed */
 		if (!list_empty(&dev->power.entry)) {
 			/* Move it to the dpm_off or dpm_off_irq list */
-			if (!error) {
-				list_del(&dev->power.entry);
-				list_add(&dev->power.entry, &dpm_off);
-			} else if (error == -EAGAIN) {
-				list_del(&dev->power.entry);
-				list_add(&dev->power.entry, &dpm_off_irq);
+			if (!error)
+				list_move(&dev->power.entry, &dpm_off);
+			else if (error == -EAGAIN) {
+				list_move(&dev->power.entry, &dpm_off_irq);
 				error = 0;
 			}
 		}
@@ -139,8 +137,7 @@ int device_suspend(pm_message_t state)
 		 */
 		while (!list_empty(&dpm_off_irq)) {
 			struct list_head * entry = dpm_off_irq.next;
-			list_del(entry);
-			list_add(entry, &dpm_off);
+			list_move(entry, &dpm_off);
 		}
 		dpm_resume();
 	}
diff --git a/fs/dcache.c b/fs/dcache.c
index 6aa635fbfa0..48b44a714b3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb)
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-		list_del(tmp);
-		list_add(tmp, &dentry_unused);
+		list_move(tmp, &dentry_unused);
 	}
 
 	/*
diff --git a/fs/libfs.c b/fs/libfs.c
index fc785d8befb..ac02ea602c3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			/* fallthrough */
 		default:
 			spin_lock(&dcache_lock);
-			if (filp->f_pos == 2) {
-				list_del(q);
-				list_add(q, &dentry->d_subdirs);
-			}
+			if (filp->f_pos == 2)
+				list_move(q, &dentry->d_subdirs);
+
 			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_u.d_child);
@@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 					return 0;
 				spin_lock(&dcache_lock);
 				/* next is still alive */
-				list_del(q);
-				list_add(q, p);
+				list_move(q, p);
 				p = q;
 				filp->f_pos++;
 			}
diff --git a/fs/namespace.c b/fs/namespace.c
index 866430bb024..b3ed212ea41 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
 	struct vfsmount *p;
 
-	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		list_del(&p->mnt_hash);
-		list_add(&p->mnt_hash, kill);
-	}
+	for (p = mnt; p; p = next_mnt(p, mnt))
+		list_move(&p->mnt_hash, kill);
 
 	if (propagate)
 		propagate_umount(kill);
diff --git a/fs/pnode.c b/fs/pnode.c
index 37b568ed0e0..da42ee61c1d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt)
 	if (master) {
 		list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
 			slave_mnt->mnt_master = master;
-		list_del(&mnt->mnt_slave);
-		list_add(&mnt->mnt_slave, &master->mnt_slave_list);
+		list_move(&mnt->mnt_slave, &master->mnt_slave_list);
 		list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	} else {
@@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt)
 		 * umount the child only if the child has no
 		 * other children
 		 */
-		if (child && list_empty(&child->mnt_mounts)) {
-			list_del(&child->mnt_hash);
-			list_add_tail(&child->mnt_hash, &mnt->mnt_hash);
-		}
+		if (child && list_empty(&child->mnt_mounts))
+			list_move_tail(&child->mnt_hash, &mnt->mnt_hash);
 	}
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 610b5bdbe75..61c42430cba 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			i++;
 			/* fallthrough */
 		default:
-			if (filp->f_pos == 2) {
-				list_del(q);
-				list_add(q, &parent_sd->s_children);
-			}
+			if (filp->f_pos == 2)
+				list_move(q, &parent_sd->s_children);
+
 			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
 				struct sysfs_dirent *next;
 				const char * name;
@@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 						 dt_type(next)) < 0)
 					return 0;
 
-				list_del(q);
-				list_add(q, p);
+				list_move(q, p);
 				p = q;
 				filp->f_pos++;
 			}
diff --git a/mm/swap.c b/mm/swap.c
index 03ae2076f92..990868afc1c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -86,8 +86,7 @@ int rotate_reclaimable_page(struct page *page)
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	if (PageLRU(page) && !PageActive(page)) {
-		list_del(&page->lru);
-		list_add_tail(&page->lru, &zone->inactive_list);
+		list_move_tail(&page->lru, &zone->inactive_list);
 		inc_page_state(pgrotated);
 	}
 	if (!test_clear_page_writeback(page))
-- 
cgit v1.2.3-70-g09d2


From 99f895518368252ba862cc15ce4eb98ebbe1bec6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:55 -0700
Subject: [PATCH] proc: don't lock task_structs indefinitely

Every inode in /proc holds a reference to a struct task_struct.  If a
directory or file is opened and remains open after the the task exits this
pinning continues.  With 8K stacks on a 32bit machine the amount pinned per
file descriptor is about 10K.

Normally I would figure a reasonable per user process limit is about 100
processes.  With 80 processes, with a 1000 file descriptors each I can trigger
the 00M killer on a 32bit kernel, because I have pinned about 800MB of useless
data.

This patch replaces the struct task_struct pointer with a pointer to a struct
task_ref which has a struct task_struct pointer.  The so the pinning of dead
tasks does not happen.

The code now has to contend with the fact that the task may now exit at any
time.  Which is a little but not muh more complicated.

With this change it takes about 1000 processes each opening up 1000 file
descriptors before I can trigger the OOM killer.  Much better.

[mlp@google.com: task_mmu small fixes]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Paul Jackson <pj@sgi.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Albert Cahalan <acahalan@gmail.com>
Signed-off-by: Prasanna Meda <mlp@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c          | 355 ++++++++++++++++++++++++++++++++++--------------
 fs/proc/inode.c         |   9 +-
 fs/proc/internal.h      |  15 +-
 fs/proc/task_mmu.c      |  72 +++++++---
 include/linux/proc_fs.h |   8 +-
 kernel/cpuset.c         |  27 +++-
 mm/mempolicy.c          |   6 +-
 7 files changed, 349 insertions(+), 143 deletions(-)

(limited to 'mm')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 20746e12440..489810abc72 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -307,12 +307,15 @@ static struct pid_entry tid_attr_stuff[] = {
 
 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct task_struct *task = proc_task(inode);
-	struct files_struct *files;
+	struct task_struct *task = get_proc_task(inode);
+	struct files_struct *files = NULL;
 	struct file *file;
 	int fd = proc_fd(inode);
 
-	files = get_files_struct(task);
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
 	if (files) {
 		/*
 		 * We are not taking a ref to the file structure, so we must
@@ -344,10 +347,29 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
 	return fs;
 }
 
+static int get_nr_threads(struct task_struct *tsk)
+{
+	/* Must be called with the rcu_read_lock held */
+	unsigned long flags;
+	int count = 0;
+
+	if (lock_task_sighand(tsk, &flags)) {
+		count = atomic_read(&tsk->signal->count);
+		unlock_task_sighand(tsk, &flags);
+	}
+	return count;
+}
+
 static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct fs_struct *fs = get_fs_struct(proc_task(inode));
+	struct task_struct *task = get_proc_task(inode);
+	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
+
+	if (task) {
+		fs = get_fs_struct(task);
+		put_task_struct(task);
+	}
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->pwdmnt);
@@ -361,8 +383,14 @@ static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfs
 
 static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct fs_struct *fs = get_fs_struct(proc_task(inode));
+	struct task_struct *task = get_proc_task(inode);
+	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
+
+	if (task) {
+		fs = get_fs_struct(task);
+		put_task_struct(task);
+	}
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->rootmnt);
@@ -550,16 +578,19 @@ struct proc_mounts {
 
 static int mounts_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *task = proc_task(inode);
-	struct namespace *namespace;
+	struct task_struct *task = get_proc_task(inode);
+	struct namespace *namespace = NULL;
 	struct proc_mounts *p;
 	int ret = -EINVAL;
 
-	task_lock(task);
-	namespace = task->namespace;
-	if (namespace)
-		get_namespace(namespace);
-	task_unlock(task);
+	if (task) {
+		task_lock(task);
+		namespace = task->namespace;
+		if (namespace)
+			get_namespace(namespace);
+		task_unlock(task);
+		put_task_struct(task);
+	}
 
 	if (namespace) {
 		ret = -ENOMEM;
@@ -616,17 +647,21 @@ static struct file_operations proc_mounts_operations = {
 extern struct seq_operations mountstats_op;
 static int mountstats_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *task = proc_task(inode);
 	int ret = seq_open(file, &mountstats_op);
 
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		struct namespace *namespace;
-		task_lock(task);
-		namespace = task->namespace;
-		if (namespace)
-			get_namespace(namespace);
-		task_unlock(task);
+		struct namespace *namespace = NULL;
+		struct task_struct *task = get_proc_task(inode);
+
+		if (task) {
+			task_lock(task);
+			namespace = task->namespace;
+			if (namespace)
+				get_namespace(namespace);
+			task_unlock(task);
+			put_task_struct(task);
+		}
 
 		if (namespace)
 			m->private = namespace;
@@ -653,18 +688,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	unsigned long page;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
+
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 
 	if (count > PROC_BLOCK_SIZE)
 		count = PROC_BLOCK_SIZE;
+
+	length = -ENOMEM;
 	if (!(page = __get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
+		goto out;
 
 	length = PROC_I(inode)->op.proc_read(task, (char*)page);
 
 	if (length >= 0)
 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 	free_page(page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 }
 
@@ -681,12 +725,15 @@ static int mem_open(struct inode* inode, struct file* file)
 static ssize_t mem_read(struct file * file, char __user * buf,
 			size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	char *page;
 	unsigned long src = *ppos;
 	int ret = -ESRCH;
 	struct mm_struct *mm;
 
+	if (!task)
+		goto out_no_task;
+
 	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		goto out;
 
@@ -736,6 +783,8 @@ out_put:
 out_free:
 	free_page((unsigned long) page);
 out:
+	put_task_struct(task);
+out_no_task:
 	return ret;
 }
 
@@ -748,15 +797,20 @@ static ssize_t mem_write(struct file * file, const char * buf,
 {
 	int copied = 0;
 	char *page;
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	unsigned long dst = *ppos;
 
+	copied = -ESRCH;
+	if (!task)
+		goto out_no_task;
+
 	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
-		return -ESRCH;
+		goto out;
 
+	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_USER);
 	if (!page)
-		return -ENOMEM;
+		goto out;
 
 	while (count > 0) {
 		int this_len, retval;
@@ -779,6 +833,9 @@ static ssize_t mem_write(struct file * file, const char * buf,
 	}
 	*ppos = dst;
 	free_page((unsigned long) page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return copied;
 }
 #endif
@@ -809,12 +866,17 @@ static struct file_operations proc_mem_operations = {
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	char buffer[PROC_NUMBUF];
 	size_t len;
-	int oom_adjust = task->oomkilladj;
+	int oom_adjust;
 	loff_t __ppos = *ppos;
 
+	if (!task)
+		return -ESRCH;
+	oom_adjust = task->oomkilladj;
+	put_task_struct(task);
+
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 	if (__ppos >= len)
 		return 0;
@@ -829,7 +891,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task;
 	char buffer[PROC_NUMBUF], *end;
 	int oom_adjust;
 
@@ -845,7 +907,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EINVAL;
 	if (*end == '\n')
 		end++;
+	task = get_proc_task(file->f_dentry->d_inode);
+	if (!task)
+		return -ESRCH;
 	task->oomkilladj = oom_adjust;
+	put_task_struct(task);
 	if (end - buffer == 0)
 		return -EIO;
 	return end - buffer;
@@ -862,12 +928,15 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
 {
 	struct inode * inode = file->f_dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
 	ssize_t length;
 	char tmpbuf[TMPBUFLEN];
 
+	if (!task)
+		return -ESRCH;
 	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
 				audit_get_loginuid(task->audit_context));
+	put_task_struct(task);
 	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 }
 
@@ -877,13 +946,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	char *page, *tmp;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
 	uid_t loginuid;
 
 	if (!capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 
-	if (current != task)
+	if (current != proc_tref(inode)->task)
 		return -EPERM;
 
 	if (count >= PAGE_SIZE)
@@ -907,7 +975,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(task, loginuid);
+	length = audit_set_loginuid(current, loginuid);
 	if (likely(length == 0))
 		length = count;
 
@@ -926,13 +994,16 @@ static struct file_operations proc_loginuid_operations = {
 static ssize_t seccomp_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
-	struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
 	char __buf[20];
 	loff_t __ppos = *ppos;
 	size_t len;
 
+	if (!tsk)
+		return -ESRCH;
 	/* no need to print the trailing zero, so use only len */
 	len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
+	put_task_struct(tsk);
 	if (__ppos >= len)
 		return 0;
 	if (count > len - __ppos)
@@ -946,29 +1017,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
 static ssize_t seccomp_write(struct file *file, const char __user *buf,
 			     size_t count, loff_t *ppos)
 {
-	struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
 	char __buf[20], *end;
 	unsigned int seccomp_mode;
+	ssize_t result;
+
+	result = -ESRCH;
+	if (!tsk)
+		goto out_no_task;
 
 	/* can set it only once to be even more secure */
+	result = -EPERM;
 	if (unlikely(tsk->seccomp.mode))
-		return -EPERM;
+		goto out;
 
+	result = -EFAULT;
 	memset(__buf, 0, sizeof(__buf));
 	count = min(count, sizeof(__buf) - 1);
 	if (copy_from_user(__buf, buf, count))
-		return -EFAULT;
+		goto out;
+
 	seccomp_mode = simple_strtoul(__buf, &end, 0);
 	if (*end == '\n')
 		end++;
+	result = -EINVAL;
 	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
 		tsk->seccomp.mode = seccomp_mode;
 		set_tsk_thread_flag(tsk, TIF_SECCOMP);
 	} else
-		return -EINVAL;
+		goto out;
+	result = -EIO;
 	if (unlikely(!(end - __buf)))
-		return -EIO;
-	return end - __buf;
+		goto out;
+	result = end - __buf;
+out:
+	put_task_struct(tsk);
+out_no_task:
+	return result;
 }
 
 static struct file_operations proc_seccomp_operations = {
@@ -995,7 +1080,7 @@ static int proc_check_dentry_visible(struct inode *inode,
 	/* See if the the two tasks share a commone set of
 	 * file descriptors.  If so everything is visible.
 	 */
-	task = proc_task(inode);
+	task = get_proc_task(inode);
 	if (!task)
 		goto out;
 	files = get_files_struct(current);
@@ -1006,6 +1091,7 @@ static int proc_check_dentry_visible(struct inode *inode,
 		put_files_struct(task_files);
 	if (files)
 		put_files_struct(files);
+	put_task_struct(task);
 	if (!error)
 		goto out;
 
@@ -1106,7 +1192,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = proc_task(inode);
+	struct task_struct *p = get_proc_task(inode);
 	unsigned int fd, tid, ino;
 	int retval;
 	char buf[PROC_NUMBUF];
@@ -1114,8 +1200,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 	struct fdtable *fdt;
 
 	retval = -ENOENT;
-	if (!pid_alive(p))
-		goto out;
+	if (!p)
+		goto out_no_task;
 	retval = 0;
 	tid = p->pid;
 
@@ -1164,6 +1250,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 			put_files_struct(files);
 	}
 out:
+	put_task_struct(p);
+out_no_task:
 	return retval;
 }
 
@@ -1175,16 +1263,18 @@ static int proc_pident_readdir(struct file *filp,
 	int pid;
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
 	struct pid_entry *p;
 	ino_t ino;
 	int ret;
 
 	ret = -ENOENT;
-	if (!pid_alive(proc_task(inode)))
+	if (!task)
 		goto out;
 
 	ret = 0;
-	pid = proc_task(inode)->pid;
+	pid = task->pid;
+	put_task_struct(task);
 	i = filp->f_pos;
 	switch (i) {
 	case 0:
@@ -1270,14 +1360,13 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_ino = fake_ino(task->pid, ino);
 
-	if (!pid_alive(task))
-		goto out_unlock;
-
 	/*
 	 * grab the reference to task.
 	 */
-	get_task_struct(task);
-	ei->task = task;
+	ei->tref = tref_get_by_task(task);
+	if (!tref_task(ei->tref))
+		goto out_unlock;
+
 	inode->i_uid = 0;
 	inode->i_gid = 0;
 	if (task_dumpable(task)) {
@@ -1303,13 +1392,21 @@ out_unlock:
  *
  * Rewrite the inode's ownerships here because the owning task may have
  * performed a setuid(), etc.
+ *
+ * Before the /proc/pid/status file was created the only way to read
+ * the effective uid of a /process was to stat /proc/pid.  Reading
+ * /proc/pid/status is slow enough that procps and other packages
+ * kept stating /proc/pid.  To keep the rules in /proc simple I have
+ * made this apply to all per process world readable and executable
+ * directories.
  */
 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
-	if (pid_alive(task)) {
-		if (task_dumpable(task)) {
+	struct task_struct *task = get_proc_task(inode);
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
 			inode->i_uid = task->euid;
 			inode->i_gid = task->egid;
 		} else {
@@ -1317,37 +1414,63 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 			inode->i_gid = 0;
 		}
 		security_task_to_inode(task, inode);
+		put_task_struct(task);
 		return 1;
 	}
 	d_drop(dentry);
 	return 0;
 }
 
+static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	generic_fillattr(inode, stat);
+
+	rcu_read_lock();
+	stat->uid = 0;
+	stat->gid = 0;
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
+			stat->uid = task->euid;
+			stat->gid = task->egid;
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
 	int fd = proc_fd(inode);
 	struct files_struct *files;
 
-	files = get_files_struct(task);
-	if (files) {
-		rcu_read_lock();
-		if (fcheck_files(files, fd)) {
+	if (task) {
+		files = get_files_struct(task);
+		if (files) {
+			rcu_read_lock();
+			if (fcheck_files(files, fd)) {
+				rcu_read_unlock();
+				put_files_struct(files);
+				if (task_dumpable(task)) {
+					inode->i_uid = task->euid;
+					inode->i_gid = task->egid;
+				} else {
+					inode->i_uid = 0;
+					inode->i_gid = 0;
+				}
+				security_task_to_inode(task, inode);
+				put_task_struct(task);
+				return 1;
+			}
 			rcu_read_unlock();
 			put_files_struct(files);
-			if (task_dumpable(task)) {
-				inode->i_uid = task->euid;
-				inode->i_gid = task->egid;
-			} else {
-				inode->i_uid = 0;
-				inode->i_gid = 0;
-			}
-			security_task_to_inode(task, inode);
-			return 1;
 		}
-		rcu_read_unlock();
-		put_files_struct(files);
+		put_task_struct(task);
 	}
 	d_drop(dentry);
 	return 0;
@@ -1359,7 +1482,7 @@ static int pid_delete_dentry(struct dentry * dentry)
 	 * If so, then don't put the dentry on the lru list,
 	 * kill it immediately.
 	 */
-	return !pid_alive(proc_task(dentry->d_inode));
+	return !proc_tref(dentry->d_inode)->task;
 }
 
 static struct dentry_operations tid_fd_dentry_operations =
@@ -1401,7 +1524,7 @@ out:
 /* SMP-safe */
 static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct task_struct *task = proc_task(dir);
+	struct task_struct *task = get_proc_task(dir);
 	unsigned fd = name_to_int(dentry);
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct file * file;
@@ -1409,10 +1532,10 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	struct inode *inode;
 	struct proc_inode *ei;
 
+	if (!task)
+		goto out_no_task;
 	if (fd == ~0U)
 		goto out;
-	if (!pid_alive(task))
-		goto out;
 
 	inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
 	if (!inode)
@@ -1447,6 +1570,8 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (tid_fd_revalidate(dentry, NULL))
 		result = NULL;
 out:
+	put_task_struct(task);
+out_no_task:
 	return result;
 
 out_unlock2:
@@ -1490,12 +1615,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	unsigned long page;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
+
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 
 	if (count > PAGE_SIZE)
 		count = PAGE_SIZE;
+	length = -ENOMEM;
 	if (!(page = __get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
+		goto out;
 
 	length = security_getprocattr(task, 
 				      (char*)file->f_dentry->d_name.name, 
@@ -1503,6 +1633,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	if (length >= 0)
 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 	free_page(page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 }
 
@@ -1512,26 +1645,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	char *page; 
 	ssize_t length; 
-	struct task_struct *task = proc_task(inode); 
+	struct task_struct *task = get_proc_task(inode);
 
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 	if (count > PAGE_SIZE) 
 		count = PAGE_SIZE; 
-	if (*ppos != 0) {
-		/* No partial writes. */
-		return -EINVAL;
-	}
+
+	/* No partial writes. */
+	length = -EINVAL;
+	if (*ppos != 0)
+		goto out;
+
+	length = -ENOMEM;
 	page = (char*)__get_free_page(GFP_USER); 
 	if (!page) 
-		return -ENOMEM;
+		goto out;
+
 	length = -EFAULT; 
 	if (copy_from_user(page, buf, count)) 
-		goto out;
+		goto out_free;
 
 	length = security_setprocattr(task, 
 				      (char*)file->f_dentry->d_name.name, 
 				      (void*)page, count);
-out:
+out_free:
 	free_page((unsigned long) page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 } 
 
@@ -1553,15 +1696,15 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 {
 	struct inode *inode;
 	struct dentry *error;
-	struct task_struct *task = proc_task(dir);
+	struct task_struct *task = get_proc_task(dir);
 	struct pid_entry *p;
 	struct proc_inode *ei;
 
 	error = ERR_PTR(-ENOENT);
 	inode = NULL;
 
-	if (!pid_alive(task))
-		goto out;
+	if (!task)
+		goto out_no_task;
 
 	for (p = ents; p->name; p++) {
 		if (p->len != dentry->d_name.len)
@@ -1748,6 +1891,8 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	if (pid_revalidate(dentry, NULL))
 		error = NULL;
 out:
+	put_task_struct(task);
+out_no_task:
 	return error;
 }
 
@@ -1771,10 +1916,12 @@ static struct file_operations proc_tid_base_operations = {
 
 static struct inode_operations proc_tgid_base_inode_operations = {
 	.lookup		= proc_tgid_base_lookup,
+	.getattr	= pid_getattr,
 };
 
 static struct inode_operations proc_tid_base_inode_operations = {
 	.lookup		= proc_tid_base_lookup,
+	.getattr	= pid_getattr,
 };
 
 #ifdef CONFIG_SECURITY
@@ -1816,10 +1963,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
 
 static struct inode_operations proc_tgid_attr_inode_operations = {
 	.lookup		= proc_tgid_attr_lookup,
+	.getattr	= pid_getattr,
 };
 
 static struct inode_operations proc_tid_attr_inode_operations = {
 	.lookup		= proc_tid_attr_lookup,
+	.getattr	= pid_getattr,
 };
 #endif
 
@@ -1981,10 +2130,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 {
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct task_struct *task;
-	struct task_struct *leader = proc_task(dir);
+	struct task_struct *leader = get_proc_task(dir);
 	struct inode *inode;
 	unsigned tid;
 
+	if (!leader)
+		goto out_no_task;
+
 	tid = name_to_int(dentry);
 	if (tid == ~0U)
 		goto out;
@@ -2024,6 +2176,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 out_drop_task:
 	put_task_struct(task);
 out:
+	put_task_struct(leader);
+out_no_task:
 	return result;
 }
 
@@ -2163,12 +2317,7 @@ static struct task_struct *first_tid(struct task_struct *leader, int tid, int nr
 
 	/* If nr exceeds the number of threads there is nothing todo */
 	if (nr) {
-		int threads = 0;
-		task_lock(leader);
-		if (leader->signal)
-			threads = atomic_read(&leader->signal->count);
-		task_unlock(leader);
-		if (nr >= threads)
+		if (nr >= get_nr_threads(leader))
 			goto done;
 	}
 
@@ -2218,15 +2367,15 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	char buf[PROC_NUMBUF];
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *leader = proc_task(inode);
+	struct task_struct *leader = get_proc_task(inode);
 	struct task_struct *task;
 	int retval = -ENOENT;
 	ino_t ino;
 	int tid;
 	unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
 
-	if (!pid_alive(leader))
-		goto out;
+	if (!leader)
+		goto out_no_task;
 	retval = 0;
 
 	switch (pos) {
@@ -2266,20 +2415,22 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	}
 out:
 	filp->f_pos = pos;
+	put_task_struct(leader);
+out_no_task:
 	return retval;
 }
 
 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = proc_task(inode);
+	struct task_struct *p = get_proc_task(inode);
 	generic_fillattr(inode, stat);
 
-	if (pid_alive(p)) {
-		task_lock(p);
-		if (p->signal)
-			stat->nlink += atomic_read(&p->signal->count);
-		task_unlock(p);
+	if (p) {
+		rcu_read_lock();
+		stat->nlink += get_nr_threads(p);
+		rcu_read_unlock();
+		put_task_struct(p);
 	}
 
 	return 0;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fbc94df138a..31e0475c6cb 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
 static void proc_delete_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
-	struct task_struct *tsk;
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	/* Let go of any associated process */
-	tsk = PROC_I(inode)->task;
-	if (tsk)
-		put_task_struct(tsk);
+	/* Stop tracking associated processes */
+	tref_put(PROC_I(inode)->tref);
 
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
@@ -94,7 +91,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
 	if (!ei)
 		return NULL;
-	ei->task = NULL;
+	ei->tref = NULL;
 	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 548e7447ea4..37f1648adc2 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,6 +10,7 @@
  */
 
 #include <linux/proc_fs.h>
+#include <linux/task_ref.h>
 
 struct vmalloc_info {
 	unsigned long	used;
@@ -41,13 +42,23 @@ extern struct file_operations proc_maps_operations;
 extern struct file_operations proc_numa_maps_operations;
 extern struct file_operations proc_smaps_operations;
 
+extern struct file_operations proc_maps_operations;
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_smaps_operations;
+
+
 void free_proc_entry(struct proc_dir_entry *de);
 
 int proc_init_inodecache(void);
 
-static inline struct task_struct *proc_task(struct inode *inode)
+static inline struct task_ref *proc_tref(struct inode *inode)
+{
+	return PROC_I(inode)->tref;
+}
+
+static inline struct task_struct *get_proc_task(struct inode *inode)
 {
-	return PROC_I(inode)->task;
+	return get_tref_task(proc_tref(inode));
 }
 
 static inline int proc_fd(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4187b4e9cdb..abf3208c3f6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
 {
 	struct vm_area_struct * vma;
 	int result = -ENOENT;
-	struct task_struct *task = proc_task(inode);
-	struct mm_struct * mm = get_task_mm(task);
+	struct task_struct *task = get_proc_task(inode);
+	struct mm_struct * mm = NULL;
 
+	if (task) {
+		mm = get_task_mm(task);
+		put_task_struct(task);
+	}
 	if (!mm)
 		goto out;
 	down_read(&mm->mmap_sem);
@@ -120,7 +124,8 @@ struct mem_size_stats
 
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
 	struct vm_area_struct *vma = v;
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
@@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v)
 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	unsigned long last_addr = m->version;
 	struct mm_struct *mm;
-	struct vm_area_struct *vma, *tail_vma;
+	struct vm_area_struct *vma, *tail_vma = NULL;
 	loff_t l = *pos;
 
+	/* Clear the per syscall fields in priv */
+	priv->task = NULL;
+	priv->tail_vma = NULL;
+
 	/*
 	 * We remember last_addr rather than next_addr to hit with
 	 * mmap_cache most of the time. We have zero last_addr at
@@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (last_addr == -1UL)
 		return NULL;
 
-	mm = get_task_mm(task);
+	priv->task = get_tref_task(priv->tref);
+	if (!priv->task)
+		return NULL;
+
+	mm = get_task_mm(priv->task);
 	if (!mm)
 		return NULL;
 
-	tail_vma = get_gate_vma(task);
+	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
 	down_read(&mm->mmap_sem);
 
 	/* Start with last addr hint */
@@ -350,11 +363,9 @@ out:
 	return tail_vma;
 }
 
-static void m_stop(struct seq_file *m, void *v)
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
-	struct task_struct *task = m->private;
-	struct vm_area_struct *vma = v;
-	if (vma && vma != get_gate_vma(task)) {
+	if (vma && vma != priv->tail_vma) {
 		struct mm_struct *mm = vma->vm_mm;
 		up_read(&mm->mmap_sem);
 		mmput(mm);
@@ -363,17 +374,27 @@ static void m_stop(struct seq_file *m, void *v)
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
-	struct vm_area_struct *tail_vma = get_gate_vma(task);
+	struct vm_area_struct *tail_vma = priv->tail_vma;
 
 	(*pos)++;
 	if (vma && (vma != tail_vma) && vma->vm_next)
 		return vma->vm_next;
-	m_stop(m, v);
+	vma_stop(priv, vma);
 	return (vma != tail_vma)? tail_vma: NULL;
 }
 
+static void m_stop(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+
+	vma_stop(priv, vma);
+	if (priv->task)
+		put_task_struct(priv->task);
+}
+
 static struct seq_operations proc_pid_maps_op = {
 	.start	= m_start,
 	.next	= m_next,
@@ -391,11 +412,18 @@ static struct seq_operations proc_pid_smaps_op = {
 static int do_maps_open(struct inode *inode, struct file *file,
 			struct seq_operations *ops)
 {
-	struct task_struct *task = proc_task(inode);
-	int ret = seq_open(file, ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = task;
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->tref = proc_tref(inode);
+		ret = seq_open(file, ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = priv;
+		} else {
+			kfree(priv);
+		}
 	}
 	return ret;
 }
@@ -409,7 +437,7 @@ struct file_operations proc_maps_operations = {
 	.open		= maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
 #ifdef CONFIG_NUMA
@@ -431,7 +459,7 @@ struct file_operations proc_numa_maps_operations = {
 	.open		= numa_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 #endif
 
@@ -444,5 +472,5 @@ struct file_operations proc_smaps_operations = {
 	.open		= smaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d4d2081dbaf..4c7271f0469 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -246,7 +246,7 @@ extern void kclist_add(struct kcore_list *, void *, size_t);
 #endif
 
 struct proc_inode {
-	struct task_struct *task;
+	struct task_ref *tref;
 	int fd;
 	union {
 		int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
@@ -266,4 +266,10 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
 	return PROC_I(inode)->pde;
 }
 
+struct proc_maps_private {
+	struct task_ref *tref;
+	struct task_struct *task;
+	struct vm_area_struct *tail_vma;
+};
+
 #endif /* _LINUX_PROC_FS_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38..3e991c0c02e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -50,6 +50,7 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
+#include <linux/task_ref.h>
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
@@ -2442,31 +2443,43 @@ void __cpuset_memory_pressure_bump(void)
  */
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
+	struct task_ref *tref;
 	struct task_struct *tsk;
 	char *buf;
-	int retval = 0;
+	int retval;
 
+	retval = -ENOMEM;
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
-		return -ENOMEM;
+		goto out;
+
+	retval = -ESRCH;
+	tref = m->private;
+	tsk = get_tref_task(tref);
+	if (!tsk)
+		goto out_free;
 
-	tsk = m->private;
+	retval = -EINVAL;
 	mutex_lock(&manage_mutex);
+
 	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
-		goto out;
+		goto out_unlock;
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
-out:
+out_unlock:
 	mutex_unlock(&manage_mutex);
+	put_task_struct(tsk);
+out_free:
 	kfree(buf);
+out:
 	return retval;
 }
 
 static int cpuset_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *tsk = PROC_I(inode)->task;
-	return single_open(file, proc_cpuset_show, tsk);
+	struct task_ref *tref = PROC_I(inode)->tref;
+	return single_open(file, proc_cpuset_show, tref);
 }
 
 struct file_operations proc_cpuset_operations = {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73e0f23b7f5..6b9740bbf4c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1821,7 +1821,7 @@ static inline void check_huge_range(struct vm_area_struct *vma,
 
 int show_numa_map(struct seq_file *m, void *v)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
@@ -1837,7 +1837,7 @@ int show_numa_map(struct seq_file *m, void *v)
 		return 0;
 
 	mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(task, vma, vma->vm_start));
+			    get_vma_policy(priv->task, vma, vma->vm_start));
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
@@ -1891,7 +1891,7 @@ out:
 	kfree(md);
 
 	if (m->count < m->size)
-		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 6550e07f41ce8473ed684dac54fbfbd42183ffda Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 12 Jun 2006 17:11:31 -0700
Subject: [PATCH] 64bit Resource: finally enable 64bit resource sizes

Introduce the Kconfig entry and actually switch to a 64bit value, if
wanted, for resource_size_t.

Based on a patch series originally from Vivek Goyal <vgoyal@in.ibm.com>

Cc: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/i386/Kconfig     | 1 +
 include/linux/types.h | 7 ++++++-
 kernel/resource.c     | 8 +++-----
 mm/Kconfig            | 6 ++++++
 4 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 47c08bcd9b2..7e46ad7a7b6 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -529,6 +529,7 @@ config X86_PAE
 	bool
 	depends on HIGHMEM64G
 	default y
+	select RESOURCES_64BIT
 
 # Common NUMA Features
 config NUMA
diff --git a/include/linux/types.h b/include/linux/types.h
index a021e157733..3f235660a3c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -178,9 +178,14 @@ typedef __u64 __bitwise __be64;
 #ifdef __KERNEL__
 typedef unsigned __bitwise__ gfp_t;
 
-typedef unsigned long resource_size_t;
+#ifdef CONFIG_RESOURCES_64BIT
+typedef u64 resource_size_t;
+#else
+typedef u32 resource_size_t;
 #endif
 
+#endif	/* __KERNEL__ */
+
 struct ustat {
 	__kernel_daddr_t	f_tfree;
 	__kernel_ino_t		f_tinode;
diff --git a/kernel/resource.c b/kernel/resource.c
index 54835c02ab3..cc73029088a 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,20 +23,18 @@
 
 struct resource ioport_resource = {
 	.name	= "PCI IO",
-	.start	= 0x0000,
+	.start	= 0,
 	.end	= IO_SPACE_LIMIT,
 	.flags	= IORESOURCE_IO,
 };
-
 EXPORT_SYMBOL(ioport_resource);
 
 struct resource iomem_resource = {
 	.name	= "PCI mem",
-	.start	= 0UL,
-	.end	= ~0UL,
+	.start	= 0,
+	.end	= -1,
 	.flags	= IORESOURCE_MEM,
 };
-
 EXPORT_SYMBOL(iomem_resource);
 
 static DEFINE_RWLOCK(resource_lock);
diff --git a/mm/Kconfig b/mm/Kconfig
index 66e65ab3942..e3644b0062b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -145,3 +145,9 @@ config MIGRATION
 	  while the virtual addresses are not changed. This is useful for
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
+
+config RESOURCES_64BIT
+	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
+	default 64BIT
+	help
+	  This option allows memory and IO resources to be 64 bit.
-- 
cgit v1.2.3-70-g09d2


From bc02af93dd2bbddce1b55e0a493f833a1b7cf140 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:30 -0700
Subject: [PATCH] pgdat allocation for new node add (specify node id)

Change the name of old add_memory() to arch_add_memory.  And use node id to
get pgdat for the node at NODE_DATA().

Note: Powerpc's old add_memory() is defined as __devinit. However,
      add_memory() is usually called only after bootup.
      I suppose it may be redundant. But, I'm not well known about powerpc.
      So, I keep it. (But, __meminit is better at least.)

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/init.c            |  2 +-
 arch/ia64/mm/init.c            |  5 ++--
 arch/powerpc/mm/mem.c          | 11 +++++--
 arch/x86_64/mm/init.c          | 66 ++++++++++++++++++++++++------------------
 drivers/acpi/Kconfig           |  2 +-
 drivers/acpi/acpi_memhotplug.c |  3 +-
 drivers/base/memory.c          |  4 ++-
 include/linux/memory_hotplug.h | 13 ++++++++-
 mm/memory_hotplug.c            | 11 +++++++
 9 files changed, 78 insertions(+), 39 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index bf19513f0ce..46859253179 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -654,7 +654,7 @@ void __init mem_init(void)
  */
 #ifdef CONFIG_MEMORY_HOTPLUG
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata = &contig_page_data;
 	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 11f08001f8c..38306e98f04 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -652,7 +652,7 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat;
 	struct zone *zone;
@@ -660,7 +660,7 @@ int add_memory(u64 start, u64 size)
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	pgdat = NODE_DATA(0);
+	pgdat = NODE_DATA(nid);
 
 	zone = pgdat->node_zones + ZONE_NORMAL;
 	ret = __add_pages(zone, start_pfn, nr_pages);
@@ -671,7 +671,6 @@ int add_memory(u64 start, u64 size)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(add_memory);
 
 int remove_memory(u64 start, u64 size)
 {
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 69f3b9a20be..089d939a0b3 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -114,15 +114,20 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
-int __devinit add_memory(u64 start, u64 size)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return hot_add_scn_to_nid(start);
+}
+#endif
+
+int __devinit arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata;
 	struct zone *zone;
-	int nid;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	nid = hot_add_scn_to_nid(start);
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 02add1d1dfa..51fbf3eddf1 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -506,8 +506,6 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 /*
  * Memory hotplug specific functions
  */
-#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
-
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
@@ -517,31 +515,17 @@ void online_page(struct page *page)
 	num_physpages++;
 }
 
-#ifndef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
+ * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
+ *	via probe interface of sysfs. If acpi notifies hot-add event, then it
+ *	can tell node id by searching dsdt. But, probe interface doesn't have
+ *	node id. So, return 0 as node id at this time.
  */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
 {
-	int err = -EIO;
-	unsigned long pfn;
-	unsigned long total = 0, mem = 0;
-	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-		if (pfn_valid(pfn)) {
-			online_page(pfn_to_page(pfn));
-			err = 0;
-			mem++;
-		}
-		total++;
-	}
-	if (!err) {
-		z->spanned_pages += total;
-		z->present_pages += mem;
-		z->zone_pgdat->node_spanned_pages += total;
-		z->zone_pgdat->node_present_pages += mem;
-	}
-	return err;
+	return 0;
 }
 #endif
 
@@ -549,9 +533,9 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
-	struct pglist_data *pgdat = NODE_DATA(0);
+	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -568,7 +552,7 @@ error:
 	printk("%s: Problem encountered in __add_pages!\n", __func__);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(add_memory);
+EXPORT_SYMBOL_GPL(arch_add_memory);
 
 int remove_memory(u64 start, u64 size)
 {
@@ -576,7 +560,33 @@ int remove_memory(u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 
-#endif
+#else /* CONFIG_MEMORY_HOTPLUG */
+/*
+ * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+ * just online the pages.
+ */
+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+{
+	int err = -EIO;
+	unsigned long pfn;
+	unsigned long total = 0, mem = 0;
+	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+		if (pfn_valid(pfn)) {
+			online_page(pfn_to_page(pfn));
+			err = 0;
+			mem++;
+		}
+		total++;
+	}
+	if (!err) {
+		z->spanned_pages += total;
+		z->present_pages += mem;
+		z->zone_pgdat->node_spanned_pages += total;
+		z->zone_pgdat->node_present_pages += mem;
+	}
+	return err;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 94b8d820c51..610d2cc02cf 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -328,7 +328,7 @@ config ACPI_CONTAINER
 config ACPI_HOTPLUG_MEMORY
 	tristate "Memory Hotplug"
 	depends on ACPI
-	depends on MEMORY_HOTPLUG || X86_64
+	depends on MEMORY_HOTPLUG
 	default n
 	help
 	  This driver adds supports for ACPI Memory Hotplug.  This driver
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 5652569b376..0424326eae1 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -215,6 +215,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
 	int result, num_enabled = 0;
 	struct acpi_memory_info *info;
+	int node = 0;
 
 	ACPI_FUNCTION_TRACE("acpi_memory_enable_device");
 
@@ -245,7 +246,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 			continue;
 		}
 
-		result = add_memory(info->start_addr, info->length);
+		result = add_memory(node, info->start_addr, info->length);
 		if (result)
 			continue;
 		info->enabled = 1;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index dd547af4681..c6b7d9c4b65 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -306,11 +306,13 @@ static ssize_t
 memory_probe_store(struct class *class, const char *buf, size_t count)
 {
 	u64 phys_addr;
+	int nid;
 	int ret;
 
 	phys_addr = simple_strtoull(buf, NULL, 0);
 
-	ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+	nid = memory_add_physaddr_to_nid(phys_addr);
+	ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
 
 	if (ret)
 		count = ret;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 91120638617..29c1472efad 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -63,6 +63,16 @@ extern int online_pages(unsigned long, unsigned long);
 /* reasonably generic interface to expand the physical pages in a zone  */
 extern int __add_pages(struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages);
+
+#ifdef CONFIG_NUMA
+extern int memory_add_physaddr_to_nid(u64 start);
+#else
+static inline int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+#endif
+
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -99,7 +109,8 @@ static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
 	return -ENOSYS;
 }
 
-extern int add_memory(u64 start, u64 size);
+extern int add_memory(int nid, u64 start, u64 size);
+extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int remove_memory(u64 start, u64 size);
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 841a077d5ae..6cdeabe9f6d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -163,3 +163,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	vm_total_pages = nr_free_pagecache_pages();
 	return 0;
 }
+
+int add_memory(int nid, u64 start, u64 size)
+{
+	int ret;
+
+	/* call arch's memory hotadd */
+	ret = arch_add_memory(nid, start, size);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
-- 
cgit v1.2.3-70-g09d2


From 3218ae14b1e3ee2ab81df30ed690c8e864d23316 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:33 -0700
Subject: [PATCH] pgdat allocation for new node add (export kswapd start func)

When node is hot-added, kswapd for the node should start.  This export kswapd
start function as kswapd_run() to use at add_memory().

[akpm@osdl.org: daemonize() isn't needed when using the kthread API]
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  2 ++
 mm/vmscan.c          | 37 ++++++++++++++++++++++++++-----------
 2 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index dc3f3aa0c83..c41e2d6d1ac 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -199,6 +199,8 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+extern int kswapd_run(int nid);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 72babac71de..f03da33d914 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1223,7 +1224,6 @@ static int kswapd(void *p)
 	};
 	cpumask_t cpumask;
 
-	daemonize("kswapd%d", pgdat->node_id);
 	cpumask = node_to_cpumask(pgdat->node_id);
 	if (!cpus_empty(cpumask))
 		set_cpus_allowed(tsk, cpumask);
@@ -1468,20 +1468,35 @@ static int cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kswapd)
+		return 0;
+
+	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(pgdat->kswapd)) {
+		/* failure at boot is fatal */
+		BUG_ON(system_state == SYSTEM_BOOTING);
+		printk("Failed to start kswapd on node %d\n",nid);
+		ret = -1;
+	}
+	return ret;
+}
+
 static int __init kswapd_init(void)
 {
-	pg_data_t *pgdat;
+	int nid;
 
 	swap_setup();
-	for_each_online_pgdat(pgdat) {
-		pid_t pid;
-
-		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-		BUG_ON(pid < 0);
-		read_lock(&tasklist_lock);
-		pgdat->kswapd = find_task_by_pid(pid);
-		read_unlock(&tasklist_lock);
-	}
+	for_each_online_node(nid)
+ 		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 9af3c2dea3a3ae4248d81a70b556adfe1dc65d55 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:34 -0700
Subject: [PATCH] pgdat allocation for new node add (call pgdat allocation)

Add node-hot-add support to add_memory().

node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
   due to update phase is difference.)

Note:
  To make common function as much as possible,
  there is 2 changes from v2.
    - The old add_memory(), which is defiend by each archs,
      is renamed to arch_add_memory(). New add_memory becomes
      caller of arch dependent function as a common code.

    - This patch changes add_memory()'s interface
        From: add_memory(start, end)
        TO  : add_memory(nid, start, end).
      It was cause of similar code that finding node id from
      physical address is inside of old add_memory() on each arch.

      In addition, acpi memory hotplug driver can find node id easier.
      In v2, it must walk DSDT'S _CRS by matching physical address to
      get the handle of its memory device, then get _PXM and node id.
      Because input is just physical address.
      However, in v3, the acpi driver can use handle to get _PXM and node id
      for the new memory device. It can pass just node id to add_memory().

Fix interface of arch_add_memory() is in next patche.

Signed-off-by: Yasunori Goto     <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory_hotplug.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6cdeabe9f6d..83d37a401b3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -164,13 +164,65 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	return 0;
 }
 
+static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+{
+	struct pglist_data *pgdat;
+	unsigned long zones_size[MAX_NR_ZONES] = {0};
+	unsigned long zholes_size[MAX_NR_ZONES] = {0};
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+
+	pgdat = arch_alloc_nodedata(nid);
+	if (!pgdat)
+		return NULL;
+
+	arch_refresh_nodedata(nid, pgdat);
+
+	/* we can use NODE_DATA(nid) from here */
+
+	/* init node's zones as empty zones, we don't have any present pages.*/
+	free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
+
+	return pgdat;
+}
+
+static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
+{
+	arch_refresh_nodedata(nid, NULL);
+	arch_free_nodedata(pgdat);
+	return;
+}
+
 int add_memory(int nid, u64 start, u64 size)
 {
+	pg_data_t *pgdat = NULL;
+	int new_pgdat = 0;
 	int ret;
 
+	if (!node_online(nid)) {
+		pgdat = hotadd_new_pgdat(nid, start);
+		if (!pgdat)
+			return -ENOMEM;
+		new_pgdat = 1;
+		ret = kswapd_run(nid);
+		if (ret)
+			goto error;
+	}
+
 	/* call arch's memory hotadd */
 	ret = arch_add_memory(nid, start, size);
 
+	if (ret < 0)
+		goto error;
+
+	/* we online node here. we have no error path from here. */
+	node_set_online(nid);
+
+	return ret;
+error:
+	/* rollback pgdat allocation and others */
+	if (new_pgdat)
+		rollback_node_hotadd(nid, pgdat);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
-- 
cgit v1.2.3-70-g09d2


From 0a54703904a4a206686b4e8c3f5a6927b60747aa Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:35 -0700
Subject: [PATCH] register hot-added memory to iomem resource

Register hot-added memory to iomem_resource.  With this, /proc/iomem can
show hot-added memory.

Note: kdump uses /proc/iomem to catch memory range when it is installed.
      So, kdump should be re-installed after /proc/iomem change.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory_hotplug.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 83d37a401b3..0b11a854344 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -21,6 +21,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
+#include <linux/ioport.h>
 
 #include <asm/tlbflush.h>
 
@@ -192,6 +193,27 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 	return;
 }
 
+/* add this memory to iomem resource */
+static void register_memory_resource(u64 start, u64 size)
+{
+	struct resource *res;
+
+	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+	BUG_ON(!res);
+
+	res->name = "System RAM";
+	res->start = start;
+	res->end = start + size - 1;
+	res->flags = IORESOURCE_MEM;
+	if (request_resource(&iomem_resource, res) < 0) {
+		printk("System RAM resource %llx - %llx cannot be added\n",
+		(unsigned long long)res->start, (unsigned long long)res->end);
+		kfree(res);
+	}
+}
+
+
+
 int add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
@@ -217,6 +239,9 @@ int add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we have no error path from here. */
 	node_set_online(nid);
 
+	/* register this memory as resource */
+	register_memory_resource(start, size);
+
 	return ret;
 error:
 	/* rollback pgdat allocation and others */
-- 
cgit v1.2.3-70-g09d2


From 2842f11419704f8707fffc82e10d2263427fc130 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:36 -0700
Subject: [PATCH] catch valid mem range at onlining memory

This patch allows hot-add memory which is not aligned to section.

Now, hot-added memory has to be aligned to section size.  Considering big
section sized archs, this is not useful.

When hot-added memory is registerd as iomem resoruce by iomem resource
patch, we can make use of that information to detect valid memory range.

Note: With this, not-aligned memory can be registerd. To allow hot-add
      memory with holes, we have to do more work around add_memory().
      (It doesn't allows add memory to already existing mem section.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c      | 38 ++++++++++++++++++++++++++++++++++++++
 mm/memory_hotplug.c    | 28 ++++++++++++++++++++++++----
 3 files changed, 65 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index cd6bd001ba4..edfc733b157 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -105,6 +105,9 @@ extern int allocate_resource(struct resource *root, struct resource *new,
 int adjust_resource(struct resource *res, unsigned long start,
 		    unsigned long size);
 
+/* get registered SYSTEM_RAM resources in specified area */
+extern int find_next_system_ram(struct resource *res);
+
 /* Convenience shorthand with allocation */
 #define request_region(start,n,name)	__request_region(&ioport_resource, (start), (n), (name))
 #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name))
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a..2404f9b0bc4 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -232,6 +232,44 @@ int release_resource(struct resource *old)
 
 EXPORT_SYMBOL(release_resource);
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Finds the lowest memory reosurce exists within [res->start.res->end)
+ * the caller must specify res->start, res->end, res->flags.
+ * If found, returns 0, res is overwritten, if not found, returns -1.
+ */
+int find_next_system_ram(struct resource *res)
+{
+	resource_size_t start, end;
+	struct resource *p;
+
+	BUG_ON(!res);
+
+	start = res->start;
+	end = res->end;
+
+	read_lock(&resource_lock);
+	for (p = iomem_resource.child; p ; p = p->sibling) {
+		/* system ram is just marked as IORESOURCE_MEM */
+		if (p->flags != res->flags)
+			continue;
+		if (p->start > end) {
+			p = NULL;
+			break;
+		}
+		if (p->start >= start)
+			break;
+	}
+	read_unlock(&resource_lock);
+	if (!p)
+		return -1;
+	/* copy data */
+	res->start = p->start;
+	res->end = p->end;
+	return 0;
+}
+#endif
+
 /*
  * Find empty slot in the resource tree given range and alignment.
  */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0b11a854344..f13783e81eb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -127,6 +127,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	unsigned long i;
 	unsigned long flags;
 	unsigned long onlined_pages = 0;
+	struct resource res;
+	u64 section_end;
+	unsigned long start_pfn;
 	struct zone *zone;
 	int need_zonelists_rebuild = 0;
 
@@ -149,10 +152,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	if (!populated_zone(zone))
 		need_zonelists_rebuild = 1;
 
-	for (i = 0; i < nr_pages; i++) {
-		struct page *page = pfn_to_page(pfn + i);
-		online_page(page);
-		onlined_pages++;
+	res.start = (u64)pfn << PAGE_SHIFT;
+	res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
+	res.flags = IORESOURCE_MEM; /* we just need system ram */
+	section_end = res.end;
+
+	while (find_next_system_ram(&res) >= 0) {
+		start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
+		nr_pages = (unsigned long)
+                           ((res.end + 1 - res.start) >> PAGE_SHIFT);
+
+		if (PageReserved(pfn_to_page(start_pfn))) {
+			/* this region's page is not onlined now */
+			for (i = 0; i < nr_pages; i++) {
+				struct page *page = pfn_to_page(start_pfn + i);
+				online_page(page);
+				onlined_pages++;
+			}
+		}
+
+		res.start = res.end + 1;
+		res.end = section_end;
 	}
 	zone->present_pages += onlined_pages;
 	zone->zone_pgdat->node_present_pages += onlined_pages;
-- 
cgit v1.2.3-70-g09d2


From 1f04bbd2d396a701c5af2e5b92bad896c2550c16 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:37 -0700
Subject: [PATCH] sparc64: support sparsemem and !memory hotplug

Fix "undefined reference to `arch_add_memory'" on sparc64 allmodconfig.

sparc64 doesn't support memory hotplug.  But we want it to support
sparsemem.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 66e65ab3942..e76c023eb0b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -116,6 +116,7 @@ config SPARSEMEM_EXTREME
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+	depends on (IA64 || X86 || PPC64)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
 	depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
-- 
cgit v1.2.3-70-g09d2


From 0fc44159bfcb5b0afa178f9c3f50db23aebc76ff Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Tue, 27 Jun 2006 02:53:38 -0700
Subject: [PATCH] Register sysfs file for hotplugged new node

When new node becomes enable by hot-add, new sysfs file must be created for
new node.  So, if new node is enabled by add_memory(), register_one_node() is
called to create it.  In addition, I386's arch_register_node() and a part of
register_nodes() of powerpc are consolidated to register_one_node() as a
generic_code().

This is tested by Tiger4(IPF) with node hot-plug emulation.

Signed-off-by: Keiichiro Tokunaga <tokuanga.keiich@jp.fujitsu.com>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/topology.c |  9 +++------
 arch/ia64/kernel/topology.c | 15 +++------------
 arch/powerpc/kernel/sysfs.c | 15 ++-------------
 drivers/base/node.c         | 25 +++++++++++++++++++++++++
 include/asm-i386/cpu.h      |  2 --
 include/asm-i386/node.h     | 29 -----------------------------
 include/linux/node.h        |  4 ++++
 mm/memory_hotplug.c         | 12 +++++++++++-
 8 files changed, 48 insertions(+), 63 deletions(-)
 delete mode 100644 include/asm-i386/node.h

(limited to 'mm')

diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 296355292c7..1eecc2e1bd4 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -38,7 +38,7 @@ int arch_register_cpu(int num){
 #ifdef CONFIG_NUMA
 	int node = cpu_to_node(num);
 	if (node_online(node))
-		parent = &node_devices[node].node;
+		parent = &node_devices[parent_node(node)];
 #endif /* CONFIG_NUMA */
 
 	/*
@@ -61,7 +61,7 @@ void arch_unregister_cpu(int num) {
 #ifdef CONFIG_NUMA
 	int node = cpu_to_node(num);
 	if (node_online(node))
-		parent = &node_devices[node].node;
+		parent = &node_devices[parent_node(node)];
 #endif /* CONFIG_NUMA */
 
 	return unregister_cpu(&cpu_devices[num].cpu, parent);
@@ -74,16 +74,13 @@ EXPORT_SYMBOL(arch_unregister_cpu);
 
 #ifdef CONFIG_NUMA
 #include <linux/mmzone.h>
-#include <asm/node.h>
-
-struct i386_node node_devices[MAX_NUMNODES];
 
 static int __init topology_init(void)
 {
 	int i;
 
 	for_each_online_node(i)
-		arch_register_node(i);
+		register_one_node(i);
 
 	for_each_present_cpu(i)
 		arch_register_cpu(i);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 879edb51d1e..42cb05bdc68 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -26,9 +26,6 @@
 #include <asm/numa.h>
 #include <asm/cpu.h>
 
-#ifdef CONFIG_NUMA
-static struct node *sysfs_nodes;
-#endif
 static struct ia64_cpu *sysfs_cpus;
 
 int arch_register_cpu(int num)
@@ -36,7 +33,7 @@ int arch_register_cpu(int num)
 	struct node *parent = NULL;
 	
 #ifdef CONFIG_NUMA
-	parent = &sysfs_nodes[cpu_to_node(num)];
+	parent = &node_devices[cpu_to_node(num)];
 #endif /* CONFIG_NUMA */
 
 #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
@@ -59,7 +56,7 @@ void arch_unregister_cpu(int num)
 
 #ifdef CONFIG_NUMA
 	int node = cpu_to_node(num);
-	parent = &sysfs_nodes[node];
+	parent = &node_devices[node];
 #endif /* CONFIG_NUMA */
 
 	return unregister_cpu(&sysfs_cpus[num].cpu, parent);
@@ -74,17 +71,11 @@ static int __init topology_init(void)
 	int i, err = 0;
 
 #ifdef CONFIG_NUMA
-	sysfs_nodes = kzalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL);
-	if (!sysfs_nodes) {
-		err = -ENOMEM;
-		goto out;
-	}
-
 	/*
 	 * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes?
 	 */
 	for_each_online_node(i) {
-		if ((err = register_node(&sysfs_nodes[i], i, 0)))
+		if ((err = register_one_node(i)))
 			goto out;
 	}
 #endif
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 5bc2585c803..338491d1604 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -304,23 +304,12 @@ static struct notifier_block sysfs_cpu_nb = {
 /* NUMA stuff */
 
 #ifdef CONFIG_NUMA
-static struct node node_devices[MAX_NUMNODES];
-
 static void register_nodes(void)
 {
 	int i;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (node_online(i)) {
-			int p_node = parent_node(i);
-			struct node *parent = NULL;
-
-			if (p_node != i)
-				parent = &node_devices[p_node];
-
-			register_node(&node_devices[i], i, parent);
-		}
-	}
+	for (i = 0; i < MAX_NUMNODES; i++)
+		register_one_node(i);
 }
 
 int sysfs_add_device_to_node(struct sys_device *dev, int nid)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c80c3aeed00..cbd0f62b487 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -190,6 +190,31 @@ void unregister_node(struct node *node)
 	sysdev_unregister(&node->sysdev);
 }
 
+struct node node_devices[MAX_NUMNODES];
+
+int register_one_node(int nid)
+{
+	int error = 0;
+
+	if (node_online(nid)) {
+		int p_node = parent_node(nid);
+		struct node *parent = NULL;
+
+		if (p_node != nid)
+			parent = &node_devices[p_node];
+
+		error = register_node(&node_devices[nid], nid, parent);
+	}
+
+	return error;
+
+}
+
+void unregister_one_node(int nid)
+{
+	unregister_node(&node_devices[nid]);
+}
+
 static int __init register_node_type(void)
 {
 	return sysdev_class_register(&node_class);
diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h
index e7252c216ca..b1bc7b1b64b 100644
--- a/include/asm-i386/cpu.h
+++ b/include/asm-i386/cpu.h
@@ -7,8 +7,6 @@
 #include <linux/nodemask.h>
 #include <linux/percpu.h>
 
-#include <asm/node.h>
-
 struct i386_cpu {
 	struct cpu cpu;
 };
diff --git a/include/asm-i386/node.h b/include/asm-i386/node.h
deleted file mode 100644
index e13c6ffa72a..00000000000
--- a/include/asm-i386/node.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _ASM_I386_NODE_H_
-#define _ASM_I386_NODE_H_
-
-#include <linux/device.h>
-#include <linux/mmzone.h>
-#include <linux/node.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-struct i386_node {
-	struct node node;
-};
-extern struct i386_node node_devices[MAX_NUMNODES];
-
-static inline int arch_register_node(int num){
-	int p_node;
-	struct node *parent = NULL;
-
-	if (!node_online(num))
-		return 0;
-	p_node = parent_node(num);
-
-	if (p_node != num)
-		parent = &node_devices[p_node].node;
-
-	return register_node(&node_devices[num].node, num, parent);
-}
-
-#endif /* _ASM_I386_NODE_H_ */
diff --git a/include/linux/node.h b/include/linux/node.h
index 254dc3de650..1e5347527fa 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -26,8 +26,12 @@ struct node {
 	struct sys_device	sysdev;
 };
 
+extern struct node node_devices[];
+
 extern int register_node(struct node *, int, struct node *);
 extern void unregister_node(struct node *node);
+extern int register_one_node(int nid);
+extern void unregister_one_node(int nid);
 
 #define to_node(sys_device) container_of(sys_device, struct node, sysdev)
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f13783e81eb..ea4038838b0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -256,9 +256,19 @@ int add_memory(int nid, u64 start, u64 size)
 	if (ret < 0)
 		goto error;
 
-	/* we online node here. we have no error path from here. */
+	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
+	if (new_pgdat) {
+		ret = register_one_node(nid);
+		/*
+		 * If sysfs file of new node can't create, cpu on the node
+		 * can't be hot-added. There is no rollback way now.
+		 * So, check by BUG_ON() to catch it reluctantly..
+		 */
+		BUG_ON(ret);
+	}
+
 	/* register this memory as resource */
 	register_memory_resource(start, size);
 
-- 
cgit v1.2.3-70-g09d2


From c9cf55285e87ac423c45d9efca750d3f50234d10 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 27 Jun 2006 02:53:52 -0700
Subject: [PATCH] add poison.h and patch primary users

Localize poison values into one header file for better documentation and
easier/quicker debugging and so that the same values won't be used for
multiple purposes.

Use these constants in core arch., mm, driver, and fs code.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Matt Mackall <mpm@selenic.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/init.c       |  3 ++-
 arch/powerpc/mm/init_64.c |  3 ++-
 arch/sparc64/mm/init.c    |  3 ++-
 arch/x86_64/mm/init.c     |  7 +++++--
 drivers/base/dmapool.c    |  3 +--
 fs/jbd/journal.c          |  3 ++-
 include/linux/list.h      |  9 +--------
 include/linux/poison.h    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 mm/slab.c                 | 12 +-----------
 9 files changed, 61 insertions(+), 27 deletions(-)
 create mode 100644 include/linux/poison.h

(limited to 'mm')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 46859253179..f84b16e007f 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
@@ -753,7 +754,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 9e30f968c18..d454caada26 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,7 @@
 #include <linux/idr.h>
 #include <linux/nodemask.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -90,7 +91,7 @@ void free_initmem(void)
 
 	addr = (unsigned long)__init_begin;
 	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
 		free_page(addr);
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 5c2bcf354ce..cb75a27adb5 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -18,6 +18,7 @@
 #include <linux/initrd.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/kprobes.h>
@@ -1520,7 +1521,7 @@ void free_initmem(void)
 		page = (addr +
 			((unsigned long) __va(kern_base)) -
 			((unsigned long) KERNBASE));
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		p = virt_to_page(page);
 
 		ClearPageReserved(p);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 51fbf3eddf1..95bd232ff0c 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/memory_hotplug.h>
@@ -660,7 +661,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
-		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
+		memset((void *)(addr & ~(PAGE_SIZE-1)),
+			POISON_FREE_INITMEM, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -668,7 +670,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void free_initmem(void)
 {
-	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
+	memset(__initdata_begin, POISON_FREE_INITDATA,
+		__initdata_end - __initdata_begin);
 	free_init_pages("unused kernel memory",
 			(unsigned long)(&__init_begin),
 			(unsigned long)(&__init_end));
diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c
index e2f64f91ed0..33c5cce1560 100644
--- a/drivers/base/dmapool.c
+++ b/drivers/base/dmapool.c
@@ -7,6 +7,7 @@
 #include <linux/dmapool.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 /*
  * Pool allocator ... wraps the dma_alloc_coherent page allocator, so
@@ -35,8 +36,6 @@ struct dma_page {	/* cacheable header for 'allocation' bytes */
 };
 
 #define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
-#define	POOL_POISON_FREED	0xa7	/* !inuse */
-#define	POOL_POISON_ALLOCATED	0xa9	/* !initted */
 
 static DECLARE_MUTEX (pools_lock);
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7f96b5cb678..8c9b28dff11 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -34,6 +34,7 @@
 #include <linux/suspend.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
+#include <linux/poison.h>
 #include <linux/proc_fs.h>
 
 #include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh)
 {
 #ifdef CONFIG_JBD_DEBUG
 	atomic_dec(&nr_journal_heads);
-	memset(jh, 0x5b, sizeof(*jh));
+	memset(jh, JBD_POISON_FREE, sizeof(*jh));
 #endif
 	kmem_cache_free(journal_head_cache, jh);
 }
diff --git a/include/linux/list.h b/include/linux/list.h
index 37ca31b21bb..6b74adf5297 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -4,17 +4,10 @@
 #ifdef __KERNEL__
 
 #include <linux/stddef.h>
+#include <linux/poison.h>
 #include <linux/prefetch.h>
 #include <asm/system.h>
 
-/*
- * These are non-NULL pointers that will result in page faults
- * under normal circumstances, used to verify that nobody uses
- * non-initialized list entries.
- */
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
-
 /*
  * Simple doubly linked list implementation.
  *
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644
index 00000000000..d536de7dd82
--- /dev/null
+++ b/include/linux/poison.h
@@ -0,0 +1,45 @@
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
+#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
+
+/* ...and for poisoning */
+#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
+#define POISON_FREE	0x6b	/* for use-after-free poisoning */
+#define	POISON_END	0xa5	/* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM	0xcc
+
+/********** arch/x86_64/mm/init.c **********/
+#define	POISON_FREE_INITDATA	0xba
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE	0x5b
+
+/********** drivers/base/dmapool.c **********/
+#define	POOL_POISON_FREED	0xa7	/* !inuse */
+#define	POOL_POISON_ALLOCATED	0xa9	/* !initted */
+
+#endif
diff --git a/mm/slab.c b/mm/slab.c
index 98ac20bc0de..47982c2d9f3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -89,6 +89,7 @@
 #include	<linux/config.h>
 #include	<linux/slab.h>
 #include	<linux/mm.h>
+#include	<linux/poison.h>
 #include	<linux/swap.h>
 #include	<linux/cache.h>
 #include	<linux/interrupt.h>
@@ -492,17 +493,6 @@ struct kmem_cache {
 #endif
 
 #if DEBUG
-/*
- * Magic nums for obj red zoning.
- * Placed in the first word before and the first word after an obj.
- */
-#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
-#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
-
-/* ...and for poisoning */
-#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
-#define POISON_FREE	0x6b	/* for use-after-free poisoning */
-#define	POISON_END	0xa5	/* end-byte of poisoning */
 
 /*
  * memory layout of objects:
-- 
cgit v1.2.3-70-g09d2


From 34af946a22724c4e2b204957f2b24b22a0fb121c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:53:55 -0700
Subject: [PATCH] spin/rwlock init cleanups

locking init cleanups:

 - convert " = SPIN_LOCK_UNLOCKED" to spin_lock_init() or DEFINE_SPINLOCK()
 - convert rwlocks in a similar manner

this patch was generated automatically.

Motivation:

 - cleanliness
 - lockdep needs control of lock initialization, which the open-coded
   variants do not give
 - it's also useful for -rt and for lock debugging in general

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/sn/kernel/irq.c                    | 2 +-
 arch/mips/kernel/smtc.c                      | 4 ++--
 arch/powerpc/platforms/cell/spufs/switch.c   | 2 +-
 arch/powerpc/platforms/powermac/pfunc_core.c | 2 +-
 arch/powerpc/platforms/pseries/eeh_event.c   | 2 +-
 arch/powerpc/sysdev/mmio_nvram.c             | 2 +-
 arch/xtensa/kernel/time.c                    | 2 +-
 arch/xtensa/kernel/traps.c                   | 2 +-
 drivers/char/drm/drm_memory_debug.h          | 2 +-
 drivers/char/drm/via_dmablit.c               | 2 +-
 drivers/char/epca.c                          | 2 +-
 drivers/char/moxa.c                          | 2 +-
 drivers/char/specialix.c                     | 2 +-
 drivers/char/sx.c                            | 2 +-
 drivers/isdn/gigaset/common.c                | 2 +-
 drivers/leds/led-core.c                      | 2 +-
 drivers/leds/led-triggers.c                  | 2 +-
 drivers/misc/ibmasm/module.c                 | 2 +-
 drivers/pcmcia/m8xx_pcmcia.c                 | 4 ++--
 drivers/rapidio/rio-access.c                 | 4 ++--
 drivers/rtc/rtc-sa1100.c                     | 2 +-
 drivers/rtc/rtc-vr41xx.c                     | 2 +-
 drivers/s390/block/dasd_eer.c                | 2 +-
 drivers/scsi/libata-core.c                   | 2 +-
 drivers/sn/ioc3.c                            | 2 +-
 drivers/video/backlight/hp680_bl.c           | 2 +-
 fs/nfsd/nfs4state.c                          | 2 +-
 fs/ocfs2/cluster/heartbeat.c                 | 2 +-
 fs/ocfs2/cluster/tcp.c                       | 2 +-
 fs/ocfs2/dlm/dlmdomain.c                     | 2 +-
 fs/ocfs2/dlm/dlmlock.c                       | 2 +-
 fs/ocfs2/dlm/dlmrecovery.c                   | 4 ++--
 fs/ocfs2/dlmglue.c                           | 2 +-
 fs/ocfs2/journal.c                           | 2 +-
 include/asm-alpha/core_t2.h                  | 2 +-
 kernel/audit.c                               | 2 +-
 mm/sparse.c                                  | 2 +-
 net/ipv6/route.c                             | 2 +-
 net/sunrpc/auth_gss/gss_krb5_seal.c          | 2 +-
 net/tipc/bcast.c                             | 4 ++--
 net/tipc/bearer.c                            | 2 +-
 net/tipc/config.c                            | 2 +-
 net/tipc/dbg.c                               | 2 +-
 net/tipc/handler.c                           | 2 +-
 net/tipc/name_table.c                        | 4 ++--
 net/tipc/net.c                               | 2 +-
 net/tipc/node.c                              | 2 +-
 net/tipc/port.c                              | 4 ++--
 net/tipc/ref.c                               | 4 ++--
 net/tipc/subscr.c                            | 2 +-
 net/tipc/user_reg.c                          | 2 +-
 51 files changed, 59 insertions(+), 59 deletions(-)

(limited to 'mm')

diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index dc8e2b69671..677c6c0fd66 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -27,7 +27,7 @@ static void unregister_intr_pda(struct sn_irq_info *sn_irq_info);
 int sn_force_interrupt_flag = 1;
 extern int sn_ioif_inited;
 struct list_head **sn_irq_lh;
-static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */
+static DEFINE_SPINLOCK(sn_irq_info_lock); /* non-IRQ lock */
 
 u64 sn_intr_alloc(nasid_t local_nasid, int local_widget,
 				     struct sn_irq_info *sn_irq_info,
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 2e8e52c135e..70cf09afdf5 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -367,7 +367,7 @@ void mipsmt_prepare_cpus(void)
 	dvpe();
 	dmt();
 
-	freeIPIq.lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&freeIPIq.lock);
 
 	/*
 	 * We probably don't have as many VPEs as we do SMP "CPUs",
@@ -375,7 +375,7 @@ void mipsmt_prepare_cpus(void)
 	 */
 	for (i=0; i<NR_CPUS; i++) {
 		IPIQ[i].head = IPIQ[i].tail = NULL;
-		IPIQ[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&IPIQ[i].lock);
 		IPIQ[i].depth = 0;
 		ipi_timer_latch[i] = 0;
 	}
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 3068b429b03..a656d810a44 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -2203,7 +2203,7 @@ void spu_init_csa(struct spu_state *csa)
 
 	memset(lscsa, 0, sizeof(struct spu_lscsa));
 	csa->lscsa = lscsa;
-	csa->register_lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&csa->register_lock);
 
 	/* Set LS pages reserved to allow for user-space mapping. */
 	for (p = lscsa->ls; p < lscsa->ls + LS_SIZE; p += PAGE_SIZE)
diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c
index 047f954a89e..93e7505debc 100644
--- a/arch/powerpc/platforms/powermac/pfunc_core.c
+++ b/arch/powerpc/platforms/powermac/pfunc_core.c
@@ -546,7 +546,7 @@ struct pmf_device {
 };
 
 static LIST_HEAD(pmf_devices);
-static spinlock_t pmf_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pmf_lock);
 static DEFINE_MUTEX(pmf_irq_mutex);
 
 static void pmf_release_device(struct kref *kref)
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index 8f2d12935b9..45ccc687e57 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -35,7 +35,7 @@
  */
 
 /* EEH event workqueue setup. */
-static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
 LIST_HEAD(eeh_eventlist);
 static void eeh_thread_launcher(void *);
 DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
diff --git a/arch/powerpc/sysdev/mmio_nvram.c b/arch/powerpc/sysdev/mmio_nvram.c
index 74e0d31a355..615350d46b5 100644
--- a/arch/powerpc/sysdev/mmio_nvram.c
+++ b/arch/powerpc/sysdev/mmio_nvram.c
@@ -32,7 +32,7 @@
 
 static void __iomem *mmio_nvram_start;
 static long mmio_nvram_len;
-static spinlock_t mmio_nvram_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(mmio_nvram_lock);
 
 static ssize_t mmio_nvram_read(char *buf, size_t count, loff_t *index)
 {
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 937d81f62f4..fe14909f45e 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -29,7 +29,7 @@
 
 extern volatile unsigned long wall_jiffies;
 
-spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
 
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 225d64d73f0..27e409089a7 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -461,7 +461,7 @@ void show_code(unsigned int *pc)
 	}
 }
 
-spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(die_lock);
 
 void die(const char * str, struct pt_regs * regs, long err)
 {
diff --git a/drivers/char/drm/drm_memory_debug.h b/drivers/char/drm/drm_memory_debug.h
index 6543b9a14c4..d117cc99719 100644
--- a/drivers/char/drm/drm_memory_debug.h
+++ b/drivers/char/drm/drm_memory_debug.h
@@ -43,7 +43,7 @@ typedef struct drm_mem_stats {
 	unsigned long bytes_freed;
 } drm_mem_stats_t;
 
-static spinlock_t drm_mem_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(drm_mem_lock);
 static unsigned long drm_ram_available = 0;	/* In pages */
 static unsigned long drm_ram_used = 0;
 static drm_mem_stats_t drm_mem_stats[] =
diff --git a/drivers/char/drm/via_dmablit.c b/drivers/char/drm/via_dmablit.c
index b7f17457b42..78a81a4a99c 100644
--- a/drivers/char/drm/via_dmablit.c
+++ b/drivers/char/drm/via_dmablit.c
@@ -557,7 +557,7 @@ via_init_dmablit(drm_device_t *dev)
 		blitq->num_outstanding = 0;
 		blitq->is_active = 0;
 		blitq->aborting = 0;
-		blitq->blit_lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&blitq->blit_lock);
 		for (j=0; j<VIA_NUM_BLIT_SLOTS; ++j) {
 			DRM_INIT_WAITQUEUE(blitq->blit_queue + j);
 		}
diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 9cad8501d62..dc0602ae850 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -80,7 +80,7 @@ static int invalid_lilo_config;
 /* The ISA boards do window flipping into the same spaces so its only sane
    with a single lock. It's still pretty efficient */
 
-static spinlock_t epca_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(epca_lock);
 
 /* -----------------------------------------------------------------------
 	MAXBOARDS is typically 12, but ISA and EISA cards are restricted to 
diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index f43c2e04ead..01247cccb89 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -301,7 +301,7 @@ static struct tty_operations moxa_ops = {
 	.tiocmset = moxa_tiocmset,
 };
 
-static spinlock_t moxa_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(moxa_lock);
 
 #ifdef CONFIG_PCI
 static int moxa_get_PCI_conf(struct pci_dev *p, int board_type, moxa_board_conf * board)
diff --git a/drivers/char/specialix.c b/drivers/char/specialix.c
index 1b5330299e3..d2d6b01dcd0 100644
--- a/drivers/char/specialix.c
+++ b/drivers/char/specialix.c
@@ -2477,7 +2477,7 @@ static int __init specialix_init(void)
 #endif
 
 	for (i = 0; i < SX_NBOARD; i++)
-		sx_board[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&sx_board[i].lock);
 
 	if (sx_init_drivers()) {
 		func_exit();
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 3b474723027..76b9107f7f8 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -2320,7 +2320,7 @@ static int sx_init_portstructs (int nboards, int nports)
 #ifdef NEW_WRITE_LOCKING
 			port->gs.port_write_mutex = MUTEX;
 #endif
-			port->gs.driver_lock = SPIN_LOCK_UNLOCKED;
+			spin_lock_init(&port->gs.driver_lock);
 			/*
 			 * Initializing wait queue
 			 */
diff --git a/drivers/isdn/gigaset/common.c b/drivers/isdn/gigaset/common.c
index acb7e265678..2a56bf33a67 100644
--- a/drivers/isdn/gigaset/common.c
+++ b/drivers/isdn/gigaset/common.c
@@ -981,7 +981,7 @@ exit:
 EXPORT_SYMBOL_GPL(gigaset_stop);
 
 static LIST_HEAD(drivers);
-static spinlock_t driver_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(driver_lock);
 
 struct cardstate *gigaset_get_cs_by_id(int id)
 {
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index fe6541326c7..9b015f9af35 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -18,7 +18,7 @@
 #include <linux/leds.h>
 #include "leds.h"
 
-rwlock_t leds_list_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(leds_list_lock);
 LIST_HEAD(leds_list);
 
 EXPORT_SYMBOL_GPL(leds_list);
diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index 5e2cd8be119..1b1ce652396 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -26,7 +26,7 @@
 /*
  * Nests outside led_cdev->trigger_lock
  */
-static rwlock_t triggers_list_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(triggers_list_lock);
 static LIST_HEAD(trigger_list);
 
 ssize_t led_trigger_store(struct class_device *dev, const char *buf,
diff --git a/drivers/misc/ibmasm/module.c b/drivers/misc/ibmasm/module.c
index 1fdf03fd2da..9706cc19134 100644
--- a/drivers/misc/ibmasm/module.c
+++ b/drivers/misc/ibmasm/module.c
@@ -85,7 +85,7 @@ static int __devinit ibmasm_init_one(struct pci_dev *pdev, const struct pci_devi
 	}
 	memset(sp, 0, sizeof(struct service_processor));
 
-	sp->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&sp->lock);
 	INIT_LIST_HEAD(&sp->command_queue);
 
 	pci_set_drvdata(pdev, (void *)sp);
diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c
index 0e07d953511..d0f68ab8f04 100644
--- a/drivers/pcmcia/m8xx_pcmcia.c
+++ b/drivers/pcmcia/m8xx_pcmcia.c
@@ -157,7 +157,7 @@ MODULE_LICENSE("Dual MPL/GPL");
 
 static int pcmcia_schlvl = PCMCIA_SCHLVL;
 
-static spinlock_t events_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(events_lock);
 
 
 #define PCMCIA_SOCKET_KEY_5V 1
@@ -644,7 +644,7 @@ static struct platform_device m8xx_device = {
 };
 
 static u32 pending_events[PCMCIA_SOCKETS_NO];
-static spinlock_t pending_event_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pending_event_lock);
 
 static irqreturn_t m8xx_interrupt(int irq, void *dev, struct pt_regs *regs)
 {
diff --git a/drivers/rapidio/rio-access.c b/drivers/rapidio/rio-access.c
index b9fab2ae3a3..8b56bbdd011 100644
--- a/drivers/rapidio/rio-access.c
+++ b/drivers/rapidio/rio-access.c
@@ -17,8 +17,8 @@
  * These interrupt-safe spinlocks protect all accesses to RIO
  * configuration space and doorbell access.
  */
-static spinlock_t rio_config_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t rio_doorbell_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(rio_config_lock);
+static DEFINE_SPINLOCK(rio_doorbell_lock);
 
 /*
  *  Wrappers for all RIO configuration access functions.  They just check
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index ab486fbc828..9cd1cb304bb 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -45,7 +45,7 @@
 
 static unsigned long rtc_freq = 1024;
 static struct rtc_time rtc_alarm;
-static spinlock_t sa1100_rtc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sa1100_rtc_lock);
 
 static int rtc_update_alarm(struct rtc_time *alrm)
 {
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index 33e029207e2..4b9291dd444 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -93,7 +93,7 @@ static void __iomem *rtc2_base;
 
 static unsigned long epoch = 1970;	/* Jan 1 1970 00:00:00 */
 
-static spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(rtc_lock);
 static char rtc_name[] = "RTC";
 static unsigned long periodic_frequency;
 static unsigned long periodic_count;
diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c
index 2d946b6ca07..2d8af709947 100644
--- a/drivers/s390/block/dasd_eer.c
+++ b/drivers/s390/block/dasd_eer.c
@@ -89,7 +89,7 @@ struct eerbuffer {
 };
 
 static LIST_HEAD(bufferlist);
-static spinlock_t bufferlock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bufferlock);
 static DECLARE_WAIT_QUEUE_HEAD(dasd_eer_read_wait_queue);
 
 /*
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 6c66877be2b..855ce9a9d94 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5733,7 +5733,7 @@ module_init(ata_init);
 module_exit(ata_exit);
 
 static unsigned long ratelimit_time;
-static spinlock_t ata_ratelimit_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ata_ratelimit_lock);
 
 int ata_ratelimit(void)
 {
diff --git a/drivers/sn/ioc3.c b/drivers/sn/ioc3.c
index 501316b198e..ed946311d3a 100644
--- a/drivers/sn/ioc3.c
+++ b/drivers/sn/ioc3.c
@@ -26,7 +26,7 @@ static DECLARE_RWSEM(ioc3_devices_rwsem);
 
 static struct ioc3_submodule *ioc3_submodules[IOC3_MAX_SUBMODULES];
 static struct ioc3_submodule *ioc3_ethernet;
-static rwlock_t ioc3_submodules_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(ioc3_submodules_lock);
 
 /* NIC probing code */
 
diff --git a/drivers/video/backlight/hp680_bl.c b/drivers/video/backlight/hp680_bl.c
index a71e984c93d..ffc72ae3ada 100644
--- a/drivers/video/backlight/hp680_bl.c
+++ b/drivers/video/backlight/hp680_bl.c
@@ -27,7 +27,7 @@
 
 static int hp680bl_suspended;
 static int current_intensity = 0;
-static spinlock_t bl_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bl_lock);
 static struct backlight_device *hp680_backlight_device;
 
 static void hp680bl_send_intensity(struct backlight_device *bd)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1630b5670dc..7c7d01672d3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
  */
 
 /* recall_lock protects the del_recall_lru */
-static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
 static void
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21f38accd03..1d26cfcd9f8 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem);
  * multiple hb threads are watching multiple regions.  A node is live
  * whenever any of the threads sees activity from the node in its region.
  */
-static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(o2hb_live_lock);
 static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
 static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f60cc0d398..1591eb37a72 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@
 	    ##args);							\
 } while (0)
 
-static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(o2net_handler_lock);
 static struct rb_root o2net_handler_tree = RB_ROOT;
 
 static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index ba27c5c5e95..b8c23f7ba67 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -88,7 +88,7 @@ out_free:
  *
  */
 
-spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(dlm_domain_lock);
 LIST_HEAD(dlm_domains);
 static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
 
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index d6f89577e25..5ca57ec650c 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,7 +53,7 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
-static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
 
 static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index da399013516..29b2845f370 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
 
 static u64 dlm_get_next_mig_cookie(void);
 
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
 static u64 dlm_mig_cookie = 1;
 
 static u64 dlm_get_next_mig_cookie(void)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd52860c8..4acd37286bd 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 	mlog_exit_void();
 }
 
-static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
 
 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
 				       struct ocfs2_dlm_debug *dlm_debug)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 3fe8781c22c..910a601b2e9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -49,7 +49,7 @@
 
 #include "buffer_head_io.h"
 
-spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
diff --git a/include/asm-alpha/core_t2.h b/include/asm-alpha/core_t2.h
index dba70c62a16..457c34b6eb0 100644
--- a/include/asm-alpha/core_t2.h
+++ b/include/asm-alpha/core_t2.h
@@ -435,7 +435,7 @@ static inline void t2_outl(u32 b, unsigned long addr)
 	set_hae(msb); \
 }
 
-static spinlock_t t2_hae_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(t2_hae_lock);
 
 __EXTERN_INLINE u8 t2_readb(const volatile void __iomem *xaddr)
 {
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd..82443fb433e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -818,7 +818,7 @@ err:
  */
 unsigned int audit_serial(void)
 {
-	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(serial_lock);
 	static unsigned int serial = 0;
 
 	unsigned long flags;
diff --git a/mm/sparse.c b/mm/sparse.c
index e0a3fe48aa3..c7a2b3a0e46 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -45,7 +45,7 @@ static struct mem_section *sparse_index_alloc(int nid)
 
 static int sparse_index_init(unsigned long section_nr, int nid)
 {
-	static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(index_init_lock);
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
 	struct mem_section *section;
 	int ret = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8a777932786..e728980160d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -349,7 +349,7 @@ static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
 	    (strict & RT6_SELECT_F_REACHABLE) &&
 	    last && last != rt0) {
 		/* no entries matched; do round-robin */
-		static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+		static DEFINE_SPINLOCK(lock);
 		spin_lock(&lock);
 		*head = rt0->u.next;
 		rt0->u.next = last->u.next;
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index f43311221a7..2f312164d6d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,7 +70,7 @@
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
-spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(krb5_seq_lock);
 
 u32
 gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 54128040a12..1bb75703f38 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -117,7 +117,7 @@ struct bclink {
 static struct bcbearer *bcbearer = NULL;
 static struct bclink *bclink = NULL;
 static struct link *bcl = NULL;
-static spinlock_t bc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bc_lock);
 
 char tipc_bclink_name[] = "multicast-link";
 
@@ -796,7 +796,7 @@ int tipc_bclink_init(void)
 	memset(bclink, 0, sizeof(struct bclink));
 	INIT_LIST_HEAD(&bcl->waiting_ports);
 	bcl->next_out_no = 1;
-	bclink->node.lock =  SPIN_LOCK_UNLOCKED;        
+	spin_lock_init(&bclink->node.lock);
 	bcl->owner = &bclink->node;
         bcl->max_pkt = MAX_PKT_DEFAULT_MCAST;
 	tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 4fa24b5e891..7ef17a449cf 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -566,7 +566,7 @@ restart:
 		b_ptr->link_req = tipc_disc_init_link_req(b_ptr, &m_ptr->bcast_addr,
 							  bcast_scope, 2);
 	}
-	b_ptr->publ.lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&b_ptr->publ.lock);
 	write_unlock_bh(&tipc_net_lock);
 	info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
 	     name, addr_string_fill(addr_string, bcast_scope), priority);
diff --git a/net/tipc/config.c b/net/tipc/config.c
index 3ec502fac8c..285e1bc2d88 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -63,7 +63,7 @@ struct manager {
 
 static struct manager mng = { 0};
 
-static spinlock_t config_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(config_lock);
 
 static const void *req_tlv_area;	/* request message TLV area */
 static int req_tlv_space;		/* request message TLV area size */
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 26ef95d5fe3..55130655e1e 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -41,7 +41,7 @@
 #define MAX_STRING 512
 
 static char print_string[MAX_STRING];
-static spinlock_t print_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(print_lock);
 
 static struct print_buf cons_buf = { NULL, 0, NULL, NULL };
 struct print_buf *TIPC_CONS = &cons_buf;
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
index 966f70a1b60..ae6ddf00a1a 100644
--- a/net/tipc/handler.c
+++ b/net/tipc/handler.c
@@ -44,7 +44,7 @@ struct queue_item {
 
 static kmem_cache_t *tipc_queue_item_cache;
 static struct list_head signal_queue_head;
-static spinlock_t qitem_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(qitem_lock);
 static int handler_enabled = 0;
 
 static void process_signal_queue(unsigned long dummy);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 38571306aba..a6926ff07bc 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -101,7 +101,7 @@ struct name_table {
 
 static struct name_table table = { NULL } ;
 static atomic_t rsv_publ_ok = ATOMIC_INIT(0);
-rwlock_t tipc_nametbl_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(tipc_nametbl_lock);
 
 
 static int hash(int x)
@@ -172,7 +172,7 @@ static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_hea
 	}
 
 	memset(nseq, 0, sizeof(*nseq));
-	nseq->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&nseq->lock);
 	nseq->type = type;
 	nseq->sseqs = sseq;
 	dbg("tipc_nameseq_create(): nseq = %p, type %u, ssseqs %p, ff: %u\n",
diff --git a/net/tipc/net.c b/net/tipc/net.c
index f7c8223ddf7..e5a359ab493 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -115,7 +115,7 @@
  *     - A local spin_lock protecting the queue of subscriber events.
 */
 
-rwlock_t tipc_net_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(tipc_net_lock);
 struct network tipc_net = { NULL };
 
 struct node *tipc_net_select_remote_node(u32 addr, u32 ref) 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index ce9678efa98..861322b935d 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -77,7 +77,7 @@ struct node *tipc_node_create(u32 addr)
 		
 	memset(n_ptr, 0, sizeof(*n_ptr));
 	n_ptr->addr = addr;
-	n_ptr->lock =  SPIN_LOCK_UNLOCKED;	
+                spin_lock_init(&n_ptr->lock);
 	INIT_LIST_HEAD(&n_ptr->nsub);
 	n_ptr->owner = c_ptr;
 	tipc_cltr_attach_node(c_ptr, n_ptr);
diff --git a/net/tipc/port.c b/net/tipc/port.c
index 47d97404e3e..3251c8d8e53 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -57,8 +57,8 @@
 static struct sk_buff *msg_queue_head = NULL;
 static struct sk_buff *msg_queue_tail = NULL;
 
-spinlock_t tipc_port_list_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(tipc_port_list_lock);
+static DEFINE_SPINLOCK(queue_lock);
 
 static LIST_HEAD(ports);
 static void port_handle_node_down(unsigned long ref);
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
index d2f0cce10e2..596d3c8ff75 100644
--- a/net/tipc/ref.c
+++ b/net/tipc/ref.c
@@ -63,7 +63,7 @@
 
 struct ref_table tipc_ref_table = { NULL };
 
-static rwlock_t ref_table_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(ref_table_lock);
 
 /**
  * tipc_ref_table_init - create reference table for objects
@@ -87,7 +87,7 @@ int tipc_ref_table_init(u32 requested_size, u32 start)
 	index_mask = sz - 1;
 	for (i = sz - 1; i >= 0; i--) {
 		table[i].object = NULL;
-		table[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&table[i].lock);
 		table[i].data.next_plus_upper = (start & ~index_mask) + i - 1;
 	}
 	tipc_ref_table.entries = table;
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index fc171875660..e19b4bcd67e 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -457,7 +457,7 @@ int tipc_subscr_start(void)
 	int res = -1;
 
 	memset(&topsrv, 0, sizeof (topsrv));
-	topsrv.lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&topsrv.lock);
 	INIT_LIST_HEAD(&topsrv.subscriber_list);
 
 	spin_lock_bh(&topsrv.lock);
diff --git a/net/tipc/user_reg.c b/net/tipc/user_reg.c
index 3f3f933976e..1e3ae57c722 100644
--- a/net/tipc/user_reg.c
+++ b/net/tipc/user_reg.c
@@ -67,7 +67,7 @@ struct tipc_user {
 
 static struct tipc_user *users = NULL;
 static u32 next_free_user = MAX_USERID + 1;
-static spinlock_t reg_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(reg_lock);
 
 /**
  * reg_init - create TIPC user registry (but don't activate it)
-- 
cgit v1.2.3-70-g09d2


From 6527c2bdf1f833cc18e8f42bd97973d583e4aa83 Mon Sep 17 00:00:00 2001
From: "Vladimir V. Saveliev" <vs@namesys.com>
Date: Tue, 27 Jun 2006 02:53:57 -0700
Subject: [PATCH] generic_file_buffered_write(): deadlock on vectored write

generic_file_buffered_write() prefaults in user pages in order to avoid
deadlock on copying from the same page as write goes to.

However, it looks like there is a problem when write is vectored:
fault_in_pages_readable brings in current segment or its part (maxlen).
OTOH, filemap_copy_from_user_iovec is called to copy number of bytes
(bytes) which may exceed current segment, so filemap_copy_from_user_iovec
switches to the next segment which is not brought in yet.  Pagefault is
generated.  That causes the deadlock if pagefault is for the same page
write goes to: page being written is locked and not uptodate, pagefault
will deadlock trying to lock locked page.

[akpm@osdl.org: somewhat rewritten]
Cc: Neil Brown <neilb@suse.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 9c7334bafda..d504d6e9888 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2095,14 +2095,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	do {
 		unsigned long index;
 		unsigned long offset;
-		unsigned long maxlen;
 		size_t copied;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
+
+		/* Limit the size of the copy to the caller's write size */
+		bytes = min(bytes, count);
+
+		/*
+		 * Limit the size of the copy to that of the current segment,
+		 * because fault_in_pages_readable() doesn't know how to walk
+		 * segments.
+		 */
+		bytes = min(bytes, cur_iov->iov_len - iov_base);
 
 		/*
 		 * Bring in the user page that we will copy from _first_.
@@ -2110,10 +2117,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		 * same page as we're writing to, without it being marked
 		 * up-to-date.
 		 */
-		maxlen = cur_iov->iov_len - iov_base;
-		if (maxlen > bytes)
-			maxlen = bytes;
-		fault_in_pages_readable(buf, maxlen);
+		fault_in_pages_readable(buf, bytes);
 
 		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
 		if (!page) {
-- 
cgit v1.2.3-70-g09d2


From 9c7b216d23e820e0e148d5be01bbb5bd2d8378fe Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Tue, 27 Jun 2006 02:54:07 -0700
Subject: [PATCH] cpu hotplug: revert init patch submitted for 2.6.17

In 2.6.17, there was a problem with cpu_notifiers and XFS.  I provided a
band-aid solution to solve that problem.  In the process, i undid all the
changes you both were making to ensure that these notifiers were available
only at init time (unless CONFIG_HOTPLUG_CPU is defined).

We deferred the real fix to 2.6.18.  Here is a set of patches that fixes the
XFS problem cleanly and makes the cpu notifiers available only at init time
(unless CONFIG_HOTPLUG_CPU is defined).

If CONFIG_HOTPLUG_CPU is defined then cpu notifiers are available at run
time.

This patch reverts the notifier_call changes made in 2.6.17

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/intel_cacheinfo.c | 2 +-
 arch/ia64/kernel/palinfo.c             | 2 +-
 arch/ia64/kernel/salinfo.c             | 2 +-
 arch/ia64/kernel/topology.c            | 2 +-
 arch/powerpc/kernel/sysfs.c            | 2 +-
 arch/x86_64/kernel/mce.c               | 2 +-
 drivers/base/topology.c                | 2 +-
 drivers/cpufreq/cpufreq.c              | 2 +-
 kernel/hrtimer.c                       | 2 +-
 kernel/profile.c                       | 2 +-
 kernel/rcupdate.c                      | 2 +-
 kernel/softirq.c                       | 2 +-
 kernel/softlockup.c                    | 2 +-
 kernel/timer.c                         | 2 +-
 kernel/workqueue.c                     | 2 +-
 mm/page_alloc.c                        | 2 +-
 mm/slab.c                              | 2 +-
 mm/vmscan.c                            | 2 +-
 18 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 8a92642ea59..1d4ab104798 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -729,7 +729,7 @@ static void __cpuexit cache_remove_dev(struct sys_device * sys_dev)
 	return;
 }
 
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index 859fb37ff49..6386f63c413 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -959,7 +959,7 @@ remove_palinfo_proc_entries(unsigned int hcpu)
 	}
 }
 
-static int palinfo_cpu_callback(struct notifier_block *nfb,
+static int __devinit palinfo_cpu_callback(struct notifier_block *nfb,
 								unsigned long action,
 								void *hcpu)
 {
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index 663a186ad19..9d5a823479a 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -572,7 +572,7 @@ static struct file_operations salinfo_data_fops = {
 };
 
 #ifdef	CONFIG_HOTPLUG_CPU
-static int
+static int __devinit
 salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
 	unsigned int i, cpu = (unsigned long)hcpu;
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 5737c9a061e..f07c382b57b 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -404,7 +404,7 @@ static int __cpuinit cache_remove_dev(struct sys_device * sys_dev)
  * When a cpu is hot-plugged, do a check and initiate
  * cache kobject if necessary
  */
-static int cache_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 412ad00e222..0231869613c 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -279,7 +279,7 @@ static void unregister_cpu_online(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int sysfs_cpu_notify(struct notifier_block *self,
+static int __devinit sysfs_cpu_notify(struct notifier_block *self,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index acd5816b1a6..efe8500a5b9 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -629,7 +629,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 #endif
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
+static __cpuinit int
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 8c52421cbc5..915810f6237 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -107,7 +107,7 @@ static int __cpuinit topology_remove_dev(struct sys_device * sys_dev)
 	return 0;
 }
 
-static int topology_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44d1eca83a7..486ef666470 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1497,7 +1497,7 @@ int cpufreq_update_policy(unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
-static int cpufreq_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 55601b3ce60..f9f53191661 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int hrtimer_cpu_notify(struct notifier_block *self,
+static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e50..5a730fdb1a2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -299,7 +299,7 @@ out:
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int profile_cpu_callback(struct notifier_block *info,
+static int __devinit profile_cpu_callback(struct notifier_block *info,
 					unsigned long action, void *__cpu)
 {
 	int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c0e1cb95dd4..a8d80b7048b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
 	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
 }
 
-static int rcu_cpu_notify(struct notifier_block *self,
+static int __devinit rcu_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9e2f1c6e73d..db65a311f14 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b5c3b94e01c..29da0a847ba 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
 /*
  * Create/destroy watchdog threads as CPUs come and go:
  */
-static int
+static int __devinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int hotcpu = (unsigned long)hcpu;
diff --git a/kernel/timer.c b/kernel/timer.c
index 5bb6b7976ee..878194ec8bd 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1652,7 +1652,7 @@ static void __devinit migrate_timers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int timer_cpu_notify(struct notifier_block *self,
+static int __devinit timer_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 565cf7a1feb..59f0b42bd89 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 }
 
 /* We're holding the cpucontrol mutex here */
-static int workqueue_cpu_callback(struct notifier_block *nfb,
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9f86191bb63..e9fb2d4064c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2009,7 +2009,7 @@ static inline void free_zone_pagesets(int cpu)
 	}
 }
 
-static int pageset_cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 		unsigned long action,
 		void *hcpu)
 {
diff --git a/mm/slab.c b/mm/slab.c
index 47982c2d9f3..631c0feb964 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1073,7 +1073,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-static int cpuup_callback(struct notifier_block *nfb,
+static int __devinit cpuup_callback(struct notifier_block *nfb,
 				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f03da33d914..eeacb0d695c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1450,7 +1450,7 @@ out:
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
-- 
cgit v1.2.3-70-g09d2


From 74b85f3790aa2550c617fe14439482e13e615fa0 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Tue, 27 Jun 2006 02:54:09 -0700
Subject: [PATCH] cpu hotplug: make cpu_notifier related notifier blocks
 __cpuinit only

Make notifier_blocks associated with cpu_notifier as __cpuinitdata.

__cpuinitdata makes sure that the data is init time only unless
CONFIG_HOTPLUG_CPU is defined.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/intel_cacheinfo.c |  2 +-
 arch/i386/kernel/cpuid.c               |  2 +-
 arch/i386/kernel/msr.c                 |  2 +-
 arch/ia64/kernel/palinfo.c             |  4 ++--
 arch/ia64/kernel/topology.c            |  2 +-
 arch/powerpc/mm/numa.c                 | 11 ++++++-----
 arch/x86_64/kernel/mce.c               |  2 +-
 drivers/base/topology.c                |  2 +-
 drivers/cpufreq/cpufreq.c              |  2 +-
 mm/page-writeback.c                    |  2 +-
 mm/page_alloc.c                        |  2 +-
 mm/slab.c                              |  4 +++-
 12 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 1d4ab104798..e9f0b928b0a 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -747,7 +747,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cacheinfo_cpu_notifier =
+static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
 {
     .notifier_call = cacheinfo_cpu_callback,
 };
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 1d9a4abcdfc..f6dfa9fb675 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -183,7 +183,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cpuid_class_cpu_notifier =
+static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
 {
 	.notifier_call = cpuid_class_cpu_callback,
 };
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 7a328230e54..d022cb8fd72 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -266,7 +266,7 @@ static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long acti
 	return NOTIFY_OK;
 }
 
-static struct notifier_block msr_class_cpu_notifier =
+static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
 {
 	.notifier_call = msr_class_cpu_callback,
 };
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index 6386f63c413..303a9afcf2a 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -959,7 +959,7 @@ remove_palinfo_proc_entries(unsigned int hcpu)
 	}
 }
 
-static int __devinit palinfo_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit palinfo_cpu_callback(struct notifier_block *nfb,
 								unsigned long action,
 								void *hcpu)
 {
@@ -978,7 +978,7 @@ static int __devinit palinfo_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block palinfo_cpu_notifier =
+static struct notifier_block __cpuinitdata palinfo_cpu_notifier =
 {
 	.notifier_call = palinfo_cpu_callback,
 	.priority = 0,
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index f07c382b57b..5511d9c6c70 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -422,7 +422,7 @@ static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cache_cpu_notifier =
+static struct notifier_block __cpuinitdata cache_cpu_notifier =
 {
 	.notifier_call = cache_cpu_callback
 };
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index aa98cb3b59d..fbe23933f73 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -334,7 +334,7 @@ out:
 	return nid;
 }
 
-static int cpu_numa_callback(struct notifier_block *nfb,
+static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
 			     unsigned long action,
 			     void *hcpu)
 {
@@ -609,14 +609,15 @@ static void __init *careful_allocation(int nid, unsigned long size,
 	return (void *)ret;
 }
 
+static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+	.notifier_call = cpu_numa_callback,
+	.priority = 1 /* Must run before sched domains notifier. */
+};
+
 void __init do_init_bootmem(void)
 {
 	int nid;
 	unsigned int i;
-	static struct notifier_block ppc64_numa_nb = {
-		.notifier_call = cpu_numa_callback,
-		.priority = 1 /* Must run before sched domains notifier. */
-	};
 
 	min_low_pfn = 0;
 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index efe8500a5b9..88845674c66 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -647,7 +647,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-static struct notifier_block mce_cpu_notifier = {
+static struct notifier_block __cpuinitdata mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
 
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 915810f6237..c2d62163238 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -125,7 +125,7 @@ static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block topology_cpu_notifier =
+static struct notifier_block __cpuinitdata topology_cpu_notifier =
 {
 	.notifier_call = topology_cpu_callback,
 };
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 3533e26f837..35e0b9ceecf 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1533,7 +1533,7 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cpufreq_cpu_notifier =
+static struct notifier_block __cpuinitdata cpufreq_cpu_notifier =
 {
     .notifier_call = cpufreq_cpu_callback,
 };
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8ccf6f1b147..8ac6bfb4007 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -523,7 +523,7 @@ ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 	return 0;
 }
 
-static struct notifier_block ratelimit_nb = {
+static struct notifier_block __cpuinitdata ratelimit_nb = {
 	.notifier_call	= ratelimit_handler,
 	.next		= NULL,
 };
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e9fb2d4064c..dafd12ec7a0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2031,7 +2031,7 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 	return ret;
 }
 
-static struct notifier_block pageset_notifier =
+static struct notifier_block __cpuinitdata pageset_notifier =
 	{ &pageset_cpuup_callback, NULL, 0 };
 
 void __init setup_per_cpu_pageset(void)
diff --git a/mm/slab.c b/mm/slab.c
index 631c0feb964..d1d55279202 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1255,7 +1255,9 @@ bad:
 	return NOTIFY_BAD;
 }
 
-static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
+static struct notifier_block __cpuinitdata cpucache_notifier = {
+	&cpuup_callback, NULL, 0
+};
 
 /*
  * swap the static kmem_list3 with kmalloced memory
-- 
cgit v1.2.3-70-g09d2


From 26c2143b63b8078d08d562733716de142927e17a Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Tue, 27 Jun 2006 02:54:10 -0700
Subject: [PATCH] cpu hotplug: make cpu_notifier related notifier calls
 __cpuinit only

Make notifier_calls associated with cpu_notifier as __cpuinit.

__cpuinit makes sure that the function is init time only unless
CONFIG_HOTPLUG_CPU is defined.

[akpm@osdl.org: section fix]
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c      | 7 ++++---
 mm/page-writeback.c | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/kernel/sched.c b/kernel/sched.c
index b8deddb7833..3e57712aefd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4734,8 +4734,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  */
-static int migration_call(struct notifier_block *nfb, unsigned long action,
-			  void *hcpu)
+static int __cpuinit migration_call(struct notifier_block *nfb,
+			unsigned long action,
+			void *hcpu)
 {
 	int cpu = (long)hcpu;
 	struct task_struct *p;
@@ -4805,7 +4806,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
 /* Register at highest priority so that task migration (migrate_all_tasks)
  * happens before everything else.
  */
-static struct notifier_block __devinitdata migration_notifier = {
+static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
 	.priority = 10
 };
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8ac6bfb4007..4ec7026c7ba 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -516,7 +516,7 @@ static void set_ratelimit(void)
 		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 }
 
-static int
+static int __cpuinit
 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 {
 	set_ratelimit();
-- 
cgit v1.2.3-70-g09d2


From f9b8404cf8f8456dfa83459510762b700dc00385 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:49 -0700
Subject: [PATCH] pi-futex: introduce debug_check_no_locks_freed()

Add debug_check_no_locks_freed(), as a central inline to add
bad-lock-free-debugging functionality to.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pageattr.c |  4 ++--
 include/linux/mm.h      | 10 ++++++++--
 mm/page_alloc.c         |  4 ++--
 mm/slab.c               |  2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 0887b34bc59..353a836ed63 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -229,8 +229,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	if (PageHighMem(page))
 		return;
 	if (!enable)
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 numpages * PAGE_SIZE);
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
 
 	/* the return value is ignored - the calls cannot fail,
 	 * large pages are disabled at boot time.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ff1fa87df8d..0ac255720f3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1030,13 +1030,19 @@ static inline void vm_stat_account(struct mm_struct *mm,
 }
 #endif /* CONFIG_PROC_FS */
 
+static inline void
+debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+	mutex_debug_check_no_locks_freed(from, len);
+}
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	if (!PageHighMem(page) && !enable)
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 numpages * PAGE_SIZE);
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
 }
 #endif
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dafd12ec7a0..084a2de7e52 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -446,8 +446,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 
 	arch_free_page(page, order);
 	if (!PageHighMem(page))
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 PAGE_SIZE<<order);
+		debug_check_no_locks_freed(page_address(page),
+					   PAGE_SIZE<<order);
 
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
diff --git a/mm/slab.c b/mm/slab.c
index d1d55279202..f378d027c68 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3397,7 +3397,7 @@ void kfree(const void *objp)
 	local_irq_save(flags);
 	kfree_debugcheck(objp);
 	c = virt_to_cache(objp);
-	mutex_debug_check_no_locks_freed(objp, obj_size(c));
+	debug_check_no_locks_freed(objp, obj_size(c));
 	__cache_free(c, (void *)objp);
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3-70-g09d2


From e7eebaf6a81b956c989f184ee4b27277c88f8afe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:54:55 -0700
Subject: [PATCH] pi-futex: rt mutex debug

Runtime debugging functionality for rt-mutexes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h      |   1 +
 include/linux/rtmutex.h |  15 +-
 kernel/Makefile         |   1 +
 kernel/exit.c           |   1 +
 kernel/rtmutex-debug.c  | 513 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rtmutex-debug.h  |  37 ++++
 lib/Kconfig.debug       |  13 ++
 mm/slab.c               |   1 +
 8 files changed, 581 insertions(+), 1 deletion(-)
 create mode 100644 kernel/rtmutex-debug.c
 create mode 100644 kernel/rtmutex-debug.h

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ac255720f3..c41a1299b8c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1034,6 +1034,7 @@ static inline void
 debug_check_no_locks_freed(const void *from, unsigned long len)
 {
 	mutex_debug_check_no_locks_freed(from, len);
+	rt_mutex_debug_check_no_locks_freed(from, len);
 }
 
 #ifndef CONFIG_DEBUG_PAGEALLOC
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 12309c916c6..fa4a3b82ba7 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -40,6 +40,19 @@ struct rt_mutex {
 struct rt_mutex_waiter;
 struct hrtimer_sleeper;
 
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ extern int rt_mutex_debug_check_no_locks_freed(const void *from,
+						unsigned long len);
+ extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task);
+#else
+ static inline int rt_mutex_debug_check_no_locks_freed(const void *from,
+						       unsigned long len)
+ {
+	return 0;
+ }
+# define rt_mutex_debug_check_no_locks_held(task)	do { } while (0)
+#endif
+
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
 	, .name = #mutexname, .file = __FILE__, .line = __LINE__
@@ -48,7 +61,7 @@ struct hrtimer_sleeper;
 #else
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
 # define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL)
-# define rt_mutex_debug_task_free(t)		do { } while (0)
+# define rt_mutex_debug_task_free(t)			do { } while (0)
 #endif
 
 #define __RT_MUTEX_INITIALIZER(mutexname) \
diff --git a/kernel/Makefile b/kernel/Makefile
index 21df9a338ff..f9c92d34cde 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 304ef637be6..3e8a0282e9a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -929,6 +929,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
 	 */
 	mutex_debug_check_no_locks_held(tsk);
+	rt_mutex_debug_check_no_locks_held(tsk);
 
 	if (tsk->io_context)
 		exit_io_context();
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 00000000000..4aa8a2c9f45
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ *  Copyright (C) 2004  LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Copyright (C) 2006  Esben Nielsen
+ *  Copyright (C) 2006  Kihon Technologies Inc.,
+ *			Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+# define TRACE_WARN_ON(x)			WARN_ON(x)
+# define TRACE_BUG_ON(x)			BUG_ON(x)
+
+# define TRACE_OFF()						\
+do {								\
+	if (rt_trace_on) {					\
+		rt_trace_on = 0;				\
+		console_verbose();				\
+		if (spin_is_locked(&current->pi_lock))		\
+			spin_unlock(&current->pi_lock);		\
+		if (spin_is_locked(&current->held_list_lock))	\
+			spin_unlock(&current->held_list_lock);	\
+	}							\
+} while (0)
+
+# define TRACE_OFF_NOLOCK()					\
+do {								\
+	if (rt_trace_on) {					\
+		rt_trace_on = 0;				\
+		console_verbose();				\
+	}							\
+} while (0)
+
+# define TRACE_BUG_LOCKED()			\
+do {						\
+	TRACE_OFF();				\
+	BUG();					\
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c)		\
+do {						\
+	if (unlikely(c)) {			\
+		TRACE_OFF();			\
+		WARN_ON(1);			\
+	}					\
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c)			\
+do {						\
+	if (unlikely(c))			\
+		TRACE_BUG_LOCKED();		\
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c)	TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c)	do { } while (0)
+#endif
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we dont want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+int rt_trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+	rt_trace_on = 0;
+}
+
+static void printk_task(task_t *p)
+{
+	if (p)
+		printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_task_short(task_t *p)
+{
+	if (p)
+		printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+	if (lock->name)
+		printk(" [%p] {%s}\n",
+			lock, lock->name);
+	else
+		printk(" [%p] {%s:%d}\n",
+			lock, lock->file, lock->line);
+
+	if (print_owner && rt_mutex_owner(lock)) {
+		printk(".. ->owner: %p\n", lock->owner);
+		printk(".. held by:  ");
+		printk_task(rt_mutex_owner(lock));
+		printk("\n");
+	}
+	if (rt_mutex_owner(lock)) {
+		printk("... acquired at:               ");
+		print_symbol("%s\n", lock->acquire_ip);
+	}
+}
+
+static void printk_waiter(struct rt_mutex_waiter *w)
+{
+	printk("-------------------------\n");
+	printk("| waiter struct %p:\n", w);
+	printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+	       w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
+	       w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
+	       w->list_entry.prio);
+	printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+	       w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
+	       w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
+	       w->pi_list_entry.prio);
+	printk("\n| lock:\n");
+	printk_lock(w->lock, 1);
+	printk("| w->ti->task:\n");
+	printk_task(w->task);
+	printk("| blocked at:  ");
+	print_symbol("%s\n", w->ip);
+	printk("-------------------------\n");
+}
+
+static void show_task_locks(task_t *p)
+{
+	switch (p->state) {
+	case TASK_RUNNING:		printk("R"); break;
+	case TASK_INTERRUPTIBLE:	printk("S"); break;
+	case TASK_UNINTERRUPTIBLE:	printk("D"); break;
+	case TASK_STOPPED:		printk("T"); break;
+	case EXIT_ZOMBIE:		printk("Z"); break;
+	case EXIT_DEAD:			printk("X"); break;
+	default:			printk("?"); break;
+	}
+	printk_task(p);
+	if (p->pi_blocked_on) {
+		struct rt_mutex *lock = p->pi_blocked_on->lock;
+
+		printk(" blocked on:");
+		printk_lock(lock, 1);
+	} else
+		printk(" (not blocked)\n");
+}
+
+void rt_mutex_show_held_locks(task_t *task, int verbose)
+{
+	struct list_head *curr, *cursor = NULL;
+	struct rt_mutex *lock;
+	task_t *t;
+	unsigned long flags;
+	int count = 0;
+
+	if (!rt_trace_on)
+		return;
+
+	if (verbose) {
+		printk("------------------------------\n");
+		printk("| showing all locks held by: |  (");
+		printk_task_short(task);
+		printk("):\n");
+		printk("------------------------------\n");
+	}
+
+next:
+	spin_lock_irqsave(&task->held_list_lock, flags);
+	list_for_each(curr, &task->held_list_head) {
+		if (cursor && curr != cursor)
+			continue;
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+		t = rt_mutex_owner(lock);
+		WARN_ON(t != task);
+		count++;
+		cursor = curr->next;
+		spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+		printk("\n#%03d:            ", count);
+		printk_lock(lock, 0);
+		goto next;
+	}
+	spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+	printk("\n");
+}
+
+void rt_mutex_show_all_locks(void)
+{
+	task_t *g, *p;
+	int count = 10;
+	int unlock = 1;
+
+	printk("\n");
+	printk("----------------------\n");
+	printk("| showing all tasks: |\n");
+	printk("----------------------\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	}
+	if (count != 10)
+		printk(" locked it.\n");
+
+	do_each_thread(g, p) {
+		show_task_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	printk("\n");
+
+	printk("-----------------------------------------\n");
+	printk("| showing all locks held in the system: |\n");
+	printk("-----------------------------------------\n");
+
+	do_each_thread(g, p) {
+		rt_mutex_show_held_locks(p, 0);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+
+void rt_mutex_debug_check_no_locks_held(task_t *task)
+{
+	struct rt_mutex_waiter *w;
+	struct list_head *curr;
+	struct rt_mutex *lock;
+
+	if (!rt_trace_on)
+		return;
+	if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
+		printk("BUG: PI priority boost leaked!\n");
+		printk_task(task);
+		printk("\n");
+	}
+	if (list_empty(&task->held_list_head))
+		return;
+
+	spin_lock(&task->pi_lock);
+	plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
+		TRACE_OFF();
+
+		printk("hm, PI interest held at exit time? Task:\n");
+		printk_task(task);
+		printk_waiter(w);
+		return;
+	}
+	spin_unlock(&task->pi_lock);
+
+	list_for_each(curr, &task->held_list_head) {
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+
+		printk("BUG: %s/%d, lock held at task exit time!\n",
+		       task->comm, task->pid);
+		printk_lock(lock, 1);
+		if (rt_mutex_owner(lock) != task)
+			printk("exiting task is not even the owner??\n");
+	}
+}
+
+int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+	const void *to = from + len;
+	struct list_head *curr;
+	struct rt_mutex *lock;
+	unsigned long flags;
+	void *lock_addr;
+
+	if (!rt_trace_on)
+		return 0;
+
+	spin_lock_irqsave(&current->held_list_lock, flags);
+	list_for_each(curr, &current->held_list_head) {
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+		lock_addr = lock;
+		if (lock_addr < from || lock_addr >= to)
+			continue;
+		TRACE_OFF();
+
+		printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
+			current->comm, current->pid, lock, from, to);
+		dump_stack();
+		printk_lock(lock, 1);
+		if (rt_mutex_owner(lock) != current)
+			printk("freeing task is not even the owner??\n");
+		return 1;
+	}
+	spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+	return 0;
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+	WARN_ON(!plist_head_empty(&task->pi_waiters));
+	WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+			     struct rt_mutex *lock)
+{
+	struct task_struct *task;
+
+	if (!rt_trace_on || detect || !act_waiter)
+		return;
+
+	task = rt_mutex_owner(act_waiter->lock);
+	if (task && task != current) {
+		act_waiter->deadlock_task_pid = task->pid;
+		act_waiter->deadlock_lock = lock;
+	}
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+	struct task_struct *task;
+
+	if (!waiter->deadlock_lock || !rt_trace_on)
+		return;
+
+	task = find_task_by_pid(waiter->deadlock_task_pid);
+	if (!task)
+		return;
+
+	TRACE_OFF_NOLOCK();
+
+	printk("\n============================================\n");
+	printk(  "[ BUG: circular locking deadlock detected! ]\n");
+	printk(  "--------------------------------------------\n");
+	printk("%s/%d is deadlocking current task %s/%d\n\n",
+	       task->comm, task->pid, current->comm, current->pid);
+
+	printk("\n1) %s/%d is trying to acquire this lock:\n",
+	       current->comm, current->pid);
+	printk_lock(waiter->lock, 1);
+
+	printk("... trying at:                 ");
+	print_symbol("%s\n", waiter->ip);
+
+	printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
+	printk_lock(waiter->deadlock_lock, 1);
+
+	rt_mutex_show_held_locks(current, 1);
+	rt_mutex_show_held_locks(task, 1);
+
+	printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+	show_stack(task, NULL);
+	printk("\n%s/%d's [current] stackdump:\n\n",
+	       current->comm, current->pid);
+	dump_stack();
+	rt_mutex_show_all_locks();
+	printk("[ turning off deadlock detection."
+	       "Please report this trace. ]\n\n");
+	local_irq_disable();
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&current->held_list_lock, flags);
+		list_add_tail(&lock->held_list_entry, &current->held_list_head);
+		spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+		lock->acquire_ip = ip;
+	}
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+		TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&current->held_list_lock, flags);
+		list_del_init(&lock->held_list_entry);
+		spin_unlock_irqrestore(&current->held_list_lock, flags);
+	}
+}
+
+void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+			       struct task_struct *powner __IP_DECL__)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&powner->held_list_lock, flags);
+		list_add_tail(&lock->held_list_entry, &powner->held_list_head);
+		spin_unlock_irqrestore(&powner->held_list_lock, flags);
+
+		lock->acquire_ip = ip;
+	}
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		struct task_struct *owner = rt_mutex_owner(lock);
+
+		TRACE_WARN_ON_LOCKED(!owner);
+		TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&owner->held_list_lock, flags);
+		list_del_init(&lock->held_list_entry);
+		spin_unlock_irqrestore(&owner->held_list_lock, flags);
+	}
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+	memset(waiter, 0x11, sizeof(*waiter));
+	plist_node_init(&waiter->list_entry, MAX_PRIO);
+	plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+	TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
+	TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+	TRACE_WARN_ON(waiter->task);
+	memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+	void *addr = lock;
+
+	if (rt_trace_on) {
+		rt_mutex_debug_check_no_locks_freed(addr,
+						    sizeof(struct rt_mutex));
+		INIT_LIST_HEAD(&lock->held_list_entry);
+		lock->name = name;
+	}
+}
+
+void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 00000000000..7612fbc62d7
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+#define __IP_DECL__		, unsigned long ip
+#define __IP__			, ip
+#define __RET_IP__		, (unsigned long)__builtin_return_address(0)
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+				      struct task_struct *powner __IP_DECL__);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+				    struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w)			\
+	do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+						 int detect)
+{
+	return (waiter != NULL);
+}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8bab0102ac7..06d3ea1133c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -107,6 +107,19 @@ config DEBUG_MUTEXES
 	 This allows mutex semantics violations and mutex related deadlocks
 	 (lockups) to be detected and reported automatically.
 
+config DEBUG_RT_MUTEXES
+	bool "RT Mutex debugging, deadlock detection"
+	default y
+	depends on DEBUG_KERNEL && RT_MUTEXES
+	help
+	 This allows rt mutex semantics violations and rt mutex related
+	 deadlocks (lockups) to be detected and reported automatically.
+
+config DEBUG_PI_LIST
+	bool
+	default y
+	depends on DEBUG_RT_MUTEXES
+
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
 	depends on DEBUG_KERNEL
diff --git a/mm/slab.c b/mm/slab.c
index f378d027c68..233e39d14ca 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,6 +107,7 @@
 #include	<linux/nodemask.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
+#include	<linux/rtmutex.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
-- 
cgit v1.2.3-70-g09d2


From f5e54d6e53a20cef45af7499e86164f0e0d16bb2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 28 Jun 2006 04:26:44 -0700
Subject: [PATCH] mark address_space_operations const

Same as with already do with the file operations: keep them in .rodata and
prevents people from doing runtime patching.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/loop.c        | 4 ++--
 drivers/block/rd.c          | 2 +-
 fs/9p/v9fs_vfs.h            | 2 +-
 fs/9p/vfs_addr.c            | 2 +-
 fs/adfs/inode.c             | 2 +-
 fs/affs/affs.h              | 6 +++---
 fs/affs/file.c              | 4 ++--
 fs/affs/symlink.c           | 2 +-
 fs/afs/file.c               | 2 +-
 fs/afs/internal.h           | 2 +-
 fs/befs/linuxvfs.c          | 2 +-
 fs/bfs/bfs.h                | 2 +-
 fs/bfs/file.c               | 2 +-
 fs/block_dev.c              | 2 +-
 fs/buffer.c                 | 2 +-
 fs/cifs/cifsfs.h            | 4 ++--
 fs/cifs/file.c              | 4 ++--
 fs/coda/symlink.c           | 2 +-
 fs/configfs/inode.c         | 2 +-
 fs/cramfs/inode.c           | 4 ++--
 fs/efs/inode.c              | 2 +-
 fs/efs/symlink.c            | 2 +-
 fs/ext2/ext2.h              | 6 +++---
 fs/ext2/inode.c             | 6 +++---
 fs/ext3/inode.c             | 6 +++---
 fs/fat/inode.c              | 2 +-
 fs/freevxfs/vxfs_immed.c    | 2 +-
 fs/freevxfs/vxfs_inode.c    | 6 +++---
 fs/freevxfs/vxfs_subr.c     | 2 +-
 fs/fuse/file.c              | 2 +-
 fs/hfs/hfs_fs.h             | 4 ++--
 fs/hfs/inode.c              | 4 ++--
 fs/hfsplus/hfsplus_fs.h     | 4 ++--
 fs/hfsplus/inode.c          | 4 ++--
 fs/hostfs/hostfs_kern.c     | 6 +++---
 fs/hpfs/file.c              | 2 +-
 fs/hpfs/hpfs_fn.h           | 4 ++--
 fs/hpfs/namei.c             | 2 +-
 fs/hugetlbfs/inode.c        | 4 ++--
 fs/inode.c                  | 2 +-
 fs/isofs/compress.c         | 2 +-
 fs/isofs/inode.c            | 2 +-
 fs/isofs/isofs.h            | 2 +-
 fs/isofs/rock.c             | 2 +-
 fs/isofs/zisofs.h           | 2 +-
 fs/jffs/inode-v23.c         | 4 ++--
 fs/jffs2/file.c             | 2 +-
 fs/jffs2/os-linux.h         | 2 +-
 fs/jfs/inode.c              | 2 +-
 fs/jfs/jfs_inode.h          | 2 +-
 fs/jfs/jfs_metapage.c       | 2 +-
 fs/jfs/jfs_metapage.h       | 2 +-
 fs/minix/inode.c            | 2 +-
 fs/ncpfs/inode.c            | 2 +-
 fs/ncpfs/symlink.c          | 2 +-
 fs/nfs/file.c               | 2 +-
 fs/ntfs/aops.c              | 4 ++--
 fs/ntfs/ntfs.h              | 4 ++--
 fs/ocfs2/aops.c             | 2 +-
 fs/ocfs2/inode.h            | 2 +-
 fs/qnx4/inode.c             | 2 +-
 fs/ramfs/file-mmu.c         | 2 +-
 fs/ramfs/file-nommu.c       | 2 +-
 fs/ramfs/internal.h         | 2 +-
 fs/reiserfs/inode.c         | 2 +-
 fs/romfs/inode.c            | 2 +-
 fs/smbfs/file.c             | 2 +-
 fs/smbfs/proto.h            | 2 +-
 fs/sysfs/inode.c            | 2 +-
 fs/sysv/itree.c             | 2 +-
 fs/sysv/sysv.h              | 2 +-
 fs/udf/file.c               | 2 +-
 fs/udf/inode.c              | 2 +-
 fs/udf/symlink.c            | 2 +-
 fs/udf/udfdecl.h            | 6 +++---
 fs/ufs/inode.c              | 2 +-
 fs/xfs/linux-2.6/xfs_aops.c | 2 +-
 fs/xfs/linux-2.6/xfs_aops.h | 2 +-
 fs/xfs/linux-2.6/xfs_buf.c  | 2 +-
 include/linux/coda_linux.h  | 4 ++--
 include/linux/efs_fs.h      | 2 +-
 include/linux/fs.h          | 4 ++--
 include/linux/nfs_fs.h      | 2 +-
 include/linux/reiserfs_fs.h | 2 +-
 include/linux/ufs_fs.h      | 2 +-
 mm/filemap.c                | 4 ++--
 mm/filemap_xip.c            | 2 +-
 mm/shmem.c                  | 4 ++--
 mm/swap_state.c             | 2 +-
 89 files changed, 121 insertions(+), 121 deletions(-)

(limited to 'mm')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3c74ea729fc..18dd026f470 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -210,7 +210,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 {
 	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
 	struct address_space *mapping = file->f_mapping;
-	struct address_space_operations *aops = mapping->a_ops;
+	const struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
 	int len, ret;
@@ -784,7 +784,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 
 	error = -EINVAL;
 	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-		struct address_space_operations *aops = mapping->a_ops;
+		const struct address_space_operations *aops = mapping->a_ops;
 		/*
 		 * If we can't read - sorry. If we only can't write - well,
 		 * it's going to be read-only.
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 940bfd7951e..0378da04cfa 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -191,7 +191,7 @@ static int ramdisk_set_page_dirty(struct page *page)
 	return 0;
 }
 
-static struct address_space_operations ramdisk_aops = {
+static const struct address_space_operations ramdisk_aops = {
 	.readpage	= ramdisk_readpage,
 	.prepare_write	= ramdisk_prepare_write,
 	.commit_write	= ramdisk_commit_write,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f867b8d3e97..450b0c1b385 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -38,7 +38,7 @@
  */
 
 extern struct file_system_type v9fs_fs_type;
-extern struct address_space_operations v9fs_addr_operations;
+extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
 extern struct dentry_operations v9fs_dentry_operations;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index efda46fb64d..d4f0aa3c87f 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -103,6 +103,6 @@ UnmapAndUnlock:
 	return retval;
 }
 
-struct address_space_operations v9fs_addr_operations = {
+const struct address_space_operations v9fs_addr_operations = {
       .readpage = v9fs_vfs_readpage,
 };
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index a02802a3079..534f3eecc98 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -72,7 +72,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, adfs_get_block);
 }
 
-static struct address_space_operations adfs_aops = {
+static const struct address_space_operations adfs_aops = {
 	.readpage	= adfs_readpage,
 	.writepage	= adfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a43a876742b..0ddd4cc0d1a 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -195,9 +195,9 @@ extern struct inode_operations   affs_symlink_inode_operations;
 extern const struct file_operations	 affs_file_operations;
 extern const struct file_operations	 affs_file_operations_ofs;
 extern const struct file_operations	 affs_dir_operations;
-extern struct address_space_operations	 affs_symlink_aops;
-extern struct address_space_operations	 affs_aops;
-extern struct address_space_operations	 affs_aops_ofs;
+extern const struct address_space_operations	 affs_symlink_aops;
+extern const struct address_space_operations	 affs_aops;
+extern const struct address_space_operations	 affs_aops_ofs;
 
 extern struct dentry_operations	 affs_dentry_operations;
 extern struct dentry_operations	 affs_dentry_operations_intl;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 7076262af39..3de8590e4f6 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,7 +406,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,affs_get_block);
 }
-struct address_space_operations affs_aops = {
+const struct address_space_operations affs_aops = {
 	.readpage = affs_readpage,
 	.writepage = affs_writepage,
 	.sync_page = block_sync_page,
@@ -759,7 +759,7 @@ out:
 	goto done;
 }
 
-struct address_space_operations affs_aops_ofs = {
+const struct address_space_operations affs_aops_ofs = {
 	.readpage = affs_readpage_ofs,
 	//.writepage = affs_writepage_ofs,
 	//.sync_page = affs_sync_page_ofs,
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 426f0f094f2..f802256a593 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -66,7 +66,7 @@ fail:
 	return err;
 }
 
-struct address_space_operations affs_symlink_aops = {
+const struct address_space_operations affs_symlink_aops = {
 	.readpage	= affs_symlink_readpage,
 };
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7bb716887e2..67d6634101f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -35,7 +35,7 @@ struct inode_operations afs_file_inode_operations = {
 	.getattr	= afs_inode_getattr,
 };
 
-struct address_space_operations afs_fs_aops = {
+const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_file_readpage,
 	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 72febdf9a35..e88b3b65ae4 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -69,7 +69,7 @@ extern const struct file_operations afs_dir_file_operations;
 /*
  * file.c
  */
-extern struct address_space_operations afs_fs_aops;
+extern const struct address_space_operations afs_fs_aops;
 extern struct inode_operations afs_file_inode_operations;
 
 #ifdef AFS_CACHING_SUPPORT
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 08201fab26c..a83e889a97c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -73,7 +73,7 @@ static struct inode_operations befs_dir_inode_operations = {
 	.lookup		= befs_lookup,
 };
 
-static struct address_space_operations befs_aops = {
+static const struct address_space_operations befs_aops = {
 	.readpage	= befs_readpage,
 	.sync_page	= block_sync_page,
 	.bmap		= befs_bmap,
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 9d791004b21..31973bbbf05 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -50,7 +50,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
 /* file.c */
 extern struct inode_operations bfs_file_inops;
 extern const struct file_operations bfs_file_operations;
-extern struct address_space_operations bfs_aops;
+extern const struct address_space_operations bfs_aops;
 
 /* dir.c */
 extern struct inode_operations bfs_dir_inops;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index d83cd74a2e4..3d5aca28a0a 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -153,7 +153,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, bfs_get_block);
 }
 
-struct address_space_operations bfs_aops = {
+const struct address_space_operations bfs_aops = {
 	.readpage	= bfs_readpage,
 	.writepage	= bfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 028d9fb9c2d..7f7600e2381 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1095,7 +1095,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
 }
 
-struct address_space_operations def_blk_aops = {
+const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/buffer.c b/fs/buffer.c
index f23bb647db4..e9994722f4a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2598,7 +2598,7 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned to;
 	struct page *page;
-	struct address_space_operations *a_ops = mapping->a_ops;
+	const struct address_space_operations *a_ops = mapping->a_ops;
 	char *kaddr;
 	int ret = 0;
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a6384d83fde..8f75c6f2470 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -32,8 +32,8 @@
 #define TRUE 1
 #endif
 
-extern struct address_space_operations cifs_addr_ops;
-extern struct address_space_operations cifs_addr_ops_smallbuf;
+extern const struct address_space_operations cifs_addr_ops;
+extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
 extern struct super_operations cifs_super_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e9c1573f6aa..5861eb42e62 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1942,7 +1942,7 @@ static int cifs_prepare_write(struct file *file, struct page *page,
 	return 0;
 }
 
-struct address_space_operations cifs_addr_ops = {
+const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
 	.writepage = cifs_writepage,
@@ -1959,7 +1959,7 @@ struct address_space_operations cifs_addr_ops = {
  * contain the header plus one complete page of data.  Otherwise, we need
  * to leave cifs_readpages out of the address space operations.
  */
-struct address_space_operations cifs_addr_ops_smallbuf = {
+const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.readpage = cifs_readpage,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index b35e5bbd9c9..76e00a65a75 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -50,6 +50,6 @@ fail:
 	return error;
 }
 
-struct address_space_operations coda_symlink_aops = {
+const struct address_space_operations coda_symlink_aops = {
 	.readpage	= coda_symlink_filler,
 };
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c153bd9534c..e14488ca641 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -38,7 +38,7 @@
 
 extern struct super_block * configfs_sb;
 
-static struct address_space_operations configfs_aops = {
+static const struct address_space_operations configfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index c45d7386080..223c0431042 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -30,7 +30,7 @@
 static struct super_operations cramfs_ops;
 static struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
-static struct address_space_operations cramfs_aops;
+static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
 
@@ -501,7 +501,7 @@ static int cramfs_readpage(struct file *file, struct page * page)
 	return 0;
 }
 
-static struct address_space_operations cramfs_aops = {
+static const struct address_space_operations cramfs_aops = {
 	.readpage = cramfs_readpage
 };
 
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 180607f9314..174696f9bf1 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -21,7 +21,7 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,efs_get_block);
 }
-static struct address_space_operations efs_aops = {
+static const struct address_space_operations efs_aops = {
 	.readpage = efs_readpage,
 	.sync_page = block_sync_page,
 	.bmap = _efs_bmap
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 3d9a350e3e7..e249cf733a6 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -53,6 +53,6 @@ fail:
 	return err;
 }
 
-struct address_space_operations efs_symlink_aops = {
+const struct address_space_operations efs_symlink_aops = {
 	.readpage	= efs_symlink_readpage
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9f74a62be55..e65a019fc7a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -162,9 +162,9 @@ extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
 
 /* inode.c */
-extern struct address_space_operations ext2_aops;
-extern struct address_space_operations ext2_aops_xip;
-extern struct address_space_operations ext2_nobh_aops;
+extern const struct address_space_operations ext2_aops;
+extern const struct address_space_operations ext2_aops_xip;
+extern const struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
 extern struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 04af9c45dce..fb4d3220eb8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -684,7 +684,7 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
-struct address_space_operations ext2_aops = {
+const struct address_space_operations ext2_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
@@ -697,12 +697,12 @@ struct address_space_operations ext2_aops = {
 	.migratepage		= buffer_migrate_page,
 };
 
-struct address_space_operations ext2_aops_xip = {
+const struct address_space_operations ext2_aops_xip = {
 	.bmap			= ext2_bmap,
 	.get_xip_page		= ext2_get_xip_page,
 };
 
-struct address_space_operations ext2_nobh_aops = {
+const struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_nobh_writepage,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 0321e1b9034..f804d5e9d60 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1698,7 +1698,7 @@ static int ext3_journalled_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static struct address_space_operations ext3_ordered_aops = {
+static const struct address_space_operations ext3_ordered_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
@@ -1712,7 +1712,7 @@ static struct address_space_operations ext3_ordered_aops = {
 	.migratepage	= buffer_migrate_page,
 };
 
-static struct address_space_operations ext3_writeback_aops = {
+static const struct address_space_operations ext3_writeback_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_writeback_writepage,
@@ -1726,7 +1726,7 @@ static struct address_space_operations ext3_writeback_aops = {
 	.migratepage	= buffer_migrate_page,
 };
 
-static struct address_space_operations ext3_journalled_aops = {
+static const struct address_space_operations ext3_journalled_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_journalled_writepage,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7c35d582ec1..31b7174176b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -196,7 +196,7 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, fat_get_block);
 }
 
-static struct address_space_operations fat_aops = {
+static const struct address_space_operations fat_aops = {
 	.readpage	= fat_readpage,
 	.readpages	= fat_readpages,
 	.writepage	= fat_writepage,
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index 6f5df1700e9..4e25f3fbed8 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -56,7 +56,7 @@ struct inode_operations vxfs_immed_symlink_iops = {
 /*
  * Adress space operations for immed files and directories.
  */
-struct address_space_operations vxfs_immed_aops = {
+const struct address_space_operations vxfs_immed_aops = {
 	.readpage =		vxfs_immed_readpage,
 };
 
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f544aae9169..ca6a3971477 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -41,8 +41,8 @@
 #include "vxfs_extern.h"
 
 
-extern struct address_space_operations vxfs_aops;
-extern struct address_space_operations vxfs_immed_aops;
+extern const struct address_space_operations vxfs_aops;
+extern const struct address_space_operations vxfs_immed_aops;
 
 extern struct inode_operations vxfs_immed_symlink_iops;
 
@@ -295,7 +295,7 @@ vxfs_read_inode(struct inode *ip)
 {
 	struct super_block		*sbp = ip->i_sb;
 	struct vxfs_inode_info		*vip;
-	struct address_space_operations	*aops;
+	const struct address_space_operations	*aops;
 	ino_t				ino = ip->i_ino;
 
 	if (!(vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist)))
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index c1be118fc06..decac62efe5 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -42,7 +42,7 @@
 static int		vxfs_readpage(struct file *, struct page *);
 static sector_t		vxfs_bmap(struct address_space *, sector_t);
 
-struct address_space_operations vxfs_aops = {
+const struct address_space_operations vxfs_aops = {
 	.readpage =		vxfs_readpage,
 	.bmap =			vxfs_bmap,
 	.sync_page =		block_sync_page,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 28aa81eae2c..63614ed1633 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -770,7 +770,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	/* no mmap and sendfile */
 };
 
-static struct address_space_operations fuse_file_aops  = {
+static const struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.prepare_write	= fuse_prepare_write,
 	.commit_write	= fuse_commit_write,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 3ed8663a8db..735332dfd1b 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -182,8 +182,8 @@ extern void hfs_file_truncate(struct inode *);
 extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 /* inode.c */
-extern struct address_space_operations hfs_aops;
-extern struct address_space_operations hfs_btree_aops;
+extern const struct address_space_operations hfs_aops;
+extern const struct address_space_operations hfs_btree_aops;
 
 extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2d4ced22201..315cf44a90b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -114,7 +114,7 @@ static int hfs_writepages(struct address_space *mapping,
 	return mpage_writepages(mapping, wbc, hfs_get_block);
 }
 
-struct address_space_operations hfs_btree_aops = {
+const struct address_space_operations hfs_btree_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
 	.sync_page	= block_sync_page,
@@ -124,7 +124,7 @@ struct address_space_operations hfs_btree_aops = {
 	.releasepage	= hfs_releasepage,
 };
 
-struct address_space_operations hfs_aops = {
+const struct address_space_operations hfs_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ae393637a0..8a1ca5ef7ad 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -323,8 +323,8 @@ int hfsplus_file_extend(struct inode *);
 void hfsplus_file_truncate(struct inode *);
 
 /* inode.c */
-extern struct address_space_operations hfsplus_aops;
-extern struct address_space_operations hfsplus_btree_aops;
+extern const struct address_space_operations hfsplus_aops;
+extern const struct address_space_operations hfsplus_btree_aops;
 
 void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
 void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index acf66dba3e0..924ecdef809 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -109,7 +109,7 @@ static int hfsplus_writepages(struct address_space *mapping,
 	return mpage_writepages(mapping, wbc, hfsplus_get_block);
 }
 
-struct address_space_operations hfsplus_btree_aops = {
+const struct address_space_operations hfsplus_btree_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
 	.sync_page	= block_sync_page,
@@ -119,7 +119,7 @@ struct address_space_operations hfsplus_btree_aops = {
 	.releasepage	= hfsplus_releasepage,
 };
 
-struct address_space_operations hfsplus_aops = {
+const struct address_space_operations hfsplus_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 8e0d37743e7..b82e3d9c879 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -54,7 +54,7 @@ static int append = 0;
 
 static struct inode_operations hostfs_iops;
 static struct inode_operations hostfs_dir_iops;
-static struct address_space_operations hostfs_link_aops;
+static const struct address_space_operations hostfs_link_aops;
 
 #ifndef MODULE
 static int __init hostfs_args(char *options, int *add)
@@ -518,7 +518,7 @@ int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
 	return(err);
 }
 
-static struct address_space_operations hostfs_aops = {
+static const struct address_space_operations hostfs_aops = {
 	.writepage 	= hostfs_writepage,
 	.readpage	= hostfs_readpage,
 	.set_page_dirty = __set_page_dirty_nobuffers,
@@ -935,7 +935,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 	return(err);
 }
 
-static struct address_space_operations hostfs_link_aops = {
+static const struct address_space_operations hostfs_link_aops = {
 	.readpage	= hostfs_link_readpage,
 };
 
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d3b9fffe45a..d9eb19b7b8a 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -99,7 +99,7 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,hpfs_get_block);
 }
-struct address_space_operations hpfs_aops = {
+const struct address_space_operations hpfs_aops = {
 	.readpage = hpfs_readpage,
 	.writepage = hpfs_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 29b7a3e5517..f687d54ed44 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int);
 int hpfs_file_fsync(struct file *, struct dentry *, int);
 extern const struct file_operations hpfs_file_ops;
 extern struct inode_operations hpfs_file_iops;
-extern struct address_space_operations hpfs_aops;
+extern const struct address_space_operations hpfs_aops;
 
 /* inode.c */
 
@@ -304,7 +304,7 @@ void hpfs_decide_conv(struct inode *, unsigned char *, unsigned);
 /* namei.c */
 
 extern struct inode_operations hpfs_dir_iops;
-extern struct address_space_operations hpfs_symlink_aops;
+extern const struct address_space_operations hpfs_symlink_aops;
 
 static inline struct hpfs_inode_info *hpfs_i(struct inode *inode)
 {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a03abb12c61..59e7dc182a0 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -538,7 +538,7 @@ fail:
 	return err;
 }
 
-struct address_space_operations hpfs_symlink_aops = {
+const struct address_space_operations hpfs_symlink_aops = {
 	.readpage	= hpfs_symlink_readpage
 };
 	
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e6410d8edd0..6449cb69796 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -34,7 +34,7 @@
 #define HUGETLBFS_MAGIC	0x958458f6
 
 static struct super_operations hugetlbfs_ops;
-static struct address_space_operations hugetlbfs_aops;
+static const struct address_space_operations hugetlbfs_aops;
 const struct file_operations hugetlbfs_file_operations;
 static struct inode_operations hugetlbfs_dir_inode_operations;
 static struct inode_operations hugetlbfs_inode_operations;
@@ -547,7 +547,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
 
-static struct address_space_operations hugetlbfs_aops = {
+static const struct address_space_operations hugetlbfs_aops = {
 	.readpage	= hugetlbfs_readpage,
 	.prepare_write	= hugetlbfs_prepare_write,
 	.commit_write	= hugetlbfs_commit_write,
diff --git a/fs/inode.c b/fs/inode.c
index 3a2446a27d2..f42961eb983 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,7 +102,7 @@ static kmem_cache_t * inode_cachep __read_mostly;
 
 static struct inode *alloc_inode(struct super_block *sb)
 {
-	static struct address_space_operations empty_aops;
+	static const struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
 	struct inode *inode;
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 4917315db73..3a39158cca9 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -312,7 +312,7 @@ eio:
 	return err;
 }
 
-struct address_space_operations zisofs_aops = {
+const struct address_space_operations zisofs_aops = {
 	.readpage = zisofs_readpage,
 	/* No sync_page operation supported? */
 	/* No bmap operation supported */
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f9c8ba1fa1..bb11c7fb401 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1054,7 +1054,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,isofs_get_block);
 }
 
-static struct address_space_operations isofs_aops = {
+static const struct address_space_operations isofs_aops = {
 	.readpage = isofs_readpage,
 	.sync_page = block_sync_page,
 	.bmap = _isofs_bmap
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index b87ba066f5e..e6308c8b573 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -176,5 +176,5 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de,
 
 extern struct inode_operations isofs_dir_inode_operations;
 extern const struct file_operations isofs_dir_operations;
-extern struct address_space_operations isofs_symlink_aops;
+extern const struct address_space_operations isofs_symlink_aops;
 extern struct export_operations isofs_export_ops;
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 4326cb47f8f..f3a1db3098d 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -754,6 +754,6 @@ error:
 	return -EIO;
 }
 
-struct address_space_operations isofs_symlink_aops = {
+const struct address_space_operations isofs_symlink_aops = {
 	.readpage = rock_ridge_symlink_readpage
 };
diff --git a/fs/isofs/zisofs.h b/fs/isofs/zisofs.h
index d78485d101c..27379570915 100644
--- a/fs/isofs/zisofs.h
+++ b/fs/isofs/zisofs.h
@@ -15,7 +15,7 @@
  */
 
 #ifdef CONFIG_ZISOFS
-extern struct address_space_operations zisofs_aops;
+extern const struct address_space_operations zisofs_aops;
 extern int __init zisofs_init(void);
 extern void zisofs_cleanup(void);
 #endif
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 9e46ea6da75..93068697a9b 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -59,7 +59,7 @@ static const struct file_operations jffs_file_operations;
 static struct inode_operations jffs_file_inode_operations;
 static const struct file_operations jffs_dir_operations;
 static struct inode_operations jffs_dir_inode_operations;
-static struct address_space_operations jffs_address_operations;
+static const struct address_space_operations jffs_address_operations;
 
 kmem_cache_t     *node_cache = NULL;
 kmem_cache_t     *fm_cache = NULL;
@@ -1614,7 +1614,7 @@ jffs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 } /* jffs_ioctl()  */
 
 
-static struct address_space_operations jffs_address_operations = {
+static const struct address_space_operations jffs_address_operations = {
 	.readpage	= jffs_readpage,
 	.prepare_write	= jffs_prepare_write,
 	.commit_write	= jffs_commit_write,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index bb8844f40e4..3ed6e3e120b 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -62,7 +62,7 @@ struct inode_operations jffs2_file_inode_operations =
 	.removexattr =	jffs2_removexattr
 };
 
-struct address_space_operations jffs2_file_address_operations =
+const struct address_space_operations jffs2_file_address_operations =
 {
 	.readpage =	jffs2_readpage,
 	.prepare_write =jffs2_prepare_write,
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6b522356540..9f41fc01a37 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern struct inode_operations jffs2_dir_inode_operations;
 /* file.c */
 extern const struct file_operations jffs2_file_operations;
 extern struct inode_operations jffs2_file_inode_operations;
-extern struct address_space_operations jffs2_file_address_operations;
+extern const struct address_space_operations jffs2_file_address_operations;
 int jffs2_fsync(struct file *, struct dentry *, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 04eb78f1252..43e3f566aad 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -305,7 +305,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
 				offset, nr_segs, jfs_get_block, NULL);
 }
 
-struct address_space_operations jfs_aops = {
+const struct address_space_operations jfs_aops = {
 	.readpage	= jfs_readpage,
 	.readpages	= jfs_readpages,
 	.writepage	= jfs_writepage,
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index c3007267446..b5c7da6190d 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -33,7 +33,7 @@ extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
 extern void jfs_set_inode_flags(struct inode *);
 
-extern struct address_space_operations jfs_aops;
+extern const struct address_space_operations jfs_aops;
 extern struct inode_operations jfs_dir_inode_operations;
 extern const struct file_operations jfs_dir_operations;
 extern struct inode_operations jfs_file_inode_operations;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 7f6e8803970..e1e0a6e6ebd 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -577,7 +577,7 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
 	metapage_releasepage(page, 0);
 }
 
-struct address_space_operations jfs_metapage_aops = {
+const struct address_space_operations jfs_metapage_aops = {
 	.readpage	= metapage_readpage,
 	.writepage	= metapage_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index f0b7d3282b0..d17a3290f5a 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -139,7 +139,7 @@ static inline void metapage_homeok(struct metapage *mp)
 	put_metapage(mp);
 }
 
-extern struct address_space_operations jfs_metapage_aops;
+extern const struct address_space_operations jfs_metapage_aops;
 
 /*
  * This routines invalidate all pages for an extent.
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a6fb509b734..9ea91c5eeb7 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -335,7 +335,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,minix_get_block);
 }
-static struct address_space_operations minix_aops = {
+static const struct address_space_operations minix_aops = {
 	.readpage = minix_readpage,
 	.writepage = minix_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 90d2ea28f33..6c51c119846 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -105,7 +105,7 @@ static struct super_operations ncp_sops =
 
 extern struct dentry_operations ncp_root_dentry_operations;
 #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
-extern struct address_space_operations ncp_symlink_aops;
+extern const struct address_space_operations ncp_symlink_aops;
 extern int ncp_symlink(struct inode*, struct dentry*, const char*);
 #endif
 
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e935f1b34bc..f76b1392a01 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -99,7 +99,7 @@ fail:
 /*
  * symlinks can't do much...
  */
-struct address_space_operations ncp_symlink_aops = {
+const struct address_space_operations ncp_symlink_aops = {
 	.readpage	= ncp_symlink_readpage,
 };
 	
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index add28913883..cc2b874ad5a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -315,7 +315,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	return !nfs_wb_page(page->mapping->host, page);
 }
 
-struct address_space_operations nfs_file_aops = {
+const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
 	.readpages = nfs_readpages,
 	.set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 580412d330c..bc579bfdfbd 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1544,7 +1544,7 @@ err_out:
 /**
  * ntfs_aops - general address space operations for inodes and attributes
  */
-struct address_space_operations ntfs_aops = {
+const struct address_space_operations ntfs_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
 	.sync_page	= block_sync_page,	/* Currently, just unplugs the
 						   disk request queue. */
@@ -1560,7 +1560,7 @@ struct address_space_operations ntfs_aops = {
  * ntfs_mst_aops - general address space operations for mst protecteed inodes
  *		   and attributes
  */
-struct address_space_operations ntfs_mst_aops = {
+const struct address_space_operations ntfs_mst_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
 	.sync_page	= block_sync_page,	/* Currently, just unplugs the
 						   disk request queue. */
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index bf7b3d7c093..ddd3d503097 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -57,8 +57,8 @@ extern struct kmem_cache *ntfs_attr_ctx_cache;
 extern struct kmem_cache *ntfs_index_ctx_cache;
 
 /* The various operations structs defined throughout the driver files. */
-extern struct address_space_operations ntfs_aops;
-extern struct address_space_operations ntfs_mst_aops;
+extern const struct address_space_operations ntfs_aops;
+extern const struct address_space_operations ntfs_mst_aops;
 
 extern const struct  file_operations ntfs_file_ops;
 extern struct inode_operations ntfs_file_inode_ops;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 47152bf9a7f..cca71317b6d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -666,7 +666,7 @@ out:
 	return ret;
 }
 
-struct address_space_operations ocfs2_aops = {
+const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
 	.prepare_write	= ocfs2_prepare_write,
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 84c50796128..35140f6cf84 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -114,7 +114,7 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 
 extern kmem_cache_t *ocfs2_inode_cache;
 
-extern struct address_space_operations ocfs2_aops;
+extern const struct address_space_operations ocfs2_aops;
 
 struct buffer_head *ocfs2_bread(struct inode *inode, int block,
 				int *err, int reada);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2f24c46f72a..8bc182a8874 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -450,7 +450,7 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,qnx4_get_block);
 }
-static struct address_space_operations qnx4_aops = {
+static const struct address_space_operations qnx4_aops = {
 	.readpage	= qnx4_readpage,
 	.writepage	= qnx4_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 00a933eb820..86f14cacf64 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -26,7 +26,7 @@
 
 #include <linux/fs.h>
 
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index f443a84b98a..99fffc9e1bf 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,7 +27,7 @@
 
 static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
 
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
 	.readpage		= simple_readpage,
 	.prepare_write		= simple_prepare_write,
 	.commit_write		= simple_commit_write
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 313237631b4..c2bb58e7465 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,6 +10,6 @@
  */
 
 
-extern struct address_space_operations ramfs_aops;
+extern const struct address_space_operations ramfs_aops;
 extern const struct file_operations ramfs_file_operations;
 extern struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9857e50f85e..a24858a632f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2996,7 +2996,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	return error;
 }
 
-struct address_space_operations reiserfs_address_space_operations = {
+const struct address_space_operations reiserfs_address_space_operations = {
 	.writepage = reiserfs_writepage,
 	.readpage = reiserfs_readpage,
 	.readpages = reiserfs_readpages,
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 283fbc6b8ee..22eed61ebf6 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -459,7 +459,7 @@ err_out:
 
 /* Mapping from our types to the kernel */
 
-static struct address_space_operations romfs_aops = {
+static const struct address_space_operations romfs_aops = {
 	.readpage = romfs_readpage
 };
 
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index ed9a24d19d7..dae67048bab 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -306,7 +306,7 @@ static int smb_commit_write(struct file *file, struct page *page,
 	return status;
 }
 
-struct address_space_operations smb_file_aops = {
+const struct address_space_operations smb_file_aops = {
 	.readpage = smb_readpage,
 	.writepage = smb_writepage,
 	.prepare_write = smb_prepare_write,
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 972ed7dad38..34fb462b237 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -63,7 +63,7 @@ extern int smb_revalidate_inode(struct dentry *dentry);
 extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
 /* file.c */
-extern struct address_space_operations smb_file_aops;
+extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index f0b347bd12c..5e0e31cc46f 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -16,7 +16,7 @@
 
 extern struct super_block * sysfs_sb;
 
-static struct address_space_operations sysfs_aops = {
+static const struct address_space_operations sysfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 86f5f8d43d0..f2bcccd1d6f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -465,7 +465,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,get_block);
 }
-struct address_space_operations sysv_aops = {
+const struct address_space_operations sysv_aops = {
 	.readpage = sysv_readpage,
 	.writepage = sysv_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 393a480e4de..9dcc8212093 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -161,7 +161,7 @@ extern struct inode_operations sysv_dir_inode_operations;
 extern struct inode_operations sysv_fast_symlink_inode_operations;
 extern const struct file_operations sysv_file_operations;
 extern const struct file_operations sysv_dir_operations;
-extern struct address_space_operations sysv_aops;
+extern const struct address_space_operations sysv_aops;
 extern struct super_operations sysv_sops;
 extern struct dentry_operations sysv_dentry_operations;
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index e34b00e303f..a59e5f33daf 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -95,7 +95,7 @@ static int udf_adinicb_commit_write(struct file *file, struct page *page, unsign
 	return 0;
 }
 
-struct address_space_operations udf_adinicb_aops = {
+const struct address_space_operations udf_adinicb_aops = {
 	.readpage		= udf_adinicb_readpage,
 	.writepage		= udf_adinicb_writepage,
 	.sync_page		= block_sync_page,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2983afd5e7f..605f5111b6d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -132,7 +132,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,udf_get_block);
 }
 
-struct address_space_operations udf_aops = {
+const struct address_space_operations udf_aops = {
 	.readpage		= udf_readpage,
 	.writepage		= udf_writepage,
 	.sync_page		= block_sync_page,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 674bb40edc8..ba068a78656 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -113,6 +113,6 @@ out:
 /*
  * symlinks can't do much...
  */
-struct address_space_operations udf_symlink_aops = {
+const struct address_space_operations udf_symlink_aops = {
 	.readpage		= udf_symlink_filler,
 };
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 023e19ba5a2..2f992387cc9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -47,9 +47,9 @@ extern struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
-extern struct address_space_operations udf_aops;
-extern struct address_space_operations udf_adinicb_aops;
-extern struct address_space_operations udf_symlink_aops;
+extern const struct address_space_operations udf_aops;
+extern const struct address_space_operations udf_adinicb_aops;
+extern const struct address_space_operations udf_symlink_aops;
 
 struct udf_fileident_bh
 {
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 259bd196099..8e1f90e4204 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -574,7 +574,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,ufs_getfrag_block);
 }
-struct address_space_operations ufs_aops = {
+const struct address_space_operations ufs_aops = {
 	.readpage = ufs_readpage,
 	.writepage = ufs_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 3e807b828e2..c40f81ba9b1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1454,7 +1454,7 @@ xfs_vm_invalidatepage(
 	block_invalidatepage(page, offset);
 }
 
-struct address_space_operations xfs_address_space_operations = {
+const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 706d8c781b8..2244e516b66 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -40,7 +40,7 @@ typedef struct xfs_ioend {
 	struct work_struct	io_work;	/* xfsdatad work queue */
 } xfs_ioend_t;
 
-extern struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 26fed0756f0..2af528dcfb0 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1520,7 +1520,7 @@ xfs_mapping_buftarg(
 	struct backing_dev_info	*bdi;
 	struct inode		*inode;
 	struct address_space	*mapping;
-	static struct address_space_operations mapping_aops = {
+	static const struct address_space_operations mapping_aops = {
 		.sync_page = block_sync_page,
 		.migratepage = fail_migrate_page,
 	};
diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h
index 7b5c5df5cb6..be512cc9879 100644
--- a/include/linux/coda_linux.h
+++ b/include/linux/coda_linux.h
@@ -27,8 +27,8 @@ extern struct inode_operations coda_dir_inode_operations;
 extern struct inode_operations coda_file_inode_operations;
 extern struct inode_operations coda_ioctl_inode_operations;
 
-extern struct address_space_operations coda_file_aops;
-extern struct address_space_operations coda_symlink_aops;
+extern const struct address_space_operations coda_file_aops;
+extern const struct address_space_operations coda_symlink_aops;
 
 extern const struct file_operations coda_dir_operations;
 extern const struct file_operations coda_file_operations;
diff --git a/include/linux/efs_fs.h b/include/linux/efs_fs.h
index fbfa6b52e2f..278ef449581 100644
--- a/include/linux/efs_fs.h
+++ b/include/linux/efs_fs.h
@@ -38,7 +38,7 @@ struct statfs;
 
 extern struct inode_operations efs_dir_inode_operations;
 extern const struct file_operations efs_dir_operations;
-extern struct address_space_operations efs_symlink_aops;
+extern const struct address_space_operations efs_symlink_aops;
 
 extern void efs_read_inode(struct inode *);
 extern efs_block_t efs_map_block(struct inode *, efs_block_t);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2d8b348c119..e04a5cfe874 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -392,7 +392,7 @@ struct address_space {
 	unsigned int		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
-	struct address_space_operations *a_ops;	/* methods */
+	const struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
 	spinlock_t		private_lock;	/* for use by the address_space */
@@ -1405,7 +1405,7 @@ extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern struct block_device *open_by_devnum(dev_t, unsigned);
 extern const struct file_operations def_blk_fops;
-extern struct address_space_operations def_blk_aops;
+extern const struct address_space_operations def_blk_aops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
 extern const struct file_operations def_fifo_fops;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0a1740b2532..d90b1bb3756 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -335,7 +335,7 @@ extern struct inode_operations nfs_file_inode_operations;
 extern struct inode_operations nfs3_file_inode_operations;
 #endif /* CONFIG_NFS_V3 */
 extern const struct file_operations nfs_file_operations;
-extern struct address_space_operations nfs_file_aops;
+extern const struct address_space_operations nfs_file_aops;
 
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
 {
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 5676c4210e2..daa2d83cefe 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1973,7 +1973,7 @@ void reiserfs_unmap_buffer(struct buffer_head *);
 /* file.c */
 extern struct inode_operations reiserfs_file_inode_operations;
 extern const struct file_operations reiserfs_file_operations;
-extern struct address_space_operations reiserfs_address_space_operations;
+extern const struct address_space_operations reiserfs_address_space_operations;
 
 /* fix_nodes.c */
 
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 914f911325b..e39b7cc4339 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -966,7 +966,7 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 extern struct inode_operations ufs_file_inode_operations;
 extern const struct file_operations ufs_file_operations;
 
-extern struct address_space_operations ufs_aops;
+extern const struct address_space_operations ufs_aops;
 
 /* ialloc.c */
 extern void ufs_free_inode (struct inode *inode);
diff --git a/mm/filemap.c b/mm/filemap.c
index d504d6e9888..4082b3b3cea 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2069,7 +2069,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space * mapping = file->f_mapping;
-	struct address_space_operations *a_ops = mapping->a_ops;
+	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
 	struct page	*page;
@@ -2219,7 +2219,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space * mapping = file->f_mapping;
+	const struct address_space * mapping = file->f_mapping;
 	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b960ac8e591..b4fd0d7c9bf 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
 		  size_t count, loff_t pos, loff_t *ppos)
 {
 	struct address_space * mapping = filp->f_mapping;
-	struct address_space_operations *a_ops = mapping->a_ops;
+	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
 	struct page	*page;
diff --git a/mm/shmem.c b/mm/shmem.c
index 38bc3334f26..ea64c07cbe7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -174,7 +174,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 }
 
 static struct super_operations shmem_ops;
-static struct address_space_operations shmem_aops;
+static const struct address_space_operations shmem_aops;
 static struct file_operations shmem_file_operations;
 static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
@@ -2162,7 +2162,7 @@ static void destroy_inodecache(void)
 		printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
 }
 
-static struct address_space_operations shmem_aops = {
+static const struct address_space_operations shmem_aops = {
 	.writepage	= shmem_writepage,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 #ifdef CONFIG_TMPFS
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0e1583f32c..7535211bb49 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,7 +24,7 @@
  * vmscan's shrink_list, to make sync_page look nicer, and to allow
  * future use of radix_tree tags in the swap cache.
  */
-static struct address_space_operations swap_aops = {
+static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
 	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
-- 
cgit v1.2.3-70-g09d2


From 81b0c8713385ce1b1b9058e916edcf9561ad76d6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 29 Jun 2006 02:24:26 -0700
Subject: [PATCH] generic_file_buffered_write(): handle zero-length iovec
 segments

The recent generic_file_write() deadlock fix caused
generic_file_buffered_write() to loop inifinitely when presented with a
zero-length iovec segment.  Fix.

Note that this fix deliberately avoids calling ->prepare_write(),
->commit_write() etc with a zero-length write.  This is because I don't trust
all filesystems to get that right.

This is a cautious approach, for 2.6.17.x.  For 2.6.18 we should just go ahead
and call ->prepare_write() and ->commit_write() with the zero length and fix
any broken filesystems.  So I'll make that change once this code is stabilised
and backported into 2.6.17.x.

The reason for preferring to call ->prepare_write() and ->commit_write() with
the zero-length segment: a zero-length segment _should_ be sufficiently
uncommon that this is the correct way of handling it.  We don't want to
optimise for poorly-written userspace at the expense of well-written
userspace.

Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Greg KH <greg@kroah.com>
Cc: <stable@kernel.org>
Cc: walt <wa1ter@myrealbox.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 9 ++++++++-
 mm/filemap.h | 4 ++--
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 4082b3b3cea..648f2c0c8e1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2125,6 +2125,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			break;
 		}
 
+		if (unlikely(bytes == 0)) {
+			status = 0;
+			copied = 0;
+			goto zero_length_segment;
+		}
+
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
 		if (unlikely(status)) {
 			loff_t isize = i_size_read(inode);
@@ -2154,7 +2160,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			page_cache_release(page);
 			continue;
 		}
-		if (likely(copied > 0)) {
+zero_length_segment:
+		if (likely(copied >= 0)) {
 			if (!status)
 				status = copied;
 
diff --git a/mm/filemap.h b/mm/filemap.h
index 536979fb4ba..3f2a343c601 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -88,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
 	const struct iovec *iov = *iovp;
 	size_t base = *basep;
 
-	while (bytes) {
+	do {
 		int copy = min(bytes, iov->iov_len - base);
 
 		bytes -= copy;
@@ -97,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
 			iov++;
 			base = 0;
 		}
-	}
+	} while (bytes);
 	*iovp = iov;
 	*basep = base;
 }
-- 
cgit v1.2.3-70-g09d2


From cc57637b0b015fb5d70dbbec740de516d33af07d Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Thu, 29 Jun 2006 02:24:27 -0700
Subject: [PATCH] solve config broken: undefined reference to `online_page'

Memory hotplug code of i386 adds memory to only highmem.  So, if
CONFIG_HIGHMEM is not set, CONFIG_MEMORY_HOTPLUG shouldn't be set.
Otherwise, it causes compile error.

In addition, many architecture can't use memory hotplug feature yet.  So, I
introduce CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig    | 3 +++
 arch/ia64/Kconfig    | 3 +++
 arch/powerpc/Kconfig | 3 +++
 arch/x86_64/Kconfig  | 2 ++
 mm/Kconfig           | 2 +-
 5 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 3bb221db164..5fa3c671c93 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -794,6 +794,9 @@ config COMPAT_VDSO
 
 endmenu
 
+config ARCH_ENABLE_MEMORY_HOTPLUG
+	def_bool y
+	depends on HIGHMEM
 
 menu "Power management options (ACPI, APM)"
 	depends on !X86_VOYAGER
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index a56df7bf022..5faacbb8d16 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -271,6 +271,9 @@ config HOTPLUG_CPU
 	  can be controlled through /sys/devices/system/cpu/cpu#.
 	  Say N if you want to disable CPU hotplug.
 
+config ARCH_ENABLE_MEMORY_HOTPLUG
+	def_bool y
+
 config SCHED_SMT
 	bool "SMT scheduler support"
 	depends on SMP
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e922a88b2ba..e2e9df344ab 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -618,6 +618,9 @@ config HOTPLUG_CPU
 
 	  Say N if you are unsure.
 
+config ARCH_ENABLE_MEMORY_HOTPLUG
+	def_bool y
+
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index ccc4a7fb97a..91039844820 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -370,6 +370,8 @@ config HOTPLUG_CPU
 		can be controlled through /sys/devices/system/cpu/cpu#.
 		Say N if you want to disable CPU hotplug.
 
+config ARCH_ENABLE_MEMORY_HOTPLUG
+	def_bool y
 
 config HPET_TIMER
 	bool
diff --git a/mm/Kconfig b/mm/Kconfig
index e76c023eb0b..83917291d45 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,7 +115,7 @@ config SPARSEMEM_EXTREME
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
-	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC64)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
-- 
cgit v1.2.3-70-g09d2