From 377f0a08e4cb56658d878d22c3aed4716e283c6b Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Tue, 31 Mar 2009 14:43:17 -0700
Subject: ipv4: remove unused parameter from tcp_recv_urg().

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2451aeb5ac2..fafbec8b073 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1081,8 +1081,7 @@ out_err:
  *	this, no blocking and very strange errors 8)
  */
 
-static int tcp_recv_urg(struct sock *sk, long timeo,
-			struct msghdr *msg, int len, int flags)
+static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -1697,7 +1696,7 @@ out:
 	return err;
 
 recv_urg:
-	err = tcp_recv_urg(sk, timeo, msg, len, flags);
+	err = tcp_recv_urg(sk, msg, len, flags);
 	goto out;
 }
 
-- 
cgit v1.2.3-70-g09d2


From c9caceca25854eff4328c89045793a91bf8f9ee3 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 15:06:26 -0700
Subject: core: remove pointless conditional before kfree()

Remove pointless conditional before kfree().

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 244ca56dffa..d9d5160610d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -261,8 +261,7 @@ static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
 	ret = 0;
 
 err_out:
-	if (rule_buf)
-		kfree(rule_buf);
+	kfree(rule_buf);
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From ce3b0f8d5c2203301fc87f3aaaed73e5819e2a48 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Mar 2009 19:08:22 -0400
Subject: New helper - current_umask()

current->fs->umask is what most of fs_struct users are doing.
Put that into a helper function.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/inode.c | 2 +-
 fs/btrfs/acl.c                            | 2 +-
 fs/btrfs/ioctl.c                          | 2 +-
 fs/cifs/dir.c                             | 4 ++--
 fs/cifs/inode.c                           | 4 ++--
 fs/ext2/acl.c                             | 2 +-
 fs/ext3/acl.c                             | 2 +-
 fs/ext4/acl.c                             | 2 +-
 fs/fat/inode.c                            | 2 +-
 fs/fs_struct.c                            | 6 ++++++
 fs/generic_acl.c                          | 2 +-
 fs/gfs2/acl.c                             | 2 +-
 fs/hfsplus/options.c                      | 2 +-
 fs/hpfs/super.c                           | 2 +-
 fs/jffs2/acl.c                            | 2 +-
 fs/jfs/acl.c                              | 2 +-
 fs/namei.c                                | 6 +++---
 fs/nfs/nfs3proc.c                         | 6 +++---
 fs/nfs/nfs4proc.c                         | 2 +-
 fs/ocfs2/acl.c                            | 2 +-
 fs/omfs/inode.c                           | 2 +-
 fs/reiserfs/xattr_acl.c                   | 2 +-
 fs/xfs/linux-2.6/xfs_iops.c               | 4 ++--
 include/linux/fs.h                        | 2 ++
 ipc/mqueue.c                              | 2 +-
 net/unix/af_unix.c                        | 2 +-
 26 files changed, 39 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 64f068540d0..706eb5c7e2e 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -635,7 +635,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode,
 	if (dentry->d_inode)
 		goto out_dput;
 
-	mode &= ~current->fs->umask;
+	mode &= ~current_umask();
 
 	if (flags & SPU_CREATE_GANG)
 		ret = spufs_create_gang(nd->path.dentry->d_inode,
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1d53b62dbba..7fdd184a528 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 		}
 
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bca729fc80c..7594bec1be1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 		goto out_dput;
 
 	if (!IS_POSIXACL(parent->dentry->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 
 	error = mnt_want_write(parent->mnt);
 	if (error)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2f35cccfcd8..54dce78fbb7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current->fs->umask;
+	mode &= ~current_umask();
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 
@@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 		rc = -ENOMEM;
 	else if (pTcon->unix_ext) {
 		struct cifs_unix_set_info_args args = {
-			.mode	= mode & ~current->fs->umask,
+			.mode	= mode & ~current_umask(),
 			.ctime	= NO_CHANGE_64,
 			.atime	= NO_CHANGE_64,
 			.mtime	= NO_CHANGE_64,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a8797cc6080..f121a80fdd6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			goto mkdir_out;
 		}
 
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
 				mode, NULL /* netfid */, pInfo, &oplock,
 				full_path, cifs_sb->local_nls,
@@ -1204,7 +1204,7 @@ mkdir_get_info:
 		if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
 				direntry->d_inode->i_nlink = 2;
 
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		/* must turn on setgid bit if parent dir has it */
 		if (inode->i_mode & S_ISGID)
 			mode |= S_ISGID;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index ae8c4f850b2..d46e38cb85c 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
                struct posix_acl *clone;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index b60bb241880..d81ef2fdb08 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 694ed6fadcc..647e0d65a28 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index de0004fe6e0..ab657db4c94 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -930,7 +930,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 
 	opts->fs_uid = current_uid();
 	opts->fs_gid = current_gid();
-	opts->fs_fmask = opts->fs_dmask = current->fs->umask;
+	opts->fs_fmask = current_umask();
 	opts->allow_utime = -1;
 	opts->codepage = fat_default_codepage;
 	opts->iocharset = fat_default_iocharset;
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 41cff72b377..6ac21933867 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -138,6 +138,12 @@ int unshare_fs_struct(void)
 }
 EXPORT_SYMBOL_GPL(unshare_fs_struct);
 
+int current_umask(void)
+{
+	return current->fs->umask;
+}
+EXPORT_SYMBOL(current_umask);
+
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 995d63b2e74..e0b53aa7bbe 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
 	mode_t mode = inode->i_mode;
 	int error;
 
-	inode->i_mode = mode & ~current->fs->umask;
+	inode->i_mode = mode & ~current_umask();
 	if (!S_ISLNK(inode->i_mode))
 		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
 	if (acl) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 43764f4fa76..fa881bdc3d8 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (error)
 		return error;
 	if (!acl) {
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 		if (mode != ip->i_inode.i_mode)
 			error = munge_mode(ip, mode);
 		return error;
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bab7f8d1bdf..3fcbb0e1f6f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 
 	opts->creator = HFSPLUS_DEF_CR_TYPE;
 	opts->type = HFSPLUS_DEF_CR_TYPE;
-	opts->umask = current->fs->umask;
+	opts->umask = current_umask();
 	opts->uid = current_uid();
 	opts->gid = current_gid();
 	opts->part = -1;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 0d049b8919c..c696d01bc8f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -477,7 +477,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	uid = current_uid();
 	gid = current_gid();
-	umask = current->fs->umask;
+	umask = current_umask();
 	lowercase = 0;
 	conv = CONV_BINARY;
 	eas = 2;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index d98713777a1..77ccf8cb082 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 		return PTR_ERR(acl);
 
 	if (!acl) {
-		*i_mode &= ~current->fs->umask;
+		*i_mode &= ~current_umask();
 	} else {
 		if (S_ISDIR(*i_mode))
 			jffs2_iset_acl(inode, &f->i_acl_default, acl);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a166c1669e8..06ca1b8d205 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
 cleanup:
 		posix_acl_release(acl);
 	} else
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 
 	JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
 			       inode->i_mode;
diff --git a/fs/namei.c b/fs/namei.c
index 4c65a646013..964c0249444 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1578,7 +1578,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
 	struct dentry *dir = nd->path.dentry;
 
 	if (!IS_POSIXACL(dir->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
 	if (error)
 		goto out_unlock;
@@ -1989,7 +1989,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
 		goto out_unlock;
 	}
 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 	error = may_mknod(mode);
 	if (error)
 		goto out_dput;
@@ -2067,7 +2067,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
 		goto out_unlock;
 
 	if (!IS_POSIXACL(nd.path.dentry->d_inode))
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679..e47d4400fb8 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		data->arg.create.verifier[1] = current->pid;
 	}
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	for (;;) {
 		status = nfs3_do_create(dir, dentry, data);
@@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 
 	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
@@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name,
 			MAJOR(rdev), MINOR(rdev));
 
-	sattr->ia_mode &= ~current->fs->umask;
+	sattr->ia_mode &= ~current_umask();
 
 	data = nfs3_alloc_createdata();
 	if (data == NULL)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d..bbee587dd59 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1509,7 +1509,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		attr.ia_mode = nd->intent.open.create_mode;
 		attr.ia_valid = ATTR_MODE;
 		if (!IS_POSIXACL(dir))
-			attr.ia_mode &= ~current->fs->umask;
+			attr.ia_mode &= ~current_umask();
 	} else {
 		attr.ia_valid = 0;
 		BUG_ON(nd->intent.open.flags & O_CREAT);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 12dfb44c22e..fbeaec76210 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle,
 				return PTR_ERR(acl);
 		}
 		if (!acl)
-			inode->i_mode &= ~current->fs->umask;
+			inode->i_mode &= ~current_umask();
 	}
 	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
 		struct posix_acl *clone;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 633e9dc972b..aa6fc30772a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -421,7 +421,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_uid = current_uid();
 	sbi->s_gid = current_gid();
-	sbi->s_dmask = sbi->s_fmask = current->fs->umask;
+	sbi->s_dmask = sbi->s_fmask = current_umask();
 
 	if (!parse_options((char *) data, sbi))
 		goto end;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d423416d93d..c303c426fe2 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 	} else {
 	      apply_umask:
 		/* no ACL, apply umask */
-		inode->i_mode &= ~current->fs->umask;
+		inode->i_mode &= ~current_umask();
 	}
 
 	return err;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 7aa53fefc67..2940612e3ae 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -227,7 +227,7 @@ xfs_vn_mknod(
 	xfs_dentry_to_name(&name, dentry);
 
 	if (IS_POSIXACL(dir) && !default_acl)
-		mode &= ~current->fs->umask;
+		mode &= ~current_umask();
 
 	switch (mode & S_IFMT) {
 	case S_IFCHR:
@@ -416,7 +416,7 @@ xfs_vn_symlink(
 	mode_t		mode;
 
 	mode = S_IFLNK |
-		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
+		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
 	xfs_dentry_to_name(&name, dentry);
 
 	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 87e7bfc5ebd..3d7bd5447ca 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1741,6 +1741,8 @@ extern void drop_collected_mounts(struct vfsmount *);
 
 extern int vfs_statfs(struct dentry *, struct kstatfs *);
 
+extern int current_umask(void);
+
 /* /sys/fs */
 extern struct kobject *fs_kobj;
 
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a8ddadbc745..916785363f0 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -602,7 +602,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
 		dentry->d_fsdata = attr;
 	}
 
-	mode &= ~current->fs->umask;
+	mode &= ~current_umask();
 	ret = mnt_want_write(mqueue_mnt);
 	if (ret)
 		goto out;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index baac91049b0..9dcc6e7f96e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -832,7 +832,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		 * All right, let's create it.
 		 */
 		mode = S_IFSOCK |
-		       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
+		       (SOCK_INODE(sock)->i_mode & ~current_umask());
 		err = mnt_want_write(nd.path.mnt);
 		if (err)
 			goto out_mknod_dput;
-- 
cgit v1.2.3-70-g09d2


From f1cffcbfcc53b825da7d1d26244aabd8dccb24aa Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 1 Apr 2009 08:20:18 +0000
Subject: RDS: Fix m_rs_lock deadlock

rs_send_drop_to() is called during socket close. If it takes
m_rs_lock without disabling interrupts, then
rds_send_remove_from_sock() can run from the rx completion
handler and thus deadlock.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/send.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 1b37364656f..104fe033203 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -615,7 +615,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 {
 	struct rds_message *rm, *tmp;
 	struct rds_connection *conn;
-	unsigned long flags;
+	unsigned long flags, flags2;
 	LIST_HEAD(list);
 	int wake = 0;
 
@@ -651,9 +651,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 	list_for_each_entry(rm, &list, m_sock_item) {
 		/* We do this here rather than in the loop above, so that
 		 * we don't have to nest m_rs_lock under rs->rs_lock */
-		spin_lock(&rm->m_rs_lock);
+		spin_lock_irqsave(&rm->m_rs_lock, flags2);
 		rm->m_rs = NULL;
-		spin_unlock(&rm->m_rs_lock);
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
 
 		/*
 		 * If we see this flag cleared then we're *sure* that someone
-- 
cgit v1.2.3-70-g09d2


From 745cbccac3fe8cead529a1b3358e1e86a1505bfa Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 1 Apr 2009 08:20:19 +0000
Subject: RDS: Rewrite connection cleanup, fixing oops on rmmod

This fixes a bug where a connection was unexpectedly
not on *any* list while being destroyed. It also
cleans up some code duplication and regularizes some
function names.

* Grab appropriate lock in conn_free() and explain in comment
* Ensure via locking that a conn is never not on either
  a dev's list or the nodev list
* Add rds_xx_remove_conn() to match rds_xx_add_conn()
* Make rds_xx_add_conn() return void
* Rename remove_{,nodev_}conns() to
  destroy_{,nodev_}conns() and unify their implementation
  in a helper function
* Document lock ordering as nodev conn_lock before
  dev_conn_lock

Reported-by: Yosef Etigin <yosefe@voltaire.com>
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib.c      |  5 +++--
 net/rds/ib.h      | 14 +++++++++++---
 net/rds/ib_cm.c   | 34 +++++++++++++++++++---------------
 net/rds/ib_rdma.c | 43 +++++++++++++++++++++----------------------
 net/rds/iw.c      |  5 +++--
 net/rds/iw.h      | 14 +++++++++++---
 net/rds/iw_cm.c   | 35 +++++++++++++++++++----------------
 net/rds/iw_rdma.c | 44 ++++++++++++++++++++++----------------------
 8 files changed, 109 insertions(+), 85 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 06a7b798d9a..4933b380985 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -51,6 +51,7 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
 
 struct list_head rds_ib_devices;
 
+/* NOTE: if also grabbing ibdev lock, grab this first */
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
@@ -137,7 +138,7 @@ void rds_ib_remove_one(struct ib_device *device)
 		kfree(i_ipaddr);
 	}
 
-	rds_ib_remove_conns(rds_ibdev);
+	rds_ib_destroy_conns(rds_ibdev);
 
 	if (rds_ibdev->mr_pool)
 		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
@@ -249,7 +250,7 @@ static int rds_ib_laddr_check(__be32 addr)
 void rds_ib_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
-	rds_ib_remove_nodev_conns();
+	rds_ib_destroy_nodev_conns();
 	ib_unregister_client(&rds_ib_client);
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 8be563a1363..c08ffffb316 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -267,9 +267,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 
 /* ib_rdma.c */
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
-int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void rds_ib_remove_nodev_conns(void);
-void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev);
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_ib_destroy_nodev_conns(void)
+{
+	__rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
+}
+static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
+{
+	__rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
+}
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0532237bd12..889ab044135 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -126,9 +126,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
 	if (err)
 		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-	err = rds_ib_add_conn(rds_ibdev, conn);
-	if (err)
-		printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err);
+	rds_ib_add_conn(rds_ibdev, conn);
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
@@ -616,18 +614,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 		/*
 		 * Move connection back to the nodev list.
 		 */
-		if (ic->rds_ibdev) {
-
-			spin_lock_irq(&ic->rds_ibdev->spinlock);
-			BUG_ON(list_empty(&ic->ib_node));
-			list_del(&ic->ib_node);
-			spin_unlock_irq(&ic->rds_ibdev->spinlock);
-
-			spin_lock_irq(&ib_nodev_conns_lock);
-			list_add_tail(&ic->ib_node, &ib_nodev_conns);
-			spin_unlock_irq(&ib_nodev_conns_lock);
-			ic->rds_ibdev = NULL;
-		}
+		if (ic->rds_ibdev)
+			rds_ib_remove_conn(ic->rds_ibdev, conn);
 
 		ic->i_cm_id = NULL;
 		ic->i_pd = NULL;
@@ -701,11 +689,27 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	return 0;
 }
 
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
 void rds_ib_conn_free(void *arg)
 {
 	struct rds_ib_connection *ic = arg;
+	spinlock_t	*lock_ptr;
+
 	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * Conn is either on a dev's list or on the nodev list.
+	 * A race with shutdown() or connect() would cause problems
+	 * (since rds_ibdev would change) but that should never happen.
+	 */
+	lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
+
+	spin_lock_irq(lock_ptr);
 	list_del(&ic->ib_node);
+	spin_unlock_irq(lock_ptr);
+
 	kfree(ic);
 }
 
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 69a6289ed67..81033af9302 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -139,7 +139,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 }
 
-int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 
@@ -148,45 +148,44 @@ int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn
 	BUG_ON(list_empty(&ib_nodev_conns));
 	BUG_ON(list_empty(&ic->ib_node));
 	list_del(&ic->ib_node);
-	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	spin_lock_irq(&rds_ibdev->spinlock);
 	list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
 	spin_unlock_irq(&rds_ibdev->spinlock);
+	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	ic->rds_ibdev = rds_ibdev;
-
-	return 0;
 }
 
-void rds_ib_remove_nodev_conns(void)
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
 {
-	struct rds_ib_connection *ic, *_ic;
-	LIST_HEAD(tmp_list);
+	struct rds_ib_connection *ic = conn->c_transport_data;
 
-	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&ib_nodev_conns_lock);
-	list_splice(&ib_nodev_conns, &tmp_list);
-	INIT_LIST_HEAD(&ib_nodev_conns);
-	spin_unlock_irq(&ib_nodev_conns_lock);
+	/* place conn on nodev_conns_list */
+	spin_lock(&ib_nodev_conns_lock);
 
-	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
-		if (ic->conn->c_passive)
-			rds_conn_destroy(ic->conn->c_passive);
-		rds_conn_destroy(ic->conn);
-	}
+	spin_lock_irq(&rds_ibdev->spinlock);
+	BUG_ON(list_empty(&ic->ib_node));
+	list_del(&ic->ib_node);
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+
+	spin_unlock(&ib_nodev_conns_lock);
+
+	ic->rds_ibdev = NULL;
 }
 
-void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev)
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
 {
 	struct rds_ib_connection *ic, *_ic;
 	LIST_HEAD(tmp_list);
 
 	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&rds_ibdev->spinlock);
-	list_splice(&rds_ibdev->conn_list, &tmp_list);
-	INIT_LIST_HEAD(&rds_ibdev->conn_list);
-	spin_unlock_irq(&rds_ibdev->spinlock);
+	spin_lock_irq(list_lock);
+	list_splice(list, &tmp_list);
+	INIT_LIST_HEAD(list);
+	spin_unlock_irq(list_lock);
 
 	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
 		if (ic->conn->c_passive)
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 1b56905c4c0..b732efb5b63 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -51,6 +51,7 @@ MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MR
 
 struct list_head rds_iw_devices;
 
+/* NOTE: if also grabbing iwdev lock, grab this first */
 DEFINE_SPINLOCK(iw_nodev_conns_lock);
 LIST_HEAD(iw_nodev_conns);
 
@@ -145,7 +146,7 @@ void rds_iw_remove_one(struct ib_device *device)
 	}
 	spin_unlock_irq(&rds_iwdev->spinlock);
 
-	rds_iw_remove_conns(rds_iwdev);
+	rds_iw_destroy_conns(rds_iwdev);
 
 	if (rds_iwdev->mr_pool)
 		rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
@@ -258,7 +259,7 @@ static int rds_iw_laddr_check(__be32 addr)
 void rds_iw_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
-	rds_iw_remove_nodev_conns();
+	rds_iw_destroy_nodev_conns();
 	ib_unregister_client(&rds_iw_client);
 	rds_iw_sysctl_exit();
 	rds_iw_recv_exit();
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 0ddda34f2a1..70eb948f42f 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -294,9 +294,17 @@ void rds_iw_cm_connect_complete(struct rds_connection *conn,
 
 /* ib_rdma.c */
 int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
-int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void rds_iw_remove_nodev_conns(void);
-void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev);
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_iw_destroy_nodev_conns(void)
+{
+	__rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
+}
+static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
+{
+	__rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
+}
 struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
 void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
 void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index 57ecb3d4b8a..0ffaa3e97ad 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -86,9 +86,7 @@ void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
 	if (err)
 		printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
-	err = rds_iw_add_conn(rds_iwdev, conn);
-	if (err)
-		printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err);
+	rds_iw_add_conn(rds_iwdev, conn);
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
@@ -637,19 +635,8 @@ void rds_iw_conn_shutdown(struct rds_connection *conn)
 		 * 	Move connection back to the nodev list.
 		 * 	Remove cm_id from the device cm_id list.
 		 */
-		if (ic->rds_iwdev) {
-
-			spin_lock_irq(&ic->rds_iwdev->spinlock);
-			BUG_ON(list_empty(&ic->iw_node));
-			list_del(&ic->iw_node);
-			spin_unlock_irq(&ic->rds_iwdev->spinlock);
-
-			spin_lock_irq(&iw_nodev_conns_lock);
-			list_add_tail(&ic->iw_node, &iw_nodev_conns);
-			spin_unlock_irq(&iw_nodev_conns_lock);
-			rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
-			ic->rds_iwdev = NULL;
-		}
+		if (ic->rds_iwdev)
+			rds_iw_remove_conn(ic->rds_iwdev, conn);
 
 		rdma_destroy_id(ic->i_cm_id);
 
@@ -726,11 +713,27 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	return 0;
 }
 
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
 void rds_iw_conn_free(void *arg)
 {
 	struct rds_iw_connection *ic = arg;
+	spinlock_t	*lock_ptr;
+
 	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * Conn is either on a dev's list or on the nodev list.
+	 * A race with shutdown() or connect() would cause problems
+	 * (since rds_iwdev would change) but that should never happen.
+	 */
+	lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
+
+	spin_lock_irq(lock_ptr);
 	list_del(&ic->iw_node);
+	spin_unlock_irq(lock_ptr);
+
 	kfree(ic);
 }
 
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 1c02a8f952d..dcdb37da80f 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -196,7 +196,7 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i
 	return rds_iw_add_cm_id(rds_iwdev, cm_id);
 }
 
-int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
 {
 	struct rds_iw_connection *ic = conn->c_transport_data;
 
@@ -205,45 +205,45 @@ int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn
 	BUG_ON(list_empty(&iw_nodev_conns));
 	BUG_ON(list_empty(&ic->iw_node));
 	list_del(&ic->iw_node);
-	spin_unlock_irq(&iw_nodev_conns_lock);
 
 	spin_lock_irq(&rds_iwdev->spinlock);
 	list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
 	spin_unlock_irq(&rds_iwdev->spinlock);
+	spin_unlock_irq(&iw_nodev_conns_lock);
 
 	ic->rds_iwdev = rds_iwdev;
-
-	return 0;
 }
 
-void rds_iw_remove_nodev_conns(void)
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
 {
-	struct rds_iw_connection *ic, *_ic;
-	LIST_HEAD(tmp_list);
+	struct rds_iw_connection *ic = conn->c_transport_data;
 
-	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&iw_nodev_conns_lock);
-	list_splice(&iw_nodev_conns, &tmp_list);
-	INIT_LIST_HEAD(&iw_nodev_conns);
-	spin_unlock_irq(&iw_nodev_conns_lock);
+	/* place conn on nodev_conns_list */
+	spin_lock(&iw_nodev_conns_lock);
 
-	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
-		if (ic->conn->c_passive)
-			rds_conn_destroy(ic->conn->c_passive);
-		rds_conn_destroy(ic->conn);
-	}
+	spin_lock_irq(&rds_iwdev->spinlock);
+	BUG_ON(list_empty(&ic->iw_node));
+	list_del(&ic->iw_node);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	list_add_tail(&ic->iw_node, &iw_nodev_conns);
+
+	spin_unlock(&iw_nodev_conns_lock);
+
+	rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+	ic->rds_iwdev = NULL;
 }
 
-void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
 {
 	struct rds_iw_connection *ic, *_ic;
 	LIST_HEAD(tmp_list);
 
 	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(&rds_iwdev->spinlock);
-	list_splice(&rds_iwdev->conn_list, &tmp_list);
-	INIT_LIST_HEAD(&rds_iwdev->conn_list);
-	spin_unlock_irq(&rds_iwdev->spinlock);
+	spin_lock_irq(list_lock);
+	list_splice(list, &tmp_list);
+	INIT_LIST_HEAD(list);
+	spin_unlock_irq(list_lock);
 
 	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
 		if (ic->conn->c_passive)
-- 
cgit v1.2.3-70-g09d2


From 8cbd9606a6367c221a7bbcc47f3ab1a8c31b6437 Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Wed, 1 Apr 2009 08:20:20 +0000
Subject: RDS: Use spinlock to protect 64b value update on 32b archs

We have a 64bit value that needs to be set atomically.
This is easy and quick on all 64bit archs, and can also be done
on x86/32 with set_64bit() (uses cmpxchg8b). However other
32b archs don't have this.

I actually changed this to the current state in preparation for
mainline because the old way (using a spinlock on 32b) resulted in
unsightly #ifdefs in the code. But obviously, being correct takes
precedence.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/ib.h      | 14 +++++---------
 net/rds/ib_cm.c   |  9 ++++++++-
 net/rds/ib_recv.c | 37 +++++++++++++++++++++++++++++++++++--
 net/rds/iw.h      | 14 +++++---------
 net/rds/iw_cm.c   |  9 ++++++++-
 net/rds/iw_recv.c | 37 +++++++++++++++++++++++++++++++++++--
 net/rds/rds.h     |  4 ++++
 7 files changed, 100 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/rds/ib.h b/net/rds/ib.h
index c08ffffb316..069206cae73 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -108,7 +108,12 @@ struct rds_ib_connection {
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		i_ack_next;	/* next ACK to send */
+#else
+	spinlock_t		i_ack_lock;	/* protect i_ack_next */
 	u64			i_ack_next;	/* next ACK to send */
+#endif
 	struct rds_header	*i_ack;
 	struct ib_send_wr	i_ack_wr;
 	struct ib_sge		i_ack_sge;
@@ -363,13 +368,4 @@ rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
 	return &sge[1];
 }
 
-static inline void rds_ib_set_64bit(u64 *ptr, u64 val)
-{
-#if BITS_PER_LONG == 64
-	*ptr = val;
-#else
-	set_64bit(ptr, val);
-#endif
-}
-
 #endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 889ab044135..f8e40e1a603 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -636,7 +636,11 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 
 	/* Clear the ACK state */
 	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-	rds_ib_set_64bit(&ic->i_ack_next, 0);
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_set(&ic->i_ack_next, 0);
+#else
+	ic->i_ack_next = 0;
+#endif
 	ic->i_ack_recv = 0;
 
 	/* Clear flow control state */
@@ -669,6 +673,9 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
 	INIT_LIST_HEAD(&ic->ib_node);
 	mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&ic->i_ack_lock);
+#endif
 
 	/*
 	 * rds_ib_conn_shutdown() waits for these to be emptied so they
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 5061b550216..36d931573ff 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -395,10 +395,37 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
  * room for it beyond the ring size.  Send completion notices its special
  * wr_id and avoids working with the ring in that case.
  */
+#ifndef KERNEL_HAS_ATOMIC64
 static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
 				int ack_required)
 {
-	rds_ib_set_64bit(&ic->i_ack_next, seq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	ic->i_ack_next = seq;
+	if (ack_required)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+	unsigned long flags;
+	u64 seq;
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	seq = ic->i_ack_next;
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+	return seq;
+}
+#else
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+				int ack_required)
+{
+	atomic64_set(&ic->i_ack_next, seq);
 	if (ack_required) {
 		smp_mb__before_clear_bit();
 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
@@ -410,8 +437,10 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
 	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 	smp_mb__after_clear_bit();
 
-	return ic->i_ack_next;
+	return atomic64_read(&ic->i_ack_next);
 }
+#endif
+
 
 static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
 {
@@ -464,6 +493,10 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
  *  -	i_ack_next, which is the last sequence number we received
  *
  * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
  *
  * Reconnecting complicates this picture just slightly. When we
  * reconnect, we may be seeing duplicate packets. The peer
diff --git a/net/rds/iw.h b/net/rds/iw.h
index 70eb948f42f..b4fb2725289 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -131,7 +131,12 @@ struct rds_iw_connection {
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		i_ack_next;	/* next ACK to send */
+#else
+	spinlock_t		i_ack_lock;	/* protect i_ack_next */
 	u64			i_ack_next;	/* next ACK to send */
+#endif
 	struct rds_header	*i_ack;
 	struct ib_send_wr	i_ack_wr;
 	struct ib_sge		i_ack_sge;
@@ -391,13 +396,4 @@ rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
 	return &sge[1];
 }
 
-static inline void rds_iw_set_64bit(u64 *ptr, u64 val)
-{
-#if BITS_PER_LONG == 64
-	*ptr = val;
-#else
-	set_64bit(ptr, val);
-#endif
-}
-
 #endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index 0ffaa3e97ad..a416b0d492b 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -659,7 +659,11 @@ void rds_iw_conn_shutdown(struct rds_connection *conn)
 
 	/* Clear the ACK state */
 	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-	rds_iw_set_64bit(&ic->i_ack_next, 0);
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_set(&ic->i_ack_next, 0);
+#else
+	ic->i_ack_next = 0;
+#endif
 	ic->i_ack_recv = 0;
 
 	/* Clear flow control state */
@@ -693,6 +697,9 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
 	INIT_LIST_HEAD(&ic->iw_node);
 	mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&ic->i_ack_lock);
+#endif
 
 	/*
 	 * rds_iw_conn_shutdown() waits for these to be emptied so they
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index a1931f0027a..fde470fa50d 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -395,10 +395,37 @@ void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
  * room for it beyond the ring size.  Send completion notices its special
  * wr_id and avoids working with the ring in that case.
  */
+#ifndef KERNEL_HAS_ATOMIC64
 static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
 				int ack_required)
 {
-	rds_iw_set_64bit(&ic->i_ack_next, seq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	ic->i_ack_next = seq;
+	if (ack_required)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+	unsigned long flags;
+	u64 seq;
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	seq = ic->i_ack_next;
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+	return seq;
+}
+#else
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+				int ack_required)
+{
+	atomic64_set(&ic->i_ack_next, seq);
 	if (ack_required) {
 		smp_mb__before_clear_bit();
 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
@@ -410,8 +437,10 @@ static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
 	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 	smp_mb__after_clear_bit();
 
-	return ic->i_ack_next;
+	return atomic64_read(&ic->i_ack_next);
 }
+#endif
+
 
 static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
 {
@@ -464,6 +493,10 @@ static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credi
  *  -	i_ack_next, which is the last sequence number we received
  *
  * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
  *
  * Reconnecting complicates this picture just slightly. When we
  * reconnect, we may be seeing duplicate packets. The peer
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 06040070497..619f0a30a4e 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -28,6 +28,10 @@
  */
 #define RDS_PORT	18634
 
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
 #ifdef DEBUG
 #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
 #else
-- 
cgit v1.2.3-70-g09d2


From fa9a86ddc8ecd2830a5e773facc250f110300ae7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 2 Apr 2009 00:53:49 -0700
Subject: netfilter: use rcu_read_bh() in ipt_do_table()

Commit 784544739a25c30637397ace5489eeb6e15d7d49
(netfilter: iptables: lock free counters) forgot to disable BH
in arpt_do_table(), ipt_do_table() and  ip6t_do_table()

Use rcu_read_lock_bh() instead of rcu_read_lock() cures the problem.

Reported-and-bisected-by: Roman Mindalev <r000n@r000n.net>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Patrick McHardy <kaber@trash.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/arp_tables.c | 4 ++--
 net/ipv4/netfilter/ip_tables.c  | 4 ++--
 net/ipv6/netfilter/ip6_tables.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 35c5f6a5cb7..5ba533d234d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -253,7 +253,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock();
+	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
 	table_base = rcu_dereference(private->entries[smp_processor_id()]);
 
@@ -329,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 		}
 	} while (!hotdrop);
 
-	rcu_read_unlock();
+	rcu_read_unlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 82ee7c9049f..810c0b62c7d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -339,7 +339,7 @@ ipt_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock();
+	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
 	table_base = rcu_dereference(private->entries[smp_processor_id()]);
 
@@ -437,7 +437,7 @@ ipt_do_table(struct sk_buff *skb,
 		}
 	} while (!hotdrop);
 
-	rcu_read_unlock();
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e89cfa3a8f2..dfed176aed3 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -365,7 +365,7 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock();
+	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
 	table_base = rcu_dereference(private->entries[smp_processor_id()]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock();
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
-- 
cgit v1.2.3-70-g09d2


From f2bde7328633269ee935d9ed96535ade15cc348f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 1 Apr 2009 11:20:20 +0000
Subject: net: allow multiple dev per napi with GRO

GRO assumes that there is a one-to-one relationship between NAPI
structure and network device. Some devices like sky2 share multiple
devices on a single interrupt so only have one NAPI handler. Rather than
split GRO from NAPI, just have GRO assume if device changes that
it is a different flow.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 52fea5b28ca..91d792d17e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2472,8 +2472,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 		return GRO_NORMAL;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
-			skb_mac_header(p), skb_gro_mac_header(skb));
+		NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
+			&& !compare_ether_header(skb_mac_header(p),
+						 skb_gro_mac_header(skb));
 		NAPI_GRO_CB(p)->flush = 0;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 797108d134a91afca9fa59c572336b279bc66afb Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Wed, 1 Apr 2009 23:15:17 +0000
Subject: tcp: add helper for counter tweaking due mid-wq change
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need full-scale adjustment to fix a TCP miscount in the next
patch, so just move it into a helper and call for that from the
other places.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 15 ------------
 net/ipv4/tcp_output.c | 66 ++++++++++++++++++++++++++-------------------------
 2 files changed, 34 insertions(+), 47 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e54c76d7549..1b94b9bfe2d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -616,21 +616,6 @@ static inline int tcp_skb_mss(const struct sk_buff *skb)
 	return skb_shinfo(skb)->gso_size;
 }
 
-static inline void tcp_dec_pcount_approx_int(__u32 *count, const int decr)
-{
-	if (*count) {
-		*count -= decr;
-		if ((int)*count < 0)
-			*count = 0;
-	}
-}
-
-static inline void tcp_dec_pcount_approx(__u32 *count,
-					 const struct sk_buff *skb)
-{
-	tcp_dec_pcount_approx_int(count, tcp_skb_pcount(skb));
-}
-
 /* Events passed to congestion control interface */
 enum tcp_ca_event {
 	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c1f259d2d33..f1db89bb3aa 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -754,6 +754,36 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
 		tp->fackets_out -= decr;
 }
 
+/* Pcount in the middle of the write queue got changed, we need to do various
+ * tweaks to fix counters
+ */
+static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->packets_out -= decr;
+
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+		tp->sacked_out -= decr;
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+		tp->retrans_out -= decr;
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+		tp->lost_out -= decr;
+
+	/* Reno case is special. Sigh... */
+	if (tcp_is_reno(tp) && decr > 0)
+		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
+
+	tcp_adjust_fackets_out(sk, skb, decr);
+
+	if (tp->lost_skb_hint &&
+	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+	    (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked))
+		tp->lost_cnt_hint -= decr;
+
+	tcp_verify_left_out(tp);
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
@@ -836,28 +866,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 		int diff = old_factor - tcp_skb_pcount(skb) -
 			tcp_skb_pcount(buff);
 
-		tp->packets_out -= diff;
-
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
-			tp->sacked_out -= diff;
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
-			tp->retrans_out -= diff;
-
-		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
-			tp->lost_out -= diff;
-
-		/* Adjust Reno SACK estimate. */
-		if (tcp_is_reno(tp) && diff > 0) {
-			tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
-			tcp_verify_left_out(tp);
-		}
-		tcp_adjust_fackets_out(sk, skb, diff);
-
-		if (tp->lost_skb_hint &&
-		    before(TCP_SKB_CB(skb)->seq,
-			   TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
-		    (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked))
-			tp->lost_cnt_hint -= diff;
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
 	}
 
 	/* Link BUFF into the send queue. */
@@ -1768,22 +1778,14 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	 * packet counting does not break.
 	 */
 	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
-	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS)
-		tp->retrans_out -= tcp_skb_pcount(next_skb);
-	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST)
-		tp->lost_out -= tcp_skb_pcount(next_skb);
-	/* Reno case is special. Sigh... */
-	if (tcp_is_reno(tp) && tp->sacked_out)
-		tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
-
-	tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb));
-	tp->packets_out -= tcp_skb_pcount(next_skb);
 
 	/* changed transmit queue under us so clear hints */
 	tcp_clear_retrans_hints_partial(tp);
 	if (next_skb == tp->retransmit_skb_hint)
 		tp->retransmit_skb_hint = skb;
 
+	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+
 	sk_wmem_free_skb(sk, next_skb);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 9eb9362e569062e2f841b7a023e5fcde10ed63b4 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Wed, 1 Apr 2009 23:18:20 +0000
Subject: tcp: miscounts due to tcp_fragment pcount reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems that trivial reset of pcount to one was not sufficient
in tcp_retransmit_skb. Multiple counters experience a positive
miscount when skb's pcount gets lowered without the necessary
adjustments (depending on skb's sacked bits which exactly), at
worst a packets_out miscount can crash at RTO if the write queue
is empty!

Triggering this requires mss change, so bidir tcp or mtu probe or
like.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Tested-by: Uwe Bugla <uwe.bugla@gmx.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f1db89bb3aa..53300fa2359 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1893,7 +1893,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
-		tcp_init_tso_segs(sk, skb, cur_mss);
+		int oldpcount = tcp_skb_pcount(skb);
+
+		if (unlikely(oldpcount > 1)) {
+			tcp_init_tso_segs(sk, skb, cur_mss);
+			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
+		}
 	}
 
 	tcp_retrans_try_collapse(sk, skb, cur_mss);
-- 
cgit v1.2.3-70-g09d2