From 7123aaa3a1416529ce461e98108e6b343b294643 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jun 2012 05:03:21 +0000 Subject: af_unix: speedup /proc/net/unix /proc/net/unix has quadratic behavior, and can hold unix_table_lock for a while if high number of unix sockets are alive. (90 ms for 200k sockets...) We already have a hash table, so its quite easy to use it. Problem is unbound sockets are still hashed in a single hash slot (unix_socket_table[UNIX_HASH_TABLE]) This patch also spreads unbound sockets to 256 hash slots, to speedup both /proc/net/unix and unix_diag. Time to read /proc/net/unix with 200k unix sockets : (time dd if=/proc/net/unix of=/dev/null bs=4k) before : 520 secs after : 2 secs Signed-off-by: Eric Dumazet Cc: Steven Whitehouse Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 110 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 46 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 641f2e47f16..cf83f6b5ac9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -115,15 +115,24 @@ #include #include -struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; +struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); DEFINE_SPINLOCK(unix_table_lock); EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; -#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) -#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE) +static struct hlist_head *unix_sockets_unbound(void *addr) +{ + unsigned long hash = (unsigned long)addr; + + hash ^= hash >> 16; + hash ^= hash >> 8; + hash %= UNIX_HASH_SIZE; + return &unix_socket_table[UNIX_HASH_SIZE + hash]; +} + +#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE) #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -645,7 +654,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); - unix_insert_socket(unix_sockets_unbound, sk); + unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) atomic_long_dec(&unix_nr_socks); @@ -2239,47 +2248,58 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, } #ifdef CONFIG_PROC_FS -static struct sock *first_unix_socket(int *i) -{ - for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); - } - return NULL; -} -static struct sock *next_unix_socket(int *i, struct sock *s) -{ - struct sock *next = sk_next(s); - /* More in this chain? */ - if (next) - return next; - /* Look for next non-empty chain. */ - for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); - } - return NULL; -} +#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) struct unix_iter_state { struct seq_net_private p; - int i; }; -static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos) +static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { - struct unix_iter_state *iter = seq->private; - loff_t off = 0; - struct sock *s; + unsigned long offset = get_offset(*pos); + unsigned long bucket = get_bucket(*pos); + struct sock *sk; + unsigned long count = 0; - for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { - if (sock_net(s) != seq_file_net(seq)) + for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) { + if (sock_net(sk) != seq_file_net(seq)) continue; - if (off == pos) - return s; - ++off; + if (++count == offset) + break; } + + return sk; +} + +static struct sock *unix_next_socket(struct seq_file *seq, + struct sock *sk, + loff_t *pos) +{ + unsigned long bucket; + + while (sk > (struct sock *)SEQ_START_TOKEN) { + sk = sk_next(sk); + if (!sk) + goto next_bucket; + if (sock_net(sk) == seq_file_net(seq)) + return sk; + } + + do { + sk = unix_from_bucket(seq, pos); + if (sk) + return sk; + +next_bucket: + bucket = get_bucket(*pos) + 1; + *pos = set_bucket_offset(bucket, 1); + } while (bucket < ARRAY_SIZE(unix_socket_table)); + return NULL; } @@ -2287,22 +2307,20 @@ static void *unix_seq_start(struct seq_file *seq, loff_t *pos) __acquires(unix_table_lock) { spin_lock(&unix_table_lock); - return *pos ? unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN; + + if (!*pos) + return SEQ_START_TOKEN; + + if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table)) + return NULL; + + return unix_next_socket(seq, NULL, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct unix_iter_state *iter = seq->private; - struct sock *sk = v; ++*pos; - - if (v == SEQ_START_TOKEN) - sk = first_unix_socket(&iter->i); - else - sk = next_unix_socket(&iter->i, sk); - while (sk && (sock_net(sk) != seq_file_net(seq))) - sk = next_unix_socket(&iter->i, sk); - return sk; + return unix_next_socket(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) -- cgit v1.2.3-70-g09d2 From 8b51b064a6da90c68af5385a874968829a2a0ed7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jun 2012 22:10:20 +0000 Subject: af_unix: remove unix_iter_state As pointed out by Michael Tokarev , struct unix_iter_state is no longer needed. Suggested-by: Michael Tokarev Signed-off-by: Eric Dumazet Cc: Steven Whitehouse Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cf83f6b5ac9..79981d97bc9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2255,10 +2255,6 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) -struct unix_iter_state { - struct seq_net_private p; -}; - static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); @@ -2383,7 +2379,7 @@ static const struct seq_operations unix_seq_ops = { static int unix_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &unix_seq_ops, - sizeof(struct unix_iter_state)); + sizeof(struct seq_net_private)); } static const struct file_operations unix_seq_fops = { -- cgit v1.2.3-70-g09d2 From 921a1650de9eed40dd64d681aba4a4d98856f289 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 01:15:31 +0400 Subject: new helper: done_path_create() releases what needs to be released after {kern,user}_path_create() Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/syscalls.c | 4 +--- drivers/base/devtmpfs.c | 9 ++------- fs/namei.c | 24 ++++++++++++------------ fs/ocfs2/refcounttree.c | 4 +--- include/linux/namei.h | 1 + net/unix/af_unix.c | 9 ++++----- 6 files changed, 21 insertions(+), 30 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 8591bb62d7f..5b7d8ffbf89 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -70,9 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - mutex_unlock(&path.dentry->d_inode->i_mutex); - dput(dentry); - path_put(&path); + done_path_create(&path, dentry); } return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index d91a3a0b232..deb4a456cf8 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -156,9 +156,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!err) /* mark as kernel-created inode */ dentry->d_inode->i_private = &thread; - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return err; } @@ -218,10 +216,7 @@ static int handle_create(const char *nodename, umode_t mode, struct device *dev) /* mark as kernel-created inode */ dentry->d_inode->i_private = &thread; } - dput(dentry); - - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return err; } diff --git a/fs/namei.c b/fs/namei.c index 2ccc35c4dc2..5bc6f3d1dc8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2893,6 +2893,14 @@ out: } EXPORT_SYMBOL(kern_path_create); +void done_path_create(struct path *path, struct dentry *dentry) +{ + dput(dentry); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); +} +EXPORT_SYMBOL(done_path_create); + struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) { char *tmp = getname(pathname); @@ -2989,9 +2997,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, out_drop_write: mnt_drop_write(path.mnt); out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return error; } @@ -3048,9 +3054,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) out_drop_write: mnt_drop_write(path.mnt); out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return error; } @@ -3334,9 +3338,7 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, out_drop_write: mnt_drop_write(path.mnt); out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); out_putname: putname(from); return error; @@ -3446,9 +3448,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, out_drop_write: mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f32d7cbb7a..23cf78f6850 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4477,9 +4477,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, new_dentry, preserve); mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); diff --git a/include/linux/namei.h b/include/linux/namei.h index d2ef8b34b96..4bf19d8174e 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -67,6 +67,7 @@ extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, int); extern struct dentry *user_path_create(int, const char __user *, struct path *, int); +extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 641f2e47f16..e8239540683 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -887,8 +887,9 @@ out_mknod_drop_write: mnt_drop_write(path.mnt); if (err) goto out_mknod_dput; - mutex_unlock(&path.dentry->d_inode->i_mutex); - dput(path.dentry); + mntget(path.mnt); + dget(dentry); + done_path_create(&path, dentry); path.dentry = dentry; addr->hash = UNIX_HASH_SIZE; @@ -923,9 +924,7 @@ out: return err; out_mknod_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); out_mknod_parent: if (err == -EEXIST) err = -EADDRINUSE; -- cgit v1.2.3-70-g09d2 From a8104a9fcdeb82e22d7acd55fca20746581067d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 02:25:00 +0400 Subject: pull mnt_want_write()/mnt_drop_write() into kern_path_create()/done_path_create() resp. One side effect - attempt to create a cross-device link on a read-only fs fails with EROFS instead of EXDEV now. Makes more sense, POSIX allows, etc. Signed-off-by: Al Viro --- fs/namei.c | 57 ++++++++++++++++--------------------------------- fs/ocfs2/refcounttree.c | 7 ------ net/unix/af_unix.c | 4 ---- 3 files changed, 18 insertions(+), 50 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/fs/namei.c b/fs/namei.c index cf362dc9d1f..a3fb78fd70d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2865,10 +2865,11 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) - goto fail; + goto unlock; + error = -EEXIST; if (dentry->d_inode) - goto eexist; + goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -2876,16 +2877,18 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!is_dir && nd.last.name[nd.last.len])) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); + error = -ENOENT; goto fail; } + error = mnt_want_write(nd.path.mnt); + if (error) + goto fail; *path = nd.path; return dentry; -eexist: - dput(dentry); - dentry = ERR_PTR(-EEXIST); fail: + dput(dentry); + dentry = ERR_PTR(error); +unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); out: path_put(&nd.path); @@ -2897,6 +2900,7 @@ void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); mutex_unlock(&path->dentry->d_inode->i_mutex); + mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); @@ -2974,12 +2978,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mknod(&path, dentry, mode, dev); if (error) - goto out_drop_write; + goto out; switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); @@ -2992,11 +2993,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); break; } -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: +out: done_path_create(&path, dentry); - return error; } @@ -3042,16 +3040,9 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mkdir(&path, dentry, mode); - if (error) - goto out_drop_write; - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); return error; } @@ -3326,16 +3317,9 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, if (IS_ERR(dentry)) goto out_putname; - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_symlink(&path, dentry, from); - if (error) - goto out_drop_write; - error = vfs_symlink(path.dentry->d_inode, dentry, from); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: + if (!error) + error = vfs_symlink(path.dentry->d_inode, dentry, from); done_path_create(&path, dentry); out_putname: putname(from); @@ -3436,15 +3420,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = mnt_want_write(new_path.mnt); - if (error) - goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) - goto out_drop_write; + goto out_dput; error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); -out_drop_write: - mnt_drop_write(new_path.mnt); out_dput: done_path_create(&new_path, new_dentry); out: diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 23cf78f6850..30a055049e1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4466,16 +4466,9 @@ int ocfs2_reflink_ioctl(struct inode *inode, goto out_dput; } - error = mnt_want_write(new_path.mnt); - if (error) { - mlog_errno(error); - goto out_dput; - } - error = ocfs2_vfs_reflink(old_path.dentry, new_path.dentry->d_inode, new_dentry, preserve); - mnt_drop_write(new_path.mnt); out_dput: done_path_create(&new_path, new_dentry); out: diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e8239540683..88ab72820b9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -876,15 +876,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = mnt_want_write(path.mnt); - if (err) - goto out_mknod_dput; err = security_path_mknod(&path, dentry, mode, 0); if (err) goto out_mknod_drop_write; err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); out_mknod_drop_write: - mnt_drop_write(path.mnt); if (err) goto out_mknod_dput; mntget(path.mnt); -- cgit v1.2.3-70-g09d2 From faf02010290e202e275c1bf94ca9dd808bf85607 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 02:37:29 +0400 Subject: clean unix_bind() up a bit Signed-off-by: Al Viro --- net/unix/af_unix.c | 88 ++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 45 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 88ab72820b9..b7667f81c49 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -814,6 +814,34 @@ fail: return NULL; } +static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +{ + struct dentry *dentry; + struct path path; + int err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + return err; + + /* + * All right, let's create it. + */ + err = security_path_mknod(&path, dentry, mode, 0); + if (!err) { + err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); + if (!err) { + res->mnt = mntget(path.mnt); + res->dentry = dget(dentry); + } + } + done_path_create(&path, dentry); + return err; +} static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { @@ -822,8 +850,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - struct dentry *dentry = NULL; - struct path path; int err; unsigned int hash; struct unix_address *addr; @@ -860,40 +886,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) atomic_set(&addr->refcnt, 1); if (sun_path[0]) { - umode_t mode; - err = 0; - /* - * Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_mknod_parent; - - /* - * All right, let's create it. - */ - mode = S_IFSOCK | + struct path path; + umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = security_path_mknod(&path, dentry, mode, 0); - if (err) - goto out_mknod_drop_write; - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); -out_mknod_drop_write: - if (err) - goto out_mknod_dput; - mntget(path.mnt); - dget(dentry); - done_path_create(&path, dentry); - path.dentry = dentry; - + err = unix_mknod(sun_path, mode, &path); + if (err) { + if (err == -EEXIST) + err = -EADDRINUSE; + unix_release_addr(addr); + goto out_up; + } addr->hash = UNIX_HASH_SIZE; - } - - spin_lock(&unix_table_lock); - - if (!sun_path[0]) { + hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); + spin_lock(&unix_table_lock); + u->path = path; + list = &unix_socket_table[hash]; + } else { + spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { @@ -902,9 +911,6 @@ out_mknod_drop_write: } list = &unix_socket_table[addr->hash]; - } else { - list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; - u->path = path; } err = 0; @@ -918,14 +924,6 @@ out_up: mutex_unlock(&u->readlock); out: return err; - -out_mknod_dput: - done_path_create(&path, dentry); -out_mknod_parent: - if (err == -EEXIST) - err = -EADDRINUSE; - unix_release_addr(addr); - goto out_up; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) -- cgit v1.2.3-70-g09d2 From e0e3cea46d31d23dc40df0a49a7a2c04fe8edfea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 06:21:17 +0000 Subject: af_netlink: force credentials passing [CVE-2012-3520] Pablo Neira Ayuso discovered that avahi and potentially NetworkManager accept spoofed Netlink messages because of a kernel bug. The kernel passes all-zero SCM_CREDENTIALS ancillary data to the receiver if the sender did not provide such data, instead of not including any such data at all or including the correct data from the peer (as it is the case with AF_UNIX). This bug was introduced in commit 16e572626961 (af_unix: dont send SCM_CREDENTIALS by default) This patch forces passing credentials for netlink, as before the regression. Another fix would be to not add SCM_CREDENTIALS in netlink messages if not provided by the sender, but it might break some programs. With help from Florian Weimer & Petr Matousek This issue is designated as CVE-2012-3520 Signed-off-by: Eric Dumazet Cc: Petr Matousek Cc: Florian Weimer Cc: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/scm.h | 4 +++- net/netlink/af_netlink.c | 2 +- net/unix/af_unix.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/scm.h b/include/net/scm.h index 079d7887dac..7dc0854f0b3 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -70,9 +70,11 @@ static __inline__ void scm_destroy(struct scm_cookie *scm) } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm) + struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); + if (forcecreds) + scm_set_cred(scm, task_tgid(current), current_cred()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5463969da45..1445d73533e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e4768c180da..c5ee4ff6136 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1450,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; @@ -1619,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; -- cgit v1.2.3-70-g09d2