From 0856a304091b33a8e8f9f9c98e776f425af2b625 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Mon, 22 Aug 2011 14:57:26 +0000 Subject: Scm: Remove unnecessary pid & credential references in Unix socket's send and receive path Patch series 109f6e39..7361c36c back in 2.6.36 added functionality to allow credentials to work across pid namespaces for packets sent via UNIX sockets. However, the atomic reference counts on pid and credentials caused plenty of cache bouncing when there are numerous threads of the same pid sharing a UNIX socket. This patch mitigates the problem by eliminating extraneous reference counts on pid and credentials on both send and receive path of UNIX sockets. I found a 2x improvement in hackbench's threaded case. On the receive path in unix_dgram_recvmsg, currently there is an increment of reference count on pid and credentials in scm_set_cred. Then there are two decrement of the reference counts. Once in scm_recv and once when skb_free_datagram call skb->destructor function unix_destruct_scm. One pair of increment and decrement of ref count on pid and credentials can be eliminated from the receive path. Until we destroy the skb, we already set a reference when we created the skb on the send side. On the send path, there are two increments of ref count on pid and credentials, once in scm_send and once in unix_scm_to_skb. Then there is a decrement of the reference counts in scm_destroy's call to scm_destroy_cred at the end of unix_dgram_sendmsg functions. One pair of increment and decrement of the reference counts can be removed so we only need to increment the ref counts once. By incorporating these changes, for hackbench running on a 4 socket NHM-EX machine with 40 cores, the execution of hackbench on 50 groups of 20 threads sped up by factor of 2. Hackbench command used for testing: ./hackbench 50 thread 2000 Signed-off-by: Tim Chen Signed-off-by: David S. Miller --- net/unix/af_unix.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ec68e1c05b8..e6d9d1014ed 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1378,11 +1378,17 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) return max_level; } -static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) +static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, + bool send_fds, bool ref) { int err = 0; - UNIXCB(skb).pid = get_pid(scm->pid); - UNIXCB(skb).cred = get_cred(scm->cred); + if (ref) { + UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).cred = get_cred(scm->cred); + } else { + UNIXCB(skb).pid = scm->pid; + UNIXCB(skb).cred = scm->cred; + } UNIXCB(skb).fp = NULL; if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1407,7 +1413,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, int namelen = 0; /* fake GCC */ int err; unsigned hash; - struct sk_buff *skb; + struct sk_buff *skb = NULL; long timeo; struct scm_cookie tmp_scm; int max_level; @@ -1448,7 +1454,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (skb == NULL) goto out; - err = unix_scm_to_skb(siocb->scm, skb, true); + err = unix_scm_to_skb(siocb->scm, skb, true, false); if (err < 0) goto out_free; max_level = err + 1; @@ -1544,7 +1550,7 @@ restart: unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); - scm_destroy(siocb->scm); + scm_release(siocb->scm); return len; out_unlock: @@ -1554,7 +1560,8 @@ out_free: out: if (other) sock_put(other); - scm_destroy(siocb->scm); + if (skb == NULL) + scm_destroy(siocb->scm); return err; } @@ -1566,7 +1573,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sock *sk = sock->sk; struct sock *other = NULL; int err, size; - struct sk_buff *skb; + struct sk_buff *skb = NULL; int sent = 0; struct scm_cookie tmp_scm; bool fds_sent = false; @@ -1631,11 +1638,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, size = min_t(int, size, skb_tailroom(skb)); - /* Only send the fds in the first buffer */ - err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); + /* Only send the fds and no ref to pid in the first buffer */ + err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, fds_sent); if (err < 0) { kfree_skb(skb); - goto out_err; + goto out; } max_level = err + 1; fds_sent = true; @@ -1643,7 +1650,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err) { kfree_skb(skb); - goto out_err; + goto out; } unix_state_lock(other); @@ -1660,7 +1667,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, sent += size; } - scm_destroy(siocb->scm); + if (skb) + scm_release(siocb->scm); + else + scm_destroy(siocb->scm); siocb->scm = NULL; return sent; @@ -1673,7 +1683,9 @@ pipe_err: send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: - scm_destroy(siocb->scm); + if (skb == NULL) + scm_destroy(siocb->scm); +out: siocb->scm = NULL; return sent ? : err; } @@ -1777,7 +1789,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); } - scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); + scm_set_cred_noref(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); unix_set_secdata(siocb->scm, skb); if (!(flags & MSG_PEEK)) { @@ -1939,7 +1951,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } } else { /* Copy credentials */ - scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); + scm_set_cred_noref(siocb->scm, UNIXCB(skb).pid, + UNIXCB(skb).cred); check_creds = 1; } -- cgit v1.2.3-70-g09d2 From f78a5fda9116525809d088917638be912b85f838 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 16 Sep 2011 19:34:00 -0400 Subject: Revert "Scm: Remove unnecessary pid & credential references in Unix socket's send and receive path" This reverts commit 0856a304091b33a8e8f9f9c98e776f425af2b625. As requested by Eric Dumazet, it has various ref-counting problems and has introduced regressions. Eric will add a more suitable version of this performance fix. Signed-off-by: David S. Miller --- include/net/scm.h | 22 +++------------------- net/unix/af_unix.c | 45 ++++++++++++++++----------------------------- 2 files changed, 19 insertions(+), 48 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/scm.h b/include/net/scm.h index 68e1e481658..745460fa2f0 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -53,14 +53,6 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, cred_to_ucred(pid, cred, &scm->creds); } -static __inline__ void scm_set_cred_noref(struct scm_cookie *scm, - struct pid *pid, const struct cred *cred) -{ - scm->pid = pid; - scm->cred = cred; - cred_to_ucred(pid, cred, &scm->creds); -} - static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); @@ -78,15 +70,6 @@ static __inline__ void scm_destroy(struct scm_cookie *scm) __scm_destroy(scm); } -static __inline__ void scm_release(struct scm_cookie *scm) -{ - /* keep ref on pid and cred */ - scm->pid = NULL; - scm->cred = NULL; - if (scm->fp) - __scm_destroy(scm); -} - static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { @@ -125,14 +108,15 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, if (!msg->msg_control) { if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp) msg->msg_flags |= MSG_CTRUNC; - if (scm && scm->fp) - __scm_destroy(scm); + scm_destroy(scm); return; } if (test_bit(SOCK_PASSCRED, &sock->flags)) put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(scm->creds), &scm->creds); + scm_destroy_cred(scm); + scm_passec(sock, msg, scm); if (!scm->fp) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e6d9d1014ed..ec68e1c05b8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1378,17 +1378,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) return max_level; } -static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, - bool send_fds, bool ref) +static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; - if (ref) { - UNIXCB(skb).pid = get_pid(scm->pid); - UNIXCB(skb).cred = get_cred(scm->cred); - } else { - UNIXCB(skb).pid = scm->pid; - UNIXCB(skb).cred = scm->cred; - } + UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).cred = get_cred(scm->cred); UNIXCB(skb).fp = NULL; if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1413,7 +1407,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, int namelen = 0; /* fake GCC */ int err; unsigned hash; - struct sk_buff *skb = NULL; + struct sk_buff *skb; long timeo; struct scm_cookie tmp_scm; int max_level; @@ -1454,7 +1448,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (skb == NULL) goto out; - err = unix_scm_to_skb(siocb->scm, skb, true, false); + err = unix_scm_to_skb(siocb->scm, skb, true); if (err < 0) goto out_free; max_level = err + 1; @@ -1550,7 +1544,7 @@ restart: unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); - scm_release(siocb->scm); + scm_destroy(siocb->scm); return len; out_unlock: @@ -1560,8 +1554,7 @@ out_free: out: if (other) sock_put(other); - if (skb == NULL) - scm_destroy(siocb->scm); + scm_destroy(siocb->scm); return err; } @@ -1573,7 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sock *sk = sock->sk; struct sock *other = NULL; int err, size; - struct sk_buff *skb = NULL; + struct sk_buff *skb; int sent = 0; struct scm_cookie tmp_scm; bool fds_sent = false; @@ -1638,11 +1631,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, size = min_t(int, size, skb_tailroom(skb)); - /* Only send the fds and no ref to pid in the first buffer */ - err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, fds_sent); + /* Only send the fds in the first buffer */ + err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); if (err < 0) { kfree_skb(skb); - goto out; + goto out_err; } max_level = err + 1; fds_sent = true; @@ -1650,7 +1643,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err) { kfree_skb(skb); - goto out; + goto out_err; } unix_state_lock(other); @@ -1667,10 +1660,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, sent += size; } - if (skb) - scm_release(siocb->scm); - else - scm_destroy(siocb->scm); + scm_destroy(siocb->scm); siocb->scm = NULL; return sent; @@ -1683,9 +1673,7 @@ pipe_err: send_sig(SIGPIPE, current, 0); err = -EPIPE; out_err: - if (skb == NULL) - scm_destroy(siocb->scm); -out: + scm_destroy(siocb->scm); siocb->scm = NULL; return sent ? : err; } @@ -1789,7 +1777,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); } - scm_set_cred_noref(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); unix_set_secdata(siocb->scm, skb); if (!(flags & MSG_PEEK)) { @@ -1951,8 +1939,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } } else { /* Copy credentials */ - scm_set_cred_noref(siocb->scm, UNIXCB(skb).pid, - UNIXCB(skb).cred); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); check_creds = 1; } -- cgit v1.2.3-70-g09d2 From 16e5726269611b71c930054ffe9b858c1cea88eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Sep 2011 05:52:27 +0000 Subject: af_unix: dont send SCM_CREDENTIALS by default Since commit 7361c36c5224 (af_unix: Allow credentials to work across user and pid namespaces) af_unix performance dropped a lot. This is because we now take a reference on pid and cred in each write(), and release them in read(), usually done from another process, eventually from another cpu. This triggers false sharing. # Events: 154K cycles # # Overhead Command Shared Object Symbol # ........ ....... .................. ......................... # 10.40% hackbench [kernel.kallsyms] [k] put_pid 8.60% hackbench [kernel.kallsyms] [k] unix_stream_recvmsg 7.87% hackbench [kernel.kallsyms] [k] unix_stream_sendmsg 6.11% hackbench [kernel.kallsyms] [k] do_raw_spin_lock 4.95% hackbench [kernel.kallsyms] [k] unix_scm_to_skb 4.87% hackbench [kernel.kallsyms] [k] pid_nr_ns 4.34% hackbench [kernel.kallsyms] [k] cred_to_ucred 2.39% hackbench [kernel.kallsyms] [k] unix_destruct_scm 2.24% hackbench [kernel.kallsyms] [k] sub_preempt_count 1.75% hackbench [kernel.kallsyms] [k] fget_light 1.51% hackbench [kernel.kallsyms] [k] __mutex_lock_interruptible_slowpath 1.42% hackbench [kernel.kallsyms] [k] sock_alloc_send_pskb This patch includes SCM_CREDENTIALS information in a af_unix message/skb only if requested by the sender, [man 7 unix for details how to include ancillary data using sendmsg() system call] Note: This might break buggy applications that expected SCM_CREDENTIAL from an unaware write() system call, and receiver not using SO_PASSCRED socket option. If SOCK_PASSCRED is set on source or destination socket, we still include credentials for mere write() syscalls. Performance boost in hackbench : more than 50% gain on a 16 thread machine (2 quad-core cpus, 2 threads per core) hackbench 20 thread 2000 4.228 sec instead of 9.102 sec Signed-off-by: Eric Dumazet Acked-by: Tim Chen Signed-off-by: David S. Miller --- include/net/scm.h | 5 ++--- net/core/scm.c | 10 ++++++---- net/netlink/af_netlink.c | 5 ++--- net/unix/af_unix.c | 24 +++++++++++++++++++++++- 4 files changed, 33 insertions(+), 11 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/scm.h b/include/net/scm.h index 745460fa2f0..d456f4c71a3 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -49,7 +49,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, const struct cred *cred) { scm->pid = get_pid(pid); - scm->cred = get_cred(cred); + scm->cred = cred ? get_cred(cred) : NULL; cred_to_ucred(pid, cred, &scm->creds); } @@ -73,8 +73,7 @@ static __inline__ void scm_destroy(struct scm_cookie *scm) static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { - scm_set_cred(scm, task_tgid(current), current_cred()); - scm->fp = NULL; + memset(scm, 0, sizeof(*scm)); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; diff --git a/net/core/scm.c b/net/core/scm.c index 811b53fb330..ff52ad0a515 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -173,7 +173,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - if (pid_vnr(p->pid) != p->creds.pid) { + if (!p->pid || pid_vnr(p->pid) != p->creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(p->creds.pid); @@ -183,8 +183,9 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) p->pid = pid; } - if ((p->cred->euid != p->creds.uid) || - (p->cred->egid != p->creds.gid)) { + if (!p->cred || + (p->cred->euid != p->creds.uid) || + (p->cred->egid != p->creds.gid)) { struct cred *cred; err = -ENOMEM; cred = prepare_creds(); @@ -193,7 +194,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) cred->uid = cred->euid = p->creds.uid; cred->gid = cred->egid = p->creds.gid; - put_cred(p->cred); + if (p->cred) + put_cred(p->cred); p->cred = cred; } break; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4330db99fab..1201b6d4183 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1324,10 +1324,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; - if (NULL == siocb->scm) { + if (NULL == siocb->scm) siocb->scm = &scm; - memset(&scm, 0, sizeof(scm)); - } + err = scm_send(sock, msg, siocb->scm); if (err < 0) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ec68e1c05b8..466fbcc5cf7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1381,8 +1381,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; + UNIXCB(skb).pid = get_pid(scm->pid); - UNIXCB(skb).cred = get_cred(scm->cred); + if (scm->cred) + UNIXCB(skb).cred = get_cred(scm->cred); UNIXCB(skb).fp = NULL; if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); @@ -1391,6 +1393,24 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } +/* + * Some apps rely on write() giving SCM_CREDENTIALS + * We include credentials if source or destination socket + * asserted SOCK_PASSCRED. + */ +static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, + const struct sock *other) +{ + if (UNIXCB(skb).cred) + return; + if (test_bit(SOCK_PASSCRED, &sock->flags) || + !other->sk_socket || + test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) { + UNIXCB(skb).pid = get_pid(task_tgid(current)); + UNIXCB(skb).cred = get_current_cred(); + } +} + /* * Send AF_UNIX data. */ @@ -1538,6 +1558,7 @@ restart: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); + maybe_add_creds(skb, sock, other); skb_queue_tail(&other->sk_receive_queue, skb); if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; @@ -1652,6 +1673,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, (other->sk_shutdown & RCV_SHUTDOWN)) goto pipe_err_free; + maybe_add_creds(skb, sock, other); skb_queue_tail(&other->sk_receive_queue, skb); if (max_level > unix_sk(other)->recursion_level) unix_sk(other)->recursion_level = max_level; -- cgit v1.2.3-70-g09d2