From f55bb7f9cb82dec2f2e803d7bd0fc5929248e4d8 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 21 Feb 2012 07:31:51 +0000 Subject: unix: Support peeking offset for datagram and seqpacket sockets The sk_peek_off manipulations are protected with the unix_sk->readlock mutex. This mutex is enough since all we need is to synchronize setting the offset vs reading the queue head. The latter is fully covered with the mentioned lock. The recently added __skb_recv_datagram's offset is used to pick the skb to read the data from. Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 85d3bb7490a..3d9481de031 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -530,6 +530,16 @@ static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *, struct msghdr *, size_t, int); +static void unix_set_peek_off(struct sock *sk, int val) +{ + struct unix_sock *u = unix_sk(sk); + + mutex_lock(&u->readlock); + sk->sk_peek_off = val; + mutex_unlock(&u->readlock); +} + + static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, @@ -570,6 +580,7 @@ static const struct proto_ops unix_dgram_ops = { .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, + .set_peek_off = unix_set_peek_off, }; static const struct proto_ops unix_seqpacket_ops = { @@ -591,6 +602,7 @@ static const struct proto_ops unix_seqpacket_ops = { .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, + .set_peek_off = unix_set_peek_off, }; static struct proto unix_proto = { @@ -1756,6 +1768,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; int 
err; + int peeked, skip; err = -EOPNOTSUPP; if (flags&MSG_OOB) @@ -1769,7 +1782,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; } - skb = skb_recv_datagram(sk, flags, noblock, &err); + skip = sk_peek_offset(sk, flags); + + skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err); if (!skb) { unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ @@ -1786,12 +1801,12 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (msg->msg_name) unix_copy_addr(msg, skb->sk); - if (size > skb->len) - size = skb->len; - else if (size < skb->len) + if (size > skb->len - skip) + size = skb->len - skip; + else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size); + err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size); if (err) goto out_free; @@ -1808,6 +1823,8 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); + + sk_peek_offset_bwd(sk, skb->len); } else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) @@ -1821,6 +1838,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, clearly however! */ + + sk_peek_offset_fwd(sk, size); + if (UNIXCB(skb).fp) siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); } -- cgit v1.2.3-70-g09d2 From fc0d753641f7b919c7273d9bd21ae6ab45e757f3 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 21 Feb 2012 07:32:06 +0000 Subject: unix: Support peeking offset for stream sockets The same here -- we can protect the sk_peek_off manipulations with the unix_sk->readlock mutex. The peeking of data from a stream socket is done in the datagram style, i.e. even if there's enough room for more data in the user buffer, only the head skb's data is copied in there. 
This feature is preserved when peeking data from a given offset -- the data is read till the nearest skb's boundary. Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3d9481de031..0be4d24f6ae 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -559,6 +559,7 @@ static const struct proto_ops unix_stream_ops = { .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, + .set_peek_off = unix_set_peek_off, }; static const struct proto_ops unix_dgram_ops = { @@ -1904,6 +1905,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, int target; int err = 0; long timeo; + int skip; err = -EINVAL; if (sk->sk_state != TCP_ESTABLISHED) @@ -1933,12 +1935,15 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; } + skip = sk_peek_offset(sk, flags); + do { int chunk; struct sk_buff *skb; unix_state_lock(sk); skb = skb_peek(&sk->sk_receive_queue); +again: if (skb == NULL) { unix_sk(sk)->recursion_level = 0; if (copied >= target) @@ -1973,6 +1978,13 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, unix_state_unlock(sk); break; } + + if (skip >= skb->len) { + skip -= skb->len; + skb = skb_peek_next(skb, &sk->sk_receive_queue); + goto again; + } + unix_state_unlock(sk); if (check_creds) { @@ -1992,8 +2004,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, sunaddr = NULL; } - chunk = min_t(unsigned int, skb->len, size); - if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + chunk = min_t(unsigned int, skb->len - skip, size); + if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { if (copied == 0) copied = -EFAULT; break; @@ -2005,6 +2017,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if 
(!(flags & MSG_PEEK)) { skb_pull(skb, chunk); + sk_peek_offset_bwd(sk, chunk); + if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); @@ -2022,6 +2036,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); + sk_peek_offset_fwd(sk, chunk); + break; } } while (size); -- cgit v1.2.3-70-g09d2 From 9f6f9af7694ede6314bed281eec74d588ba9474f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Feb 2012 23:24:55 +0000 Subject: af_unix: MSG_TRUNC support for dgram sockets Piergiorgio Beruto expressed the need to fetch size of first datagram in queue for AF_UNIX sockets and suggested a patch against SIOCINQ ioctl. I suggested instead to implement MSG_TRUNC support as a recv() input flag, as already done for RAW, UDP & NETLINK sockets. len = recv(fd, &byte, 1, MSG_PEEK | MSG_TRUNC); MSG_TRUNC asks recv() to return the real length of the packet, even when it was longer than the passed buffer. There is a risk that a userland application used MSG_TRUNC by accident (since it had no effect on af_unix sockets) and this might break after this patch. Signed-off-by: Eric Dumazet Tested-by: Piergiorgio Beruto CC: Michael Kerrisk Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0be4d24f6ae..8ee85aa79fa 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1845,7 +1845,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); } - err = size; + err = (flags & MSG_TRUNC) ? 
skb->len - skip : size; scm_recv(sock, msg, siocb->scm, flags); -- cgit v1.2.3-70-g09d2 From 40ffe67d2e89c7a475421d007becc11a2f88ea3d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Mar 2012 21:54:32 -0400 Subject: switch unix_sock to struct path Signed-off-by: Al Viro --- include/net/af_unix.h | 3 +-- net/unix/af_unix.c | 35 +++++++++++++++-------------------- net/unix/diag.c | 2 +- security/lsm_audit.c | 8 ++------ 4 files changed, 19 insertions(+), 29 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 5a4e29b168c..ca68e2cef23 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -49,8 +49,7 @@ struct unix_sock { /* WARNING: sk has to be the first member */ struct sock sk; struct unix_address *addr; - struct dentry *dentry; - struct vfsmount *mnt; + struct path path; struct mutex readlock; struct sock *peer; struct sock *other; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 85d3bb7490a..ef4b780ef63 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -293,7 +293,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) spin_lock(&unix_table_lock); sk_for_each(s, node, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { - struct dentry *dentry = unix_sk(s)->dentry; + struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && dentry->d_inode == i) { sock_hold(s); @@ -377,8 +377,7 @@ static void unix_sock_destructor(struct sock *sk) static int unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); - struct dentry *dentry; - struct vfsmount *mnt; + struct path path; struct sock *skpair; struct sk_buff *skb; int state; @@ -389,10 +388,9 @@ static int unix_release_sock(struct sock *sk, int embrion) unix_state_lock(sk); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; - dentry = u->dentry; - u->dentry = NULL; - mnt = u->mnt; - u->mnt = NULL; + path = u->path; + u->path.dentry = NULL; + u->path.mnt = NULL; state = sk->sk_state; 
sk->sk_state = TCP_CLOSE; unix_state_unlock(sk); @@ -425,10 +423,8 @@ static int unix_release_sock(struct sock *sk, int embrion) kfree_skb(skb); } - if (dentry) { - dput(dentry); - mntput(mnt); - } + if (path.dentry) + path_put(&path); sock_put(sk); @@ -628,8 +624,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); - u->dentry = NULL; - u->mnt = NULL; + u->path.dentry = NULL; + u->path.mnt = NULL; spin_lock_init(&u->lock); atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); @@ -789,9 +785,9 @@ static struct sock *unix_find_other(struct net *net, u = unix_find_socket_byname(net, sunname, len, type, hash); if (u) { struct dentry *dentry; - dentry = unix_sk(u)->dentry; + dentry = unix_sk(u)->path.dentry; if (dentry) - touch_atime(unix_sk(u)->mnt, dentry); + touch_atime(unix_sk(u)->path.mnt, dentry); } else goto fail; } @@ -897,8 +893,7 @@ out_mknod_drop_write: list = &unix_socket_table[addr->hash]; } else { list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; - u->dentry = path.dentry; - u->mnt = path.mnt; + u->path = path; } err = 0; @@ -1180,9 +1175,9 @@ restart: atomic_inc(&otheru->addr->refcnt); newu->addr = otheru->addr; } - if (otheru->dentry) { - newu->dentry = dget(otheru->dentry); - newu->mnt = mntget(otheru->mnt); + if (otheru->path.dentry) { + path_get(&otheru->path); + newu->path = otheru->path; } /* Set credentials */ diff --git a/net/unix/diag.c b/net/unix/diag.c index 6b7697fd911..ffd86518e64 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -29,7 +29,7 @@ rtattr_failure: static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb) { - struct dentry *dentry = unix_sk(sk)->dentry; + struct dentry *dentry = unix_sk(sk)->path.dentry; struct unix_diag_vfs *uv; if (dentry) { diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 293b8c45b1d..8b8f0902f6e 100644 --- 
a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -313,12 +313,8 @@ static void dump_common_audit_data(struct audit_buffer *ab, } case AF_UNIX: u = unix_sk(sk); - if (u->dentry) { - struct path path = { - .dentry = u->dentry, - .mnt = u->mnt - }; - audit_log_d_path(ab, " path=", &path); + if (u->path.dentry) { + audit_log_d_path(ab, " path=", &u->path); break; } if (!u->addr) -- cgit v1.2.3-70-g09d2 From 68ac1234fb949b66941d94dce4157742799fc581 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Mar 2012 08:21:57 -0400 Subject: switch touch_atime to struct path Signed-off-by: Al Viro --- fs/cachefiles/namei.c | 3 ++- fs/ecryptfs/file.c | 9 ++++----- fs/inode.c | 5 +++-- fs/namei.c | 2 +- fs/nfsd/vfs.c | 11 ++++++----- fs/stat.c | 2 +- include/linux/fs.h | 4 ++-- net/unix/af_unix.c | 4 ++-- 8 files changed, 21 insertions(+), 19 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index a0358c2189c..7f0771d3894 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -646,7 +646,8 @@ lookup_again: * (this is used to keep track of culling, and atimes are only * updated by read, write and readdir but not lookup or * open) */ - touch_atime(cache->mnt, next); + path.dentry = next; + touch_atime(&path); } /* open a file interface onto a data file */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index d3f95f941c4..2b17f2f9b12 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -48,8 +48,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, unsigned long nr_segs, loff_t pos) { ssize_t rc; - struct dentry *lower_dentry; - struct vfsmount *lower_vfsmount; + struct path lower; struct file *file = iocb->ki_filp; rc = generic_file_aio_read(iocb, iov, nr_segs, pos); @@ -60,9 +59,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, if (-EIOCBQUEUED == rc) rc = wait_on_sync_kiocb(iocb); if (rc >= 0) { - lower_dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); - 
lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); - touch_atime(lower_vfsmount, lower_dentry); + lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); + lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); + touch_atime(&lower); } return rc; } diff --git a/fs/inode.c b/fs/inode.c index 92de04b0baa..8b612813a6a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1499,9 +1499,10 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +void touch_atime(struct path *path) { - struct inode *inode = dentry->d_inode; + struct vfsmount *mnt = path->mnt; + struct inode *inode = path->dentry->d_inode; struct timespec now; if (inode->i_flags & S_NOATIME) diff --git a/fs/namei.c b/fs/namei.c index a0b82762e8f..0ccc74ee92f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -642,7 +642,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p) cond_resched(); current->total_link_count++; - touch_atime(link->mnt, dentry); + touch_atime(link); nd_set_link(nd, NULL); error = security_inode_follow_link(link->dentry, nd); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index edf6d3ed877..e59f71d0cf7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1541,30 +1541,31 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) { - struct dentry *dentry; struct inode *inode; mm_segment_t oldfs; __be32 err; int host_err; + struct path path; err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP); if (err) goto out; - dentry = fhp->fh_dentry; - inode = dentry->d_inode; + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = fhp->fh_dentry; + inode = path.dentry->d_inode; err = nfserr_inval; if (!inode->i_op->readlink) goto out; - 
touch_atime(fhp->fh_export->ex_path.mnt, dentry); + touch_atime(&path); /* N.B. Why does this call need a get_fs()?? * Remove the set_fs and watch the fireworks:-) --okir */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = inode->i_op->readlink(dentry, buf, *lenp); + host_err = inode->i_op->readlink(path.dentry, buf, *lenp); set_fs(oldfs); if (host_err < 0) diff --git a/fs/stat.c b/fs/stat.c index 8806b8997d2..86f13563a46 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -307,7 +307,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, if (inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { - touch_atime(path.mnt, path.dentry); + touch_atime(&path); error = inode->i_op->readlink(path.dentry, buf, bufsiz); } diff --git a/include/linux/fs.h b/include/linux/fs.h index b89dc4d60d9..9bbe1a9ac43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1812,11 +1812,11 @@ static inline void inode_inc_iversion(struct inode *inode) spin_unlock(&inode->i_lock); } -extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry); +extern void touch_atime(struct path *); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) - touch_atime(file->f_path.mnt, file->f_path.dentry); + touch_atime(&file->f_path); } int sync_inode(struct inode *inode, struct writeback_control *wbc); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ef4b780ef63..081679444a6 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -771,7 +771,7 @@ static struct sock *unix_find_other(struct net *net, goto put_fail; if (u->sk_type == type) - touch_atime(path.mnt, path.dentry); + touch_atime(&path); path_put(&path); @@ -787,7 +787,7 @@ static struct sock *unix_find_other(struct net *net, struct dentry *dentry; dentry = unix_sk(u)->path.dentry; if (dentry) - touch_atime(unix_sk(u)->path.mnt, dentry); + touch_atime(&unix_sk(u)->path); } else goto fail; } -- cgit v1.2.3-70-g09d2 From 
626cf236608505d376e4799adb4f7eb00a8594af Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 23 Mar 2012 15:02:27 -0700 Subject: poll: add poll_requested_events() and poll_does_not_wait() functions In some cases the poll() implementation in a driver has to do different things depending on the events the caller wants to poll for. An example is when a driver needs to start a DMA engine if the caller polls for POLLIN, but doesn't want to do that if POLLIN is not requested but instead only POLLOUT or POLLPRI is requested. This is something that can happen in the video4linux subsystem among others. Unfortunately, the current epoll/poll/select implementation doesn't provide that information reliably. The poll_table_struct does have it: it has a key field with the event mask. But once a poll() call matches one or more bits of that mask any following poll() calls are passed a NULL poll_table pointer. Also, the eventpoll implementation always left the key field at ~0 instead of using the requested events mask. This was changed in eventpoll.c so the key field now contains the actual events that should be polled for as set by the caller. The solution to the NULL poll_table pointer is to set the qproc field to NULL in poll_table once poll() matches the events, not the poll_table pointer itself. That way drivers can obtain the mask through a new poll_requested_events inline. The poll_table_struct can still be NULL since some kernel code calls it internally (netfs_state_poll() in ./drivers/staging/pohmelfs/netfs.h). In that case poll_requested_events() returns ~0 (i.e. all events). Very rarely drivers might want to know whether poll_wait will actually wait. If another earlier file descriptor in the set already matched the events the caller wanted to wait for, then the kernel will return from the select() call without waiting. This might be useful information in order to avoid doing expensive work. 
A new helper function poll_does_not_wait() is added that drivers can use to detect this situation. This is now used in sock_poll_wait() in include/net/sock.h. This was the only place in the kernel that needed this information. Drivers should no longer access any of the poll_table internals, but use the poll_requested_events() and poll_does_not_wait() access functions instead. In order to enforce that the poll_table fields are now prepended with an underscore and a comment was added warning against using them directly. This required a change in unix_dgram_poll() in unix/af_unix.c which used the key field to get the requested events. It's been replaced by a call to poll_requested_events(). For qproc it was especially important to change its name since the behavior of that field changes with this patch since this function pointer can now be NULL when that wasn't possible in the past. Any driver accessing the qproc or key fields directly will now fail to compile. Some notes regarding the correctness of this patch: the driver's poll() function is called with a 'struct poll_table_struct *wait' argument. This pointer may or may not be NULL, drivers can never rely on it being one or the other as that depends on whether or not an earlier file descriptor in the select()'s fdset matched the requested events. There are only three things a driver can do with the wait argument: 1) obtain the key field: events = wait ? wait->key : ~0; This will still work although it should be replaced with the new poll_requested_events() function (which does exactly the same). This will now even work better, since wait is no longer set to NULL unnecessarily. 2) use the qproc callback. This could be deadly since qproc can now be NULL. Renaming qproc should prevent this from happening. There are no kernel drivers that actually access this callback directly, BTW. 3) test whether wait == NULL to determine whether poll would return without waiting. 
This is no longer sufficient as the correct test is now wait == NULL || wait->_qproc == NULL. However, the worst that can happen here is a slight performance hit in the case where wait != NULL and wait->_qproc == NULL. In that case the driver will assume that poll_wait() will actually add the fd to the set of waiting file descriptors. Of course, poll_wait() will not do that since it tests for wait->_qproc. This will not break anything, though. There is only one place in the whole kernel where this happens (sock_poll_wait() in include/net/sock.h) and that code will be replaced by a call to poll_does_not_wait() in the next patch. Note that even if wait->_qproc != NULL drivers cannot rely on poll_wait() actually waiting. The next file descriptor from the set might match the event mask and thus any possible waits will never happen. Signed-off-by: Hans Verkuil Reviewed-by: Jonathan Corbet Reviewed-by: Al Viro Cc: Davide Libenzi Signed-off-by: Hans de Goede Cc: Mauro Carvalho Chehab Cc: David Miller Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 18 +++++++++++++++--- fs/select.c | 40 ++++++++++++++++++---------------------- include/linux/poll.h | 37 +++++++++++++++++++++++++++++++------ include/net/sock.h | 2 +- net/unix/af_unix.c | 2 +- 5 files changed, 66 insertions(+), 33 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4d9d3a45e35..ca300071e79 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -699,9 +699,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) { struct epitem *epi, *tmp; + poll_table pt; + init_poll_funcptr(&pt, NULL); list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + pt._key = epi->event.events; + if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & epi->event.events) return POLLIN | POLLRDNORM; else { @@ -1097,6 +1100,7 @@ static int ep_insert(struct 
eventpoll *ep, struct epoll_event *event, /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); + epq.pt._key = event->events; /* * Attach the item to the poll hooks and get current event bits. @@ -1191,6 +1195,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even { int pwake = 0; unsigned int revents; + poll_table pt; + + init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); @@ -1198,13 +1205,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; + pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); + revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); /* * If the item is "hot" and it is not registered inside the ready @@ -1239,6 +1247,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; + poll_table pt; + + init_poll_funcptr(&pt, NULL); /* * We can loop without lock because we are passed a task private list. 
@@ -1251,7 +1262,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + pt._key = epi->event.events; + revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & epi->event.events; /* diff --git a/fs/select.c b/fs/select.c index e782258d0de..ecfd0b125ba 100644 --- a/fs/select.c +++ b/fs/select.c @@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - entry->key = p->key; + entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -386,13 +386,11 @@ get_max: static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit) { - if (wait) { - wait->key = POLLEX_SET; - if (in & bit) - wait->key |= POLLIN_SET; - if (out & bit) - wait->key |= POLLOUT_SET; - } + wait->_key = POLLEX_SET; + if (in & bit) + wait->_key |= POLLIN_SET; + if (out & bit) + wait->_key |= POLLOUT_SET; } int do_select(int n, fd_set_bits *fds, struct timespec *end_time) @@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { - wait = NULL; + wait->_qproc = NULL; timed_out = 1; } @@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } } } @@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) *rexp = res_ex; cond_resched(); } - wait = NULL; + 
wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { @@ -720,7 +718,7 @@ struct poll_list { * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, - * if non-NULL. + * if pwait->_qproc is non-NULL. */ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) { @@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) if (file != NULL) { mask = DEFAULT_POLLMASK; if (file->f_op && file->f_op->poll) { - if (pwait) - pwait->key = pollfd->events | - POLLERR | POLLHUP; + pwait->_key = pollfd->events|POLLERR|POLLHUP; mask = file->f_op->poll(file, pwait); } /* Mask out unneeded events. */ @@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { - pt = NULL; + pt->_qproc = NULL; timed_out = 1; } @@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it - * and kill the poll_table, so we don't + * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt)) { count++; - pt = NULL; + pt->_qproc = NULL; } } } /* * All waiters have already been registered, so don't provide - * a poll_table to them on the next loop iteration. + * a poll_table->_qproc to them on the next loop iteration. 
*/ - pt = NULL; + pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) diff --git a/include/linux/poll.h b/include/linux/poll.h index cf40010ce0c..48fe8bc398d 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -32,21 +32,46 @@ struct poll_table_struct; */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); +/* + * Do not touch the structure directly, use the access functions + * poll_does_not_wait() and poll_requested_events() instead. + */ typedef struct poll_table_struct { - poll_queue_proc qproc; - unsigned long key; + poll_queue_proc _qproc; + unsigned long _key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && wait_address) - p->qproc(filp, wait_address, p); + if (p && p->_qproc && wait_address) + p->_qproc(filp, wait_address, p); +} + +/* + * Return true if it is guaranteed that poll will not wait. This is the case + * if the poll() of another file descriptor in the set got an event, so there + * is no need for waiting. + */ +static inline bool poll_does_not_wait(const poll_table *p) +{ + return p == NULL || p->_qproc == NULL; +} + +/* + * Return the set of events that the application wants to poll for. + * This is useful for drivers that need to know whether a DMA transfer has + * to be started implicitly on poll(). You typically only want to do that + * if the application is actually polling for POLLIN and/or POLLOUT. + */ +static inline unsigned long poll_requested_events(const poll_table *p) +{ + return p ? 
p->_key : ~0UL; } static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { - pt->qproc = qproc; - pt->key = ~0UL; /* all events enabled */ + pt->_qproc = qproc; + pt->_key = ~0UL; /* all events enabled */ } struct poll_table_entry { diff --git a/include/net/sock.h b/include/net/sock.h index 04bc0b30e9e..a6ba1f8871f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1854,7 +1854,7 @@ static inline bool wq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { - if (p && wait_address) { + if (!poll_does_not_wait(p) && wait_address) { poll_wait(filp, wait_address, p); /* * We need to be sure we are in sync with the diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb4277c3318..d510353ef43 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2206,7 +2206,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, } /* No write status requested, avoid expensive OUT tests. */ - if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT))) + if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT))) return mask; writable = unix_writable(sk); -- cgit v1.2.3-70-g09d2