From 1fd36adcd98c14d2fd97f545293c488775cb2823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:54 +0000 Subject: Replace the fd_sets in struct fdtable with an array of unsigned longs Replace the fd_sets in struct fdtable with an array of unsigned longs and then use the standard non-atomic bit operations rather than the FD_* macros. This: (1) Removes the abuses of struct fd_set: (a) Since we don't want to allocate a full fd_set the vast majority of the time, we actually, in effect, just allocate a just-big-enough array of unsigned longs and cast it to an fd_set type - so why bother with the fd_set at all? (b) Some places outside of the core fdtable handling code (such as SELinux) want to look inside the array of unsigned longs hidden inside the fd_set struct for more efficient iteration over the entire set. (2) Eliminates the use of FD_*() macros in the kernel completely. (3) Permits the __FD_*() macros to be deleted entirely where not exposed to userspace. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174954.23314.48147.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- fs/select.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/select.c') diff --git a/fs/select.c b/fs/select.c index d33418fdc85..2e7fbe8a092 100644 --- a/fs/select.c +++ b/fs/select.c @@ -348,7 +348,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) set = ~(~0UL << (n & (__NFDBITS-1))); n /= __NFDBITS; fdt = files_fdtable(current->files); - open_fds = fdt->open_fds->fds_bits+n; + open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); -- cgit v1.2.3-70-g09d2 From faf309009e2e18d30c032b7d9479f29b91677c37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 17:24:20 -0800 Subject: sys_poll: fix incorrect type for 'timeout' parameter The 'poll()' system call timeout parameter is supposed to be 'int', not 'long'. Now, the reason this matters is that right now 32-bit compat mode is broken on at least x86-64, because the 32-bit code just calls 'sys_poll()' directly on x86-64, and the 32-bit argument will have been zero-extended, turning a signed 'int' into a large unsigned 'long' value. We could just introduce a 'compat_sys_poll()' function for this, and that may eventually be what we have to do, but since the actual standard poll() semantics is *supposed* to be 'int', and since at least on x86-64 glibc sign-extends the argument before invocing the system call (so nobody can actually use a 64-bit timeout value in user space _anyway_, even in 64-bit binaries), the simpler solution would seem to be to just fix the definition of the system call to match what it should have been from the very start. If it turns out that somebody somehow circumvents the user-level libc 64-bit sign extension and actually uses a large unsigned 64-bit timeout despite that not being how poll() is supposed to work, we will need to do the compat_sys_poll() approach. Reported-by: Thomas Meyer Acked-by: Eric Dumazet Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_wrapper.S | 2 +- fs/select.c | 2 +- include/linux/syscalls.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/select.c') diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 18c51df9fe0..ff605a39cf4 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -662,7 +662,7 @@ ENTRY(sys32_getresuid16_wrapper) ENTRY(sys32_poll_wrapper) llgtr %r2,%r2 # struct pollfd * llgfr %r3,%r3 # unsigned int - lgfr %r4,%r4 # long + lgfr %r4,%r4 # int jg sys_poll # branch to system call ENTRY(sys32_setresgid16_wrapper) diff --git a/fs/select.c b/fs/select.c index d33418fdc85..e782258d0de 100644 --- a/fs/select.c +++ b/fs/select.c @@ -912,7 +912,7 @@ static long do_restart_poll(struct restart_block *restart_block) } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, - long, timeout_msecs) + int, timeout_msecs) { struct timespec end_time, *to = NULL; int ret; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 515669fa3c1..8ec1153ff57 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,7 +624,7 @@ asmlinkage long sys_socketpair(int, int, int, int __user *); asmlinkage long sys_socketcall(int call, unsigned long __user *args); asmlinkage long sys_listen(int, int); asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, - long timeout); + int timeout); asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp); asmlinkage long sys_old_select(struct sel_arg_struct __user *arg); -- cgit v1.2.3-70-g09d2 From 630d9c47274aa89bfa77fe6556d7818bdcb12992 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 16 Nov 2011 23:57:37 -0500 Subject: fs: reduce the use of module.h wherever possible For files only using THIS_MODULE and/or EXPORT_SYMBOL, map them onto including export.h -- or if the file isn't even using those, then just delete the include. Fix up any implicit include dependencies that were being masked by module.h along the way. Signed-off-by: Paul Gortmaker --- fs/aio.c | 2 +- fs/attr.c | 2 +- fs/bad_inode.c | 2 +- fs/binfmt_flat.c | 2 +- fs/bio.c | 2 +- fs/buffer.c | 2 +- fs/compat.c | 1 - fs/compat_ioctl.c | 1 - fs/dcache.c | 2 +- fs/dcookies.c | 2 +- fs/eventfd.c | 2 +- fs/file.c | 2 +- fs/fs-writeback.c | 2 +- fs/fs_struct.c | 2 +- fs/inode.c | 2 +- fs/ioctl.c | 2 +- fs/libfs.c | 2 +- fs/mpage.c | 2 +- fs/namei.c | 2 +- fs/posix_acl.c | 2 +- fs/read_write.c | 2 +- fs/readdir.c | 2 +- fs/select.c | 2 +- fs/seq_file.c | 2 +- fs/splice.c | 2 +- fs/stack.c | 2 +- fs/stat.c | 2 +- fs/statfs.c | 2 +- fs/super.c | 2 +- fs/sync.c | 2 +- fs/xattr.c | 2 +- fs/xattr_acl.c | 2 +- 32 files changed, 30 insertions(+), 32 deletions(-) (limited to 'fs/select.c') diff --git a/fs/aio.c b/fs/aio.c index 969beb0e223..4b5e06390db 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/attr.c b/fs/attr.c index 95053ad8abc..73f69a6ce9e 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -5,7 +5,7 @@ * changes by Thomas Schoebel-Theuer */ -#include +#include #include #include #include diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 22e9a78872f..37268c5bb98 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 1bffbe0ed77..3e27232e357 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -15,7 +15,7 @@ * JAN/99 -- coded full program relocation (gerg@snapgear.com) */ -#include +#include #include #include #include diff --git a/fs/bio.c b/fs/bio.c index b980ecde026..e453924036e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include /* for struct sg_iovec */ diff --git a/fs/buffer.c b/fs/buffer.c index 1a30db77af3..70e2017edd7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/compat.c b/fs/compat.c index 07880bae28a..14483a715bb 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a26bea10e81..4d3eec7418e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index fe19ac13f75..303ebd98bc8 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/dcookies.c b/fs/dcookies.c index dda0dc702d1..17c77996782 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -13,7 +13,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/eventfd.c b/fs/eventfd.c index d9a59177391..dba15fecf23 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/file.c b/fs/file.c index 4c6992d8f3b..3c426de7203 100644 --- a/fs/file.c +++ b/fs/file.c @@ -6,7 +6,7 @@ * Manage the dynamic fd arrays in the process files_struct. */ -#include +#include #include #include #include diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5b4a9362d5a..92fcb19e410 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,7 +14,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 78b519c1353..a2d1db2ea98 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/fs/inode.c b/fs/inode.c index d3ebdbe723d..cd8cffcb75f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ioctl.c b/fs/ioctl.c index 066836e8184..29167bebe87 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/libfs.c b/fs/libfs.c index 5b2dbb3ba4f..001e25be4b6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -3,7 +3,7 @@ * Library for filesystems writers. */ -#include +#include #include #include #include diff --git a/fs/mpage.c b/fs/mpage.c index 643e9f55ef2..0face1c4d4c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -13,7 +13,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/namei.c b/fs/namei.c index a780ea515c4..fa549f27f01 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/posix_acl.c b/fs/posix_acl.c index cea4623f1ed..5e325a42e33 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include diff --git a/fs/read_write.c b/fs/read_write.c index 5ad4248b0cd..ffc99d22e0a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/readdir.c b/fs/readdir.c index 356f71528ad..cc0a8227cdd 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/select.c b/fs/select.c index e782258d0de..de668aa9a09 100644 --- a/fs/select.c +++ b/fs/select.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include /* for STICKY_TIMEOUTS */ diff --git a/fs/seq_file.c b/fs/seq_file.c index 4023d6be939..9f73c6b4436 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include diff --git a/fs/splice.c b/fs/splice.c index 1ec0493266b..66f4ee013bc 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/stack.c b/fs/stack.c index 9c11519245a..5b5388250e2 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/fs/stat.c b/fs/stat.c index 8806b8997d2..c9dfa296b0c 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -4,7 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ -#include +#include #include #include #include diff --git a/fs/statfs.c b/fs/statfs.c index 2aa6a22e0be..43e6b6fe4e8 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/fs/super.c b/fs/super.c index 6277ec6cb60..52bfd251b75 100644 --- a/fs/super.c +++ b/fs/super.c @@ -20,7 +20,7 @@ * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ -#include +#include #include #include #include diff --git a/fs/sync.c b/fs/sync.c index f3501ef3923..0e8db939d96 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/xattr.c b/fs/xattr.c index 82f43376c7c..d6dfd247bb2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index 8d5a506c82e..69d06b07b16 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -5,7 +5,7 @@ * Copyright (C) 2001 by Andreas Gruenbacher, */ -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 626cf236608505d376e4799adb4f7eb00a8594af Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 23 Mar 2012 15:02:27 -0700 Subject: poll: add poll_requested_events() and poll_does_not_wait() functions In some cases the poll() implementation in a driver has to do different things depending on the events the caller wants to poll for. An example is when a driver needs to start a DMA engine if the caller polls for POLLIN, but doesn't want to do that if POLLIN is not requested but instead only POLLOUT or POLLPRI is requested. This is something that can happen in the video4linux subsystem among others. Unfortunately, the current epoll/poll/select implementation doesn't provide that information reliably. The poll_table_struct does have it: it has a key field with the event mask. But once a poll() call matches one or more bits of that mask any following poll() calls are passed a NULL poll_table pointer. Also, the eventpoll implementation always left the key field at ~0 instead of using the requested events mask. This was changed in eventpoll.c so the key field now contains the actual events that should be polled for as set by the caller. The solution to the NULL poll_table pointer is to set the qproc field to NULL in poll_table once poll() matches the events, not the poll_table pointer itself. That way drivers can obtain the mask through a new poll_requested_events inline. The poll_table_struct can still be NULL since some kernel code calls it internally (netfs_state_poll() in ./drivers/staging/pohmelfs/netfs.h). In that case poll_requested_events() returns ~0 (i.e. all events). Very rarely drivers might want to know whether poll_wait will actually wait. If another earlier file descriptor in the set already matched the events the caller wanted to wait for, then the kernel will return from the select() call without waiting. This might be useful information in order to avoid doing expensive work. A new helper function poll_does_not_wait() is added that drivers can use to detect this situation. This is now used in sock_poll_wait() in include/net/sock.h. This was the only place in the kernel that needed this information. Drivers should no longer access any of the poll_table internals, but use the poll_requested_events() and poll_does_not_wait() access functions instead. In order to enforce that the poll_table fields are now prepended with an underscore and a comment was added warning against using them directly. This required a change in unix_dgram_poll() in unix/af_unix.c which used the key field to get the requested events. It's been replaced by a call to poll_requested_events(). For qproc it was especially important to change its name since the behavior of that field changes with this patch since this function pointer can now be NULL when that wasn't possible in the past. Any driver accessing the qproc or key fields directly will now fail to compile. Some notes regarding the correctness of this patch: the driver's poll() function is called with a 'struct poll_table_struct *wait' argument. This pointer may or may not be NULL, drivers can never rely on it being one or the other as that depends on whether or not an earlier file descriptor in the select()'s fdset matched the requested events. There are only three things a driver can do with the wait argument: 1) obtain the key field: events = wait ? wait->key : ~0; This will still work although it should be replaced with the new poll_requested_events() function (which does exactly the same). This will now even work better, since wait is no longer set to NULL unnecessarily. 2) use the qproc callback. This could be deadly since qproc can now be NULL. Renaming qproc should prevent this from happening. There are no kernel drivers that actually access this callback directly, BTW. 3) test whether wait == NULL to determine whether poll would return without waiting. This is no longer sufficient as the correct test is now wait == NULL || wait->_qproc == NULL. However, the worst that can happen here is a slight performance hit in the case where wait != NULL and wait->_qproc == NULL. In that case the driver will assume that poll_wait() will actually add the fd to the set of waiting file descriptors. Of course, poll_wait() will not do that since it tests for wait->_qproc. This will not break anything, though. There is only one place in the whole kernel where this happens (sock_poll_wait() in include/net/sock.h) and that code will be replaced by a call to poll_does_not_wait() in the next patch. Note that even if wait->_qproc != NULL drivers cannot rely on poll_wait() actually waiting. The next file descriptor from the set might match the event mask and thus any possible waits will never happen. Signed-off-by: Hans Verkuil Reviewed-by: Jonathan Corbet Reviewed-by: Al Viro Cc: Davide Libenzi Signed-off-by: Hans de Goede Cc: Mauro Carvalho Chehab Cc: David Miller Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 18 +++++++++++++++--- fs/select.c | 40 ++++++++++++++++++---------------------- include/linux/poll.h | 37 +++++++++++++++++++++++++++++++------ include/net/sock.h | 2 +- net/unix/af_unix.c | 2 +- 5 files changed, 66 insertions(+), 33 deletions(-) (limited to 'fs/select.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4d9d3a45e35..ca300071e79 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -699,9 +699,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) { struct epitem *epi, *tmp; + poll_table pt; + init_poll_funcptr(&pt, NULL); list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + pt._key = epi->event.events; + if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & epi->event.events) return POLLIN | POLLRDNORM; else { @@ -1097,6 +1100,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); + epq.pt._key = event->events; /* * Attach the item to the poll hooks and get current event bits. @@ -1191,6 +1195,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even { int pwake = 0; unsigned int revents; + poll_table pt; + + init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); @@ -1198,13 +1205,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; + pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); + revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); /* * If the item is "hot" and it is not registered inside the ready @@ -1239,6 +1247,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; + poll_table pt; + + init_poll_funcptr(&pt, NULL); /* * We can loop without lock because we are passed a task private list. @@ -1251,7 +1262,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + pt._key = epi->event.events; + revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & epi->event.events; /* diff --git a/fs/select.c b/fs/select.c index e782258d0de..ecfd0b125ba 100644 --- a/fs/select.c +++ b/fs/select.c @@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - entry->key = p->key; + entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -386,13 +386,11 @@ get_max: static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit) { - if (wait) { - wait->key = POLLEX_SET; - if (in & bit) - wait->key |= POLLIN_SET; - if (out & bit) - wait->key |= POLLOUT_SET; - } + wait->_key = POLLEX_SET; + if (in & bit) + wait->_key |= POLLIN_SET; + if (out & bit) + wait->_key |= POLLOUT_SET; } int do_select(int n, fd_set_bits *fds, struct timespec *end_time) @@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { - wait = NULL; + wait->_qproc = NULL; timed_out = 1; } @@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; - wait = NULL; + wait->_qproc = NULL; } } } @@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) *rexp = res_ex; cond_resched(); } - wait = NULL; + wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { @@ -720,7 +718,7 @@ struct poll_list { * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, - * if non-NULL. + * if pwait->_qproc is non-NULL. */ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) { @@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) if (file != NULL) { mask = DEFAULT_POLLMASK; if (file->f_op && file->f_op->poll) { - if (pwait) - pwait->key = pollfd->events | - POLLERR | POLLHUP; + pwait->_key = pollfd->events|POLLERR|POLLHUP; mask = file->f_op->poll(file, pwait); } /* Mask out unneeded events. */ @@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { - pt = NULL; + pt->_qproc = NULL; timed_out = 1; } @@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it - * and kill the poll_table, so we don't + * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt)) { count++; - pt = NULL; + pt->_qproc = NULL; } } } /* * All waiters have already been registered, so don't provide - * a poll_table to them on the next loop iteration. + * a poll_table->_qproc to them on the next loop iteration. */ - pt = NULL; + pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) diff --git a/include/linux/poll.h b/include/linux/poll.h index cf40010ce0c..48fe8bc398d 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -32,21 +32,46 @@ struct poll_table_struct; */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); +/* + * Do not touch the structure directly, use the access functions + * poll_does_not_wait() and poll_requested_events() instead. + */ typedef struct poll_table_struct { - poll_queue_proc qproc; - unsigned long key; + poll_queue_proc _qproc; + unsigned long _key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && wait_address) - p->qproc(filp, wait_address, p); + if (p && p->_qproc && wait_address) + p->_qproc(filp, wait_address, p); +} + +/* + * Return true if it is guaranteed that poll will not wait. This is the case + * if the poll() of another file descriptor in the set got an event, so there + * is no need for waiting. + */ +static inline bool poll_does_not_wait(const poll_table *p) +{ + return p == NULL || p->_qproc == NULL; +} + +/* + * Return the set of events that the application wants to poll for. + * This is useful for drivers that need to know whether a DMA transfer has + * to be started implicitly on poll(). You typically only want to do that + * if the application is actually polling for POLLIN and/or POLLOUT. + */ +static inline unsigned long poll_requested_events(const poll_table *p) +{ + return p ? p->_key : ~0UL; } static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { - pt->qproc = qproc; - pt->key = ~0UL; /* all events enabled */ + pt->_qproc = qproc; + pt->_key = ~0UL; /* all events enabled */ } struct poll_table_entry { diff --git a/include/net/sock.h b/include/net/sock.h index 04bc0b30e9e..a6ba1f8871f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1854,7 +1854,7 @@ static inline bool wq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { - if (p && wait_address) { + if (!poll_does_not_wait(p) && wait_address) { poll_wait(filp, wait_address, p); /* * We need to be sure we are in sync with the diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb4277c3318..d510353ef43 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2206,7 +2206,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, } /* No write status requested, avoid expensive OUT tests. */ - if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT))) + if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT))) return mask; writable = unix_writable(sk); -- cgit v1.2.3-70-g09d2