From c66ab6fa705e1b2887a6d9246b798bdc526839e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Jun 2007 21:17:17 +0200 Subject: splice: abstract out actor data For direct splicing (or private splicing), the output may not be a file. So abstract out the handling into a specified actor function and put the data in the splice_desc structure earlier, so we can build on top of that. This is the first step in better splice handling for drivers, and also for implementing vmsplice _to_ user memory. Signed-off-by: Jens Axboe --- include/linux/pipe_fs_i.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index c8884f97122..883ba9b78d3 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -94,13 +94,15 @@ struct splice_desc { typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, struct splice_desc *); +typedef int (splice_direct_actor)(struct pipe_inode_info *, + struct splice_desc *); extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int, splice_actor *); - -extern ssize_t __splice_from_pipe(struct pipe_inode_info *, struct file *, - loff_t *, size_t, unsigned int, - splice_actor *); +extern ssize_t __splice_from_pipe(struct pipe_inode_info *, + struct splice_desc *, splice_actor *); +extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, + splice_direct_actor *); #endif -- cgit v1.2.3-70-g09d2 From 6a14b90bb6bc7cd83e2a444bf457a2ea645cbfe7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Jun 2007 13:08:55 +0200 Subject: vmsplice: add vmsplice-to-user support A bit of a cheat, it actually just copies the data to userspace. But this makes the interface nice and symmetric and enables people to build on splice, with room for future improvement in performance. Signed-off-by: Jens Axboe --- fs/ocfs2/file.c | 2 +- fs/splice.c | 178 ++++++++++++++++++++++++++++++++++++++-------- include/linux/pipe_fs_i.h | 8 ++- 3 files changed, 158 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 93565c03d31..222f108ee45 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1640,7 +1640,7 @@ static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); diff --git a/fs/splice.c b/fs/splice.c index 68f6328236a..13846f723d7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -528,7 +528,7 @@ EXPORT_SYMBOL(generic_file_splice_read); static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; loff_t pos = sd->pos; int ret, more; @@ -566,7 +566,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; struct address_space *mapping = file->f_mapping; unsigned int offset, this_len; struct page *page; @@ -769,7 +769,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; /* @@ -807,7 +807,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; ssize_t ret; int err; @@ -1087,7 +1087,7 @@ EXPORT_SYMBOL(splice_direct_to_actor); static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { - struct file *file = sd->file; + struct file *file = sd->u.file; return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); } @@ -1100,7 +1100,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .total_len = len, .flags = flags, .pos = *ppos, - .file = out, + .u.file = out, }; size_t ret; @@ -1289,28 +1289,131 @@ static int get_iovec_page_array(const struct iovec __user *iov, return error; } +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + char *src; + int ret; + + ret = buf->ops->pin(pipe, buf); + if (unlikely(ret)) + return ret; + + /* + * See if we can use the atomic maps, by prefaulting in the + * pages and doing an atomic copy + */ + if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { + src = buf->ops->map(pipe, buf, 1); + ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, + sd->len); + buf->ops->unmap(pipe, buf, src); + if (!ret) { + ret = sd->len; + goto out; + } + } + + /* + * No dice, use slow non-atomic map and copy + */ + src = buf->ops->map(pipe, buf, 0); + + ret = sd->len; + if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) + ret = -EFAULT; + +out: + if (ret > 0) + sd->u.userptr += ret; + buf->ops->unmap(pipe, buf, src); + return ret; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + ssize_t size; + int error; + long ret; + + pipe = pipe_info(file->f_path.dentry->d_inode); + if (!pipe) + return -EBADF; + + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); + + error = ret = 0; + while (nr_segs) { + void __user *base; + size_t len; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + if (unlikely(!base)) { + error = -EFAULT; + break; + } + + sd.len = 0; + sd.total_len = len; + sd.flags = flags; + sd.u.userptr = base; + sd.pos = 0; + + size = __splice_from_pipe(pipe, &sd, pipe_to_user); + if (size < 0) { + if (!ret) + ret = size; + + break; + } + + ret += size; + + if (size < len) + break; + + nr_segs--; + iov++; + } + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (!ret) + ret = error; + + return ret; +} + /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. - * - * Note that vmsplice only supports splicing _from_ user memory to a pipe, - * not the other way around. Splicing from user memory is a simple operation - * that can be supported without any funky alignment restrictions or nasty - * vm tricks. We simply map in the user memory and fill them into a pipe. - * The reverse isn't quite as easy, though. There are two possible solutions - * for that: - * - * - memcpy() the data internally, at which point we might as well just - * do a regular read() on the buffer anyway. - * - Lots of nasty vm tricks, that are neither fast nor flexible (it - * has restriction limitations on both ends of the pipe). - * - * Alas, it isn't here. - * */ -static long do_vmsplice(struct file *file, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags) +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct page *pages[PIPE_BUFFERS]; @@ -1325,10 +1428,6 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - if (unlikely(nr_segs > UIO_MAXIOV)) - return -EINVAL; - else if (unlikely(!nr_segs)) - return 0; spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, flags & SPLICE_F_GIFT); @@ -1338,6 +1437,22 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, return splice_to_pipe(pipe, &spd); } +/* + * Note that vmsplice only really supports true splicing _from_ user memory + * to a pipe, not the other way around. Splicing from user memory is a simple + * operation that can be supported without any funky alignment restrictions + * or nasty vm tricks. We simply map in the user memory and fill them into + * a pipe. The reverse isn't quite as easy, though. There are two possible + * solutions for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Currently we punt and implement it as a normal copy, see pipe_to_user(). + * + */ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { @@ -1345,11 +1460,18 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, long error; int fput; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + error = -EBADF; file = fget_light(fd, &fput); if (file) { if (file->f_mode & FMODE_WRITE) - error = do_vmsplice(file, iov, nr_segs, flags); + error = vmsplice_to_pipe(file, iov, nr_segs, flags); + else if (file->f_mode & FMODE_READ) + error = vmsplice_to_user(file, iov, nr_segs, flags); fput_light(file, fput); } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 883ba9b78d3..6e7bfc12542 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -88,7 +88,13 @@ int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); struct splice_desc { unsigned int len, total_len; /* current and remaining length */ unsigned int flags; /* splice flags */ - struct file *file; /* file to read/write */ + /* + * actor() private data + */ + union { + void __user *userptr; /* memory to write to */ + struct file *file; /* file to read/write */ + } u; loff_t pos; /* file position */ }; -- cgit v1.2.3-70-g09d2 From 0452a4e5d021900b07ebdeecb9ed03b49f164f3f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Jun 2007 11:55:49 +0200 Subject: sendfile: kill generic_file_sendfile() It's no longer used. Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 - mm/filemap.c | 20 -------------------- 2 files changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a41f4cab14..87c1d3e9d6c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1762,7 +1762,6 @@ extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, unsigned long, loff_t, loff_t *, size_t, ssize_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); -extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *); extern void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *, struct file *, loff_t *, read_descriptor_t *, read_actor_t); diff --git a/mm/filemap.c b/mm/filemap.c index d1d9814f99d..c6ebd9f912a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1245,26 +1245,6 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o return written; } -ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - read_descriptor_t desc; - - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - do_generic_file_read(in_file, ppos, &desc, actor); - if (desc.written) - return desc.written; - return desc.error; -} -EXPORT_SYMBOL(generic_file_sendfile); - static ssize_t do_readahead(struct address_space *mapping, struct file *filp, unsigned long index, unsigned long nr) -- cgit v1.2.3-70-g09d2 From 130610d6f681c5d970340897f5db000d04a7ef78 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Jun 2007 21:20:00 +0200 Subject: splice: add void cookie to the actor data We need that for passing driver private info. Signed-off-by: Jens Axboe --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 6e7bfc12542..f277a9cdb50 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -94,6 +94,7 @@ struct splice_desc { union { void __user *userptr; /* memory to write to */ struct file *file; /* file to read/write */ + void *data; /* cookie */ } u; loff_t pos; /* file position */ }; -- cgit v1.2.3-70-g09d2 From cf8208d0eabd1d5d2625ec02a175a294c3f30d36 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Jun 2007 21:22:14 +0200 Subject: sendfile: convert nfsd to splice_direct_to_actor() Signed-off-by: Jens Axboe --- fs/nfsd/vfs.c | 47 ++++++++++++++++++++++++++------------- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/svc.c | 2 +- 4 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7e6aa245b5d..15471a9efe0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -801,26 +801,32 @@ found: } /* - * Grab and keep cached pages assosiated with a file in the svc_rqst - * so that they can be passed to the netowork sendmsg/sendpage routines - * directrly. They will be released after the sending has completed. + * Grab and keep cached pages associated with a file in the svc_rqst + * so that they can be passed to the network sendmsg/sendpage routines + * directly. They will be released after the sending has completed. */ static int -nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset , unsigned long size) +nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { - unsigned long count = desc->count; - struct svc_rqst *rqstp = desc->arg.data; + struct svc_rqst *rqstp = sd->u.data; struct page **pp = rqstp->rq_respages + rqstp->rq_resused; + struct page *page = buf->page; + size_t size; + int ret; + + ret = buf->ops->pin(pipe, buf); + if (unlikely(ret)) + return ret; - if (size > count) - size = count; + size = sd->len; if (rqstp->rq_res.page_len == 0) { get_page(page); put_page(*pp); *pp = page; rqstp->rq_resused++; - rqstp->rq_res.page_base = offset; + rqstp->rq_res.page_base = buf->offset; rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { get_page(page); @@ -832,11 +838,15 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset } else rqstp->rq_res.page_len += size; - desc->count = count - size; - desc->written += size; return size; } +static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + return __splice_from_pipe(pipe, sd, nfsd_splice_actor); +} + static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) @@ -861,10 +871,15 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (ra && ra->p_set) file->f_ra = ra->p_ra; - if (file->f_op->sendfile && rqstp->rq_sendfile_ok) { - rqstp->rq_resused = 1; - host_err = file->f_op->sendfile(file, &offset, *count, - nfsd_read_actor, rqstp); + if (file->f_op->splice_read && rqstp->rq_splice_ok) { + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); } else { oldfs = get_fs(); set_fs(KERNEL_DS); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4a7ae8ab6eb..129d50f2225 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -253,7 +253,7 @@ struct svc_rqst { * determine what device number * to report (real or virtual) */ - int rq_sendfile_ok; /* turned off in gss privacy + int rq_splice_ok; /* turned off in gss privacy * to prevent encrypting page * cache pages */ wait_queue_head_t rq_wait; /* synchronization */ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 099a983797d..c094583386f 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -853,7 +853,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs u32 priv_len, maj_stat; int pad, saved_len, remaining_len, offset; - rqstp->rq_sendfile_ok = 0; + rqstp->rq_splice_ok = 0; priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e673ef99390..55ea6df069d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -814,7 +814,7 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_base = NULL; rqstp->rq_res.tail[0].iov_len = 0; /* Will be turned off only in gss privacy case: */ - rqstp->rq_sendfile_ok = 1; + rqstp->rq_splice_ok = 1; /* tcp needs a space for the record length... */ if (rqstp->rq_prot == IPPROTO_TCP) svc_putnl(resv, 0); -- cgit v1.2.3-70-g09d2 From d6b29d7cee064f28ca097e906de7453541351095 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Jun 2007 09:59:47 +0200 Subject: splice: divorce the splice structure/function definitions from the pipe header We need to move even more stuff into the header so that folks can use the splice_to_pipe() implementation instead of open-coding a lot of pipe knowledge (see relay implementation), so move to our own header file finally. Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- drivers/char/mem.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/ocfs2/file.c | 2 +- fs/read_write.c | 2 +- fs/splice.c | 26 ++++------------- include/linux/pipe_fs_i.h | 41 --------------------------- include/linux/splice.h | 72 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/relay.c | 2 +- 9 files changed, 83 insertions(+), 68 deletions(-) create mode 100644 include/linux/splice.h (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 87d84e7d543..08f53df03e8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -74,7 +74,7 @@ #include #include #include -#include +#include #include diff --git a/drivers/char/mem.c b/drivers/char/mem.c index cc9a9d0df97..d2e4cfd79f2 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 15471a9efe0..8176fbf5c00 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ed1ffa70cc3..44c2e2afa46 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index 47da8a4a0fb..2527cf035b0 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "read_write.h" #include diff --git a/fs/splice.c b/fs/splice.c index 13846f723d7..bea9f1581ca 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -29,22 +29,6 @@ #include #include -struct partial_page { - unsigned int offset; - unsigned int len; -}; - -/* - * Passed to splice_to_pipe - */ -struct splice_pipe_desc { - struct page **pages; /* page map */ - struct partial_page *partial; /* pages[] may not be contig */ - int nr_pages; /* number of pages in map */ - unsigned int flags; /* splice flags */ - const struct pipe_buf_operations *ops;/* ops associated with output pipe */ -}; - /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the @@ -170,11 +154,11 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { }; /* - * Pipe output worker. This sets up our pipe format with the page cache - * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). + * Pipe output worker. This fills a pipe with the information contained + * from splice_pipe_desc(). */ -static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index f277a9cdb50..7ba228d52f5 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -71,45 +71,4 @@ void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); -/* - * splice is tied to pipes as a transport (at least for now), so we'll just - * add the splice flags here. - */ -#define SPLICE_F_MOVE (0x01) /* move pages instead of copying */ -#define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */ - /* we may still block on the fd we splice */ - /* from/to, of course */ -#define SPLICE_F_MORE (0x04) /* expect more data */ -#define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ - -/* - * Passed to the actors - */ -struct splice_desc { - unsigned int len, total_len; /* current and remaining length */ - unsigned int flags; /* splice flags */ - /* - * actor() private data - */ - union { - void __user *userptr; /* memory to write to */ - struct file *file; /* file to read/write */ - void *data; /* cookie */ - } u; - loff_t pos; /* file position */ -}; - -typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); -typedef int (splice_direct_actor)(struct pipe_inode_info *, - struct splice_desc *); - -extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, - loff_t *, size_t, unsigned int, - splice_actor *); -extern ssize_t __splice_from_pipe(struct pipe_inode_info *, - struct splice_desc *, splice_actor *); -extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, - splice_direct_actor *); - #endif diff --git a/include/linux/splice.h b/include/linux/splice.h new file mode 100644 index 00000000000..f8cc97f71cd --- /dev/null +++ b/include/linux/splice.h @@ -0,0 +1,72 @@ +/* + * Function declerations and data structures related to the splice + * implementation. + * + * Copyright (C) 2007 Jens Axboe + * + */ +#ifndef SPLICE_H +#define SPLICE_H + +#include + +/* + * splice is tied to pipes as a transport (at least for now), so we'll just + * add the splice flags here. + */ +#define SPLICE_F_MOVE (0x01) /* move pages instead of copying */ +#define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */ + /* we may still block on the fd we splice */ + /* from/to, of course */ +#define SPLICE_F_MORE (0x04) /* expect more data */ +#define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ + +/* + * Passed to the actors + */ +struct splice_desc { + unsigned int len, total_len; /* current and remaining length */ + unsigned int flags; /* splice flags */ + /* + * actor() private data + */ + union { + void __user *userptr; /* memory to write to */ + struct file *file; /* file to read/write */ + void *data; /* cookie */ + } u; + loff_t pos; /* file position */ +}; + +struct partial_page { + unsigned int offset; + unsigned int len; +}; + +/* + * Passed to splice_to_pipe + */ +struct splice_pipe_desc { + struct page **pages; /* page map */ + struct partial_page *partial; /* pages[] may not be contig */ + int nr_pages; /* number of pages in map */ + unsigned int flags; /* splice flags */ + const struct pipe_buf_operations *ops;/* ops associated with output pipe */ +}; + +typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, + struct splice_desc *); +typedef int (splice_direct_actor)(struct pipe_inode_info *, + struct splice_desc *); + +extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int, + splice_actor *); +extern ssize_t __splice_from_pipe(struct pipe_inode_info *, + struct splice_desc *, splice_actor *); +extern ssize_t splice_to_pipe(struct pipe_inode_info *, + struct splice_pipe_desc *); +extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, + splice_direct_actor *); + +#endif diff --git a/kernel/relay.c b/kernel/relay.c index d1d1920f280..951f29b24e5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include /* list of open channels, for cpu hotplug */ static DEFINE_MUTEX(relay_channels_mutex); -- cgit v1.2.3-70-g09d2 From 497f9625c2bbd6a8525fb2eedb22a382a6a8253c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Jun 2007 12:00:45 +0200 Subject: pipe: allow passing around of ops private pointer relay needs this for proper consumption handling, and the network receive support needs it as well to lookup the sk_buff on pipe release. Signed-off-by: Jens Axboe --- fs/splice.c | 1 + include/linux/pipe_fs_i.h | 1 + include/linux/splice.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index bea9f1581ca..00850e56280 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; if (spd->flags & SPLICE_F_GIFT) buf->flags |= PIPE_BUF_FLAG_GIFT; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 7ba228d52f5..4409167b9eb 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -14,6 +14,7 @@ struct pipe_buffer { unsigned int offset, len; const struct pipe_buf_operations *ops; unsigned int flags; + unsigned long private; }; struct pipe_inode_info { diff --git a/include/linux/splice.h b/include/linux/splice.h index f8cc97f71cd..33e447f98a5 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -41,6 +41,7 @@ struct splice_desc { struct partial_page { unsigned int offset; unsigned int len; + unsigned long private; }; /* -- cgit v1.2.3-70-g09d2 From d054fe3d10cc1f9aec01378c38caa32dffdd0090 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Fri, 15 Jun 2007 08:16:22 +0200 Subject: xip sendfile removal This patch removes xip_file_sendfile, the sendfile implementation for xip without replacement. Those customers that use xip on s390 are not using sendfile() as far as we know, and so far s390 is the only platform this could potentially be used on so far. Having sendfile is not a popular feature for execute in place file systems, however we have a working implementation of splice_read() based on fs/splice.c if anyone asks for it. At this point in time, it does not seem preferable to merge splice_read() for xip because it causes extra maintenence effort due to code duplication and it requires struct page behind the xip memory segment. We'd like to get rid of that in favor of supporting flash based embedded platforms (Monta Vista work) soon. Signed-off-by: Carsten Otte Signed-off-by: Jens Axboe --- fs/ext2/file.c | 1 - include/linux/fs.h | 3 --- mm/filemap_xip.c | 22 ---------------------- 3 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 072a1909b2b..04afeecaaef 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -70,7 +70,6 @@ const struct file_operations ext2_xip_file_operations = { .open = generic_file_open, .release = ext2_release_file, .fsync = ext2_sync_file, - .sendfile = xip_file_sendfile, }; #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 87c1d3e9d6c..894620e9402 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1791,9 +1791,6 @@ extern int nonseekable_open(struct inode * inode, struct file * filp); #ifdef CONFIG_FS_XIP extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); -extern ssize_t xip_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, - void *target); extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); extern ssize_t xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index fa360e566d8..65ffc321f0c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -159,28 +159,6 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) } EXPORT_SYMBOL_GPL(xip_file_read); -ssize_t -xip_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - read_descriptor_t desc; - - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file, - ppos, &desc, actor); - if (desc.written) - return desc.written; - return desc.error; -} -EXPORT_SYMBOL_GPL(xip_file_sendfile); - /* * __xip_unmap is invoked from xip_unmap and * xip_write -- cgit v1.2.3-70-g09d2 From d96e6e71647846e0dab097efd9b8bf3a3a556dca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Jun 2007 12:18:52 +0200 Subject: Remove remnants of sendfile() There are now zero users of .sendfile() in the kernel, so kill it from the file_operations structure and in do_sendfile(). Signed-off-by: Jens Axboe --- fs/read_write.c | 26 +++++++++++--------------- include/linux/fs.h | 3 +-- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index 2527cf035b0..507ddff48a9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -724,8 +724,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, in_inode = in_file->f_path.dentry->d_inode; if (!in_inode) goto fput_in; - if (!in_file->f_op || (!in_file->f_op->sendfile && - !in_file->f_op->splice_read)) + if (!in_file->f_op || !in_file->f_op->splice_read) goto fput_in; retval = -ESPIPE; if (!ppos) @@ -778,21 +777,18 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count = max - pos; } - if (in_file->f_op->splice_read) { - fl = 0; + fl = 0; #if 0 - /* - * We need to debate whether we can enable this or not. The - * man page documents EAGAIN return for the output at least, - * and the application is arguably buggy if it doesn't expect - * EAGAIN on a non-blocking file descriptor. - */ - if (in_file->f_flags & O_NONBLOCK) - fl = SPLICE_F_NONBLOCK; + /* + * We need to debate whether we can enable this or not. The + * man page documents EAGAIN return for the output at least, + * and the application is arguably buggy if it doesn't expect + * EAGAIN on a non-blocking file descriptor. + */ + if (in_file->f_flags & O_NONBLOCK) + fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in_file, ppos, out_file, count, fl); - } else - retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file); + retval = do_splice_direct(in_file, ppos, out_file, count, fl); if (retval > 0) { add_rchar(current, retval); diff --git a/include/linux/fs.h b/include/linux/fs.h index 894620e9402..4f0b3bf5983 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1054,7 +1054,7 @@ struct block_device_operations { }; /* - * "descriptor" for what we're up to with a read for sendfile(). + * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet * have multiple different users of the data that * we read from a file. @@ -1105,7 +1105,6 @@ struct file_operations { int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); -- cgit v1.2.3-70-g09d2 From cac36bb06efe4880234524e117e0e712b10b1f16 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Jun 2007 13:10:48 +0200 Subject: pipe: change the ->pin() operation to ->confirm() The name 'pin' was badly chosen, it doesn't pin a pipe buffer in the most commonly used sense in the kernel. So change the name to 'confirm', after debating this issue with Hugh Dickins a bit. A good return from ->confirm() means that the buffer is really there, and that the contents are good. Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/ocfs2/file.c | 4 ++-- fs/pipe.c | 9 +++++---- fs/splice.c | 14 +++++++------- include/linux/pipe_fs_i.h | 9 +++++---- kernel/relay.c | 2 +- 7 files changed, 22 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 08f53df03e8..4503290da40 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -412,7 +412,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, size_t size; int ret; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8176fbf5c00..8604e35bd48 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -815,7 +815,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, size_t size; int ret; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 44c2e2afa46..4979b667571 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1583,7 +1583,7 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, ssize_t copied = 0; struct ocfs2_splice_write_priv sp; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (ret) goto out; @@ -1604,7 +1604,7 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, * might enter ocfs2_buffered_write_cluster() more * than once, so keep track of our progress here. */ - copied = ocfs2_buffered_write_cluster(sd->file, + copied = ocfs2_buffered_write_cluster(sd->u.file, (loff_t)sd->pos + total, count, ocfs2_map_and_write_splice_data, diff --git a/fs/pipe.c b/fs/pipe.c index 3a89592bdf5..3694af10dd2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -203,7 +203,8 @@ void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) page_cache_get(buf->page); } -int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) +int generic_pipe_buf_confirm(struct pipe_inode_info *info, + struct pipe_buffer *buf) { return 0; } @@ -212,7 +213,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = generic_pipe_buf_pin, + .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -252,7 +253,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, if (chars > total_len) chars = total_len; - error = ops->pin(pipe, buf); + error = ops->confirm(pipe, buf); if (error) { if (!ret) error = ret; @@ -373,7 +374,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, int error, atomic = 1; void *addr; - error = ops->pin(pipe, buf); + error = ops->confirm(pipe, buf); if (error) goto out; diff --git a/fs/splice.c b/fs/splice.c index d257d666358..c804121601b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -85,8 +85,8 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; int err; @@ -127,7 +127,7 @@ static const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = page_cache_pipe_buf_pin, + .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -147,7 +147,7 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = generic_pipe_buf_pin, + .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, .get = generic_pipe_buf_get, @@ -525,7 +525,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, loff_t pos = sd->pos; int ret, more; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (!ret) { more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; @@ -569,7 +569,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * make sure the data in this buffer is uptodate */ - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; @@ -1341,7 +1341,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, char *src; int ret; - ret = buf->ops->pin(pipe, buf); + ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) return ret; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 4409167b9eb..cc09fe89bf0 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -35,20 +35,21 @@ struct pipe_inode_info { /* * Note on the nesting of these functions: * - * ->pin() + * ->confirm() * ->steal() * ... * ->map() * ... * ->unmap() * - * That is, ->map() must be called on a pinned buffer, same goes for ->steal(). + * That is, ->map() must be called on a confirmed buffer, + * same goes for ->steal(). */ struct pipe_buf_operations { int can_merge; void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); - int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); + int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); void (*release)(struct pipe_inode_info *, struct pipe_buffer *); int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); void (*get)(struct pipe_inode_info *, struct pipe_buffer *); @@ -69,7 +70,7 @@ void __free_pipe_info(struct pipe_inode_info *); void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); -int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); #endif diff --git a/kernel/relay.c b/kernel/relay.c index dd3bc5b6903..3b299fb3855 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1055,7 +1055,7 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, - .pin = generic_pipe_buf_pin, + .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, -- cgit v1.2.3-70-g09d2 From 0845718dafea3e16041d270c256e8516acf4e13d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Jun 2007 20:51:32 +0200 Subject: pipe: add documentation and comments As per Andrew Mortons request, here's a set of documentation for the generic pipe_buf_operations hooks, the pipe, and pipe_buffer structures. Signed-off-by: Jens Axboe --- fs/pipe.c | 61 ++++++++++++++++++++++++++++++++++++- fs/splice.c | 4 +++ include/linux/pipe_fs_i.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 140 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 3694af10dd2..d007830d9c8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -164,6 +164,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, page_cache_release(page); } +/** + * generic_pipe_buf_map - virtually map a pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be mapped + * @atomic: whether to use an atomic map + * + * Description: + * This function returns a kernel virtual address mapping for the + * passed in @pipe_buffer. If @atomic is set, an atomic map is provided + * and the caller has to be careful not to fault before calling + * the unmap function. + * + * Note that this function occupies KM_USER0 if @atomic != 0. + */ void *generic_pipe_buf_map(struct pipe_inode_info *pipe, struct pipe_buffer *buf, int atomic) { @@ -175,6 +189,15 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe, return kmap(buf->page); } +/** + * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer that should be unmapped + * @map_data: the data that the mapping function returned + * + * Description: + * This function undoes the mapping that ->map() provided. + */ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *map_data) { @@ -185,11 +208,28 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, kunmap(buf->page); } +/** + * generic_pipe_buf_steal - attempt to take ownership of a @pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to attempt to steal + * + * Description: + * This function attempts to steal the @struct page attached to + * @buf. If successful, this function returns 0 and returns with + * the page locked. The caller may then reuse the page for whatever + * he wishes, the typical use is insertion into a different file + * page cache. + */ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; + /* + * A reference of one is golden, that means that the owner of this + * page is the only one holding a reference to it. lock the page + * and return OK. + */ if (page_count(page) == 1) { lock_page(page); return 0; @@ -198,11 +238,30 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, return 1; } -void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) +/** + * generic_pipe_buf_get - get a reference to a @struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to get a reference to + * + * Description: + * This function grabs an extra reference to @buf. It's used in + * in the tee() system call, when we duplicate the buffers in one + * pipe into another. + */ +void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_get(buf->page); } +/** + * generic_pipe_buf_confirm - verify contents of the pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to confirm + * + * Description: + * This function does nothing, because the generic pipe code uses + * pages that are always good when inserted into the pipe. + */ int generic_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf) { diff --git a/fs/splice.c b/fs/splice.c index c804121601b..ed2ce995475 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -85,6 +85,10 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, buf->flags &= ~PIPE_BUF_FLAG_LRU; } +/* + * Check whether the contents of buf is OK to access. Since the content + * is a page cache page, IO may be in flight. + */ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index cc09fe89bf0..8e4120285f7 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -9,6 +9,15 @@ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ +/** + * struct pipe_buffer - a linux kernel pipe buffer + * @page: the page containing the data for the pipe buffer + * @offset: offset of data inside the @page + * @len: length of data inside the @page + * @ops: operations associated with this buffer. See @pipe_buf_operations. + * @flags: pipe buffer flags. See above. + * @private: private data owned by the ops. + **/ struct pipe_buffer { struct page *page; unsigned int offset, len; @@ -17,6 +26,22 @@ struct pipe_buffer { unsigned long private; }; +/** + * struct pipe_inode_info - a linux kernel pipe + * @wait: reader/writer wait point in case of empty/full pipe + * @nrbufs: the number of non-empty pipe buffers in this pipe + * @curbuf: the current pipe buffer entry + * @tmp_page: cached released page + * @readers: number of current readers of this pipe + * @writers: number of current writers of this pipe + * @waiting_writers: number of writers blocked waiting for room + * @r_counter: reader counter + * @w_counter: writer counter + * @fasync_readers: reader side fasync + * @fasync_writers: writer side fasync + * @inode: inode this pipe is attached to + * @bufs: the circular array of pipe buffers + **/ struct pipe_inode_info { wait_queue_head_t wait; unsigned int nrbufs, curbuf; @@ -43,15 +68,65 @@ struct pipe_inode_info { * ->unmap() * * That is, ->map() must be called on a confirmed buffer, - * same goes for ->steal(). + * same goes for ->steal(). See below for the meaning of each + * operation. Also see kerneldoc in fs/pipe.c for the pipe + * and generic variants of these hooks. */ struct pipe_buf_operations { + /* + * This is set to 1, if the generic pipe read/write may coalesce + * data into an existing buffer. If this is set to 0, a new pipe + * page segment is always used for new data. + */ int can_merge; + + /* + * ->map() returns a virtual address mapping of the pipe buffer. + * The last integer flag reflects whether this should be an atomic + * mapping or not. The atomic map is faster, however you can't take + * page faults before calling ->unmap() again. So if you need to eg + * access user data through copy_to/from_user(), then you must get + * a non-atomic map. ->map() uses the KM_USER0 atomic slot for + * atomic maps, so you can't map more than one pipe_buffer at once + * and you have to be careful if mapping another page as source + * or destination for a copy (IOW, it has to use something else + * than KM_USER0). + */ void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); + + /* + * Undoes ->map(), finishes the virtual mapping of the pipe buffer. + */ void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); + + /* + * ->confirm() verifies that the data in the pipe buffer is there + * and that the contents are good. If the pages in the pipe belong + * to a file system, we may need to wait for IO completion in this + * hook. Returns 0 for good, or a negative error value in case of + * error. + */ int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); + + /* + * When the contents of this pipe buffer has been completely + * consumed by a reader, ->release() is called. + */ void (*release)(struct pipe_inode_info *, struct pipe_buffer *); + + /* + * Attempt to take ownership of the pipe buffer and its contents. + * ->steal() returns 0 for success, in which case the contents + * of the pipe (the buf->page) is locked and now completely owned + * by the caller. The page may then be transferred to a different + * mapping, the most often used case is insertion into different + * file address space cache. + */ int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); + + /* + * Get a reference to the pipe buffer. + */ void (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; -- cgit v1.2.3-70-g09d2