summaryrefslogtreecommitdiffstats
path: root/mm/filemap.c
diff options
context:
space:
mode:
authorPaul Moore <pmoore@redhat.com>2014-08-05 15:44:22 -0400
committerPaul Moore <pmoore@redhat.com>2014-08-05 15:44:22 -0400
commitaa9e0de81b5b257f6dae48efe2ed5f255f066497 (patch)
tree9b0b791d5912368006115427e74105cfe26750bd /mm/filemap.c
parent4fbe63d1c773cceef3fe1f6ed0c9c268f4f24760 (diff)
parent19583ca584d6f574384e17fe7613dfaeadcdc4a6 (diff)
Merge tag 'v3.16' into next
Linux 3.16
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c403
1 files changed, 157 insertions, 246 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 088358c8006..900edfaf6df 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -742,7 +742,7 @@ void unlock_page(struct page *page)
{
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -753,17 +753,51 @@ EXPORT_SYMBOL(unlock_page);
*/
void end_page_writeback(struct page *page)
{
- if (TestClearPageReclaim(page))
+ /*
+ * TestClearPageReclaim could be used here but it is an atomic
+ * operation and overkill in this particular case. Failing to
+ * shuffle a page marked for immediate reclaim is too mild to
+ * justify taking an atomic operation penalty at the end of
+ * ever page writeback.
+ */
+ if (PageReclaim(page)) {
+ ClearPageReclaim(page);
rotate_reclaimable_page(page);
+ }
if (!test_clear_page_writeback(page))
BUG();
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
+/*
+ * After completing I/O on a page, call this routine to update the page
+ * flags appropriately
+ */
+void page_endio(struct page *page, int rw, int err)
+{
+ if (rw == READ) {
+ if (!err) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } else { /* rw == WRITE */
+ if (err) {
+ SetPageError(page);
+ if (page->mapping)
+ mapping_set_error(page->mapping, err);
+ }
+ end_page_writeback(page);
+ }
+}
+EXPORT_SYMBOL_GPL(page_endio);
+
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
@@ -957,26 +991,6 @@ out:
EXPORT_SYMBOL(find_get_entry);
/**
- * find_get_page - find and get a page reference
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
- *
- * Otherwise, %NULL is returned.
- */
-struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
-{
- struct page *page = find_get_entry(mapping, offset);
-
- if (radix_tree_exceptional_entry(page))
- page = NULL;
- return page;
-}
-EXPORT_SYMBOL(find_get_page);
-
-/**
* find_lock_entry - locate, pin and lock a page cache entry
* @mapping: the address_space to search
* @offset: the page cache index
@@ -1013,66 +1027,87 @@ repeat:
EXPORT_SYMBOL(find_lock_entry);
/**
- * find_lock_page - locate, pin and lock a pagecache page
+ * pagecache_get_page - find and get a page reference
* @mapping: the address_space to search
* @offset: the page index
+ * @fgp_flags: PCG flags
+ * @cache_gfp_mask: gfp mask to use for the page cache data page allocation
+ * @radix_gfp_mask: gfp mask to use for radix tree node allocation
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
- *
- * Otherwise, %NULL is returned.
- *
- * find_lock_page() may sleep.
- */
-struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
-{
- struct page *page = find_lock_entry(mapping, offset);
-
- if (radix_tree_exceptional_entry(page))
- page = NULL;
- return page;
-}
-EXPORT_SYMBOL(find_lock_page);
-
-/**
- * find_or_create_page - locate or add a pagecache page
- * @mapping: the page's address_space
- * @index: the page's index into the mapping
- * @gfp_mask: page allocation mode
+ * Looks up the page cache slot at @mapping & @offset.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * PCG flags modify how the page is returned.
*
- * If the page is not present, a new page is allocated using @gfp_mask
- * and added to the page cache and the VM's LRU list. The page is
- * returned locked and with an increased refcount.
+ * FGP_ACCESSED: the page will be marked accessed
+ * FGP_LOCK: Page is return locked
+ * FGP_CREAT: If page is not present then a new page is allocated using
+ * @cache_gfp_mask and added to the page cache and the VM's LRU
+ * list. If radix tree nodes are allocated during page cache
+ * insertion then @radix_gfp_mask is used. The page is returned
+ * locked and with an increased refcount. Otherwise, %NULL is
+ * returned.
*
- * On memory exhaustion, %NULL is returned.
+ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
+ * if the GFP flags specified for FGP_CREAT are atomic.
*
- * find_or_create_page() may sleep, even if @gfp_flags specifies an
- * atomic allocation!
+ * If there is a page cache page, it is returned with an increased refcount.
*/
-struct page *find_or_create_page(struct address_space *mapping,
- pgoff_t index, gfp_t gfp_mask)
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
+ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
{
struct page *page;
- int err;
+
repeat:
- page = find_lock_page(mapping, index);
- if (!page) {
- page = __page_cache_alloc(gfp_mask);
+ page = find_get_entry(mapping, offset);
+ if (radix_tree_exceptional_entry(page))
+ page = NULL;
+ if (!page)
+ goto no_page;
+
+ if (fgp_flags & FGP_LOCK) {
+ if (fgp_flags & FGP_NOWAIT) {
+ if (!trylock_page(page)) {
+ page_cache_release(page);
+ return NULL;
+ }
+ } else {
+ lock_page(page);
+ }
+
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON_PAGE(page->index != offset, page);
+ }
+
+ if (page && (fgp_flags & FGP_ACCESSED))
+ mark_page_accessed(page);
+
+no_page:
+ if (!page && (fgp_flags & FGP_CREAT)) {
+ int err;
+ if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ cache_gfp_mask |= __GFP_WRITE;
+ if (fgp_flags & FGP_NOFS) {
+ cache_gfp_mask &= ~__GFP_FS;
+ radix_gfp_mask &= ~__GFP_FS;
+ }
+
+ page = __page_cache_alloc(cache_gfp_mask);
if (!page)
return NULL;
- /*
- * We want a regular kernel memory (not highmem or DMA etc)
- * allocation for the radix tree nodes, but we need to honour
- * the context-specific requirements the caller has asked for.
- * GFP_RECLAIM_MASK collects those requirements.
- */
- err = add_to_page_cache_lru(page, mapping, index,
- (gfp_mask & GFP_RECLAIM_MASK));
+
+ if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ fgp_flags |= FGP_LOCK;
+
+ /* Init accessed so avoit atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ init_page_accessed(page);
+
+ err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -1080,9 +1115,10 @@ repeat:
goto repeat;
}
}
+
return page;
}
-EXPORT_SYMBOL(find_or_create_page);
+EXPORT_SYMBOL(pagecache_get_page);
/**
* find_get_entries - gang pagecache lookup
@@ -1379,39 +1415,6 @@ repeat:
}
EXPORT_SYMBOL(find_get_pages_tag);
-/**
- * grab_cache_page_nowait - returns locked page at given index in given cache
- * @mapping: target address_space
- * @index: the page index
- *
- * Same as grab_cache_page(), but do not wait if the page is unavailable.
- * This is intended for speculative data generators, where the data can
- * be regenerated if the page couldn't be grabbed. This routine should
- * be safe to call while holding the lock for another page.
- *
- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
- * and deadlock against the caller's locked page.
- */
-struct page *
-grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
-{
- struct page *page = find_get_page(mapping, index);
-
- if (page) {
- if (trylock_page(page))
- return page;
- page_cache_release(page);
- return NULL;
- }
- page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
- page_cache_release(page);
- page = NULL;
- }
- return page;
-}
-EXPORT_SYMBOL(grab_cache_page_nowait);
-
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -1665,96 +1668,42 @@ out:
return written ? written : error;
}
-/*
- * Performs necessary checks before doing a write
- * @iov: io vector request
- * @nr_segs: number of segments in the iovec
- * @count: number of bytes to write
- * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
- *
- * Adjust number of segments and amount of bytes to write (nr_segs should be
- * properly initialized first). Returns appropriate error code that caller
- * should return or zero in case that write should be allowed.
- */
-int generic_segment_checks(const struct iovec *iov,
- unsigned long *nr_segs, size_t *count, int access_flags)
-{
- unsigned long seg;
- size_t cnt = 0;
- for (seg = 0; seg < *nr_segs; seg++) {
- const struct iovec *iv = &iov[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- cnt += iv->iov_len;
- if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
- return -EINVAL;
- if (access_ok(access_flags, iv->iov_base, iv->iov_len))
- continue;
- if (seg == 0)
- return -EFAULT;
- *nr_segs = seg;
- cnt -= iv->iov_len; /* This segment is no good */
- break;
- }
- *count = cnt;
- return 0;
-}
-EXPORT_SYMBOL(generic_segment_checks);
-
/**
- * generic_file_aio_read - generic filesystem read routine
+ * generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
- * @iov: io vector request
- * @nr_segs: number of segments in the iovec
- * @pos: current file position
+ * @iter: destination for the data read
*
- * This is the "read()" routine for all filesystems
+ * This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
*/
ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *filp = iocb->ki_filp;
- ssize_t retval;
- size_t count;
+ struct file *file = iocb->ki_filp;
+ ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
- struct iov_iter i;
-
- count = 0;
- retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
- if (retval)
- return retval;
- iov_iter_init(&i, iov, nr_segs, count, 0);
+ loff_t pos = *ppos;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (filp->f_flags & O_DIRECT) {
+ if (file->f_flags & O_DIRECT) {
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
loff_t size;
- struct address_space *mapping;
- struct inode *inode;
- mapping = filp->f_mapping;
- inode = mapping->host;
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
retval = filemap_write_and_wait_range(mapping, pos,
- pos + iov_length(iov, nr_segs) - 1);
+ pos + count - 1);
if (!retval) {
- retval = mapping->a_ops->direct_IO(READ, iocb,
- iov, pos, nr_segs);
+ struct iov_iter data = *iter;
+ retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
}
+
if (retval > 0) {
*ppos = pos + retval;
- count -= retval;
- /*
- * If we did a short DIO read we need to skip the
- * section of the iov that we've already read data into.
- */
- iov_iter_advance(&i, retval);
+ iov_iter_advance(iter, retval);
}
/*
@@ -1765,17 +1714,17 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
* and return. Otherwise fallthrough to buffered io for
* the rest of the read.
*/
- if (retval < 0 || !count || *ppos >= size) {
- file_accessed(filp);
+ if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+ file_accessed(file);
goto out;
}
}
- retval = do_generic_file_read(filp, ppos, &i, retval);
+ retval = do_generic_file_read(file, ppos, iter, retval);
out:
return retval;
}
-EXPORT_SYMBOL(generic_file_aio_read);
+EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
/**
@@ -2381,15 +2330,12 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
{
const struct address_space_operations *aops = mapping->a_ops;
- mark_page_accessed(page);
return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long *nr_segs, loff_t pos,
- size_t count, size_t ocount)
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -2397,11 +2343,9 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
ssize_t written;
size_t write_len;
pgoff_t end;
+ struct iov_iter data;
- if (count != ocount)
- *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
-
- write_len = iov_length(iov, *nr_segs);
+ write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
@@ -2428,7 +2372,8 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
}
- written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+ data = *from;
+ written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2445,6 +2390,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (written > 0) {
pos += written;
+ iov_iter_advance(from, written);
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
@@ -2463,34 +2409,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
- int status;
- gfp_t gfp_mask;
struct page *page;
- gfp_t gfp_notmask = 0;
+ int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
- gfp_mask = mapping_gfp_mask(mapping);
- if (mapping_cap_account_dirty(mapping))
- gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
- gfp_notmask = __GFP_FS;
-repeat:
- page = find_lock_page(mapping, index);
+ fgp_flags |= FGP_NOFS;
+
+ page = pagecache_get_page(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping),
+ GFP_KERNEL);
if (page)
- goto found;
+ wait_for_stable_page(page);
- page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
- if (!page)
- return NULL;
- status = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & ~gfp_notmask);
- if (unlikely(status)) {
- page_cache_release(page);
- if (status == -EEXIST)
- goto repeat;
- return NULL;
- }
-found:
- wait_for_stable_page(page);
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2539,7 +2469,7 @@ again:
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
- if (unlikely(status))
+ if (unlikely(status < 0))
break;
if (mapping_writably_mapped(mapping))
@@ -2548,7 +2478,6 @@ again:
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
flush_dcache_page(page);
- mark_page_accessed(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
@@ -2586,10 +2515,9 @@ again:
EXPORT_SYMBOL(generic_perform_write);
/**
- * __generic_file_aio_write - write data to a file
+ * __generic_file_write_iter - write data to a file
* @iocb: IO state structure (file, offset, etc.)
- * @iov: vector with data to write
- * @nr_segs: number of segments in the vector
+ * @from: iov_iter with data to write
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
@@ -2603,26 +2531,16 @@ EXPORT_SYMBOL(generic_perform_write);
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
*/
-ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs)
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
loff_t pos = iocb->ki_pos;
ssize_t written = 0;
ssize_t err;
ssize_t status;
- struct iov_iter from;
-
- ocount = 0;
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err)
- return err;
-
- count = ocount;
+ size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -2633,6 +2551,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
+ iov_iter_truncate(from, count);
+
err = file_remove_suid(file);
if (err)
goto out;
@@ -2641,17 +2561,13 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
- iov_iter_init(&from, iov, nr_segs, count, 0);
-
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
loff_t endbyte;
- written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos,
- count, ocount);
+ written = generic_file_direct_write(iocb, from, pos);
if (written < 0 || written == count)
goto out;
- iov_iter_advance(&from, written);
/*
* direct-io write to a hole: fall through to buffered I/O
@@ -2660,7 +2576,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
pos += written;
count -= written;
- status = generic_perform_write(file, &from, pos);
+ status = generic_perform_write(file, from, pos);
/*
* If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
@@ -2692,7 +2608,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
*/
}
} else {
- written = generic_perform_write(file, &from, pos);
+ written = generic_perform_write(file, from, pos);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
}
@@ -2700,30 +2616,25 @@ out:
current->backing_dev_info = NULL;
return written ? written : err;
}
-EXPORT_SYMBOL(__generic_file_aio_write);
+EXPORT_SYMBOL(__generic_file_write_iter);
/**
- * generic_file_aio_write - write data to a file
+ * generic_file_write_iter - write data to a file
* @iocb: IO state structure
- * @iov: vector with data to write
- * @nr_segs: number of segments in the vector
- * @pos: position in file where to write
+ * @from: iov_iter with data to write
*
- * This is a wrapper around __generic_file_aio_write() to be used by most
+ * This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
*/
-ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- BUG_ON(iocb->ki_pos != pos);
-
mutex_lock(&inode->i_mutex);
- ret = __generic_file_aio_write(iocb, iov, nr_segs);
+ ret = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
@@ -2735,7 +2646,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
}
return ret;
}
-EXPORT_SYMBOL(generic_file_aio_write);
+EXPORT_SYMBOL(generic_file_write_iter);
/**
* try_to_release_page() - release old fs-specific metadata on a page