diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/filemap.c | 344 | ||||
-rw-r--r-- | mm/iov_iter.c | 224 | ||||
-rw-r--r-- | mm/process_vm_access.c | 250 | ||||
-rw-r--r-- | mm/shmem.c | 79 | ||||
-rw-r--r-- | mm/slab.c | 183 | ||||
-rw-r--r-- | mm/slob.c | 10 | ||||
-rw-r--r-- | mm/slub.c | 5 | ||||
-rw-r--r-- | mm/util.c | 48 |
9 files changed, 552 insertions, 594 deletions
diff --git a/mm/Makefile b/mm/Makefile index 9e5aaf92197..b484452dac5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o balloon_compaction.o vmacache.o \ - interval_tree.o list_lru.o workingset.o $(mmu-y) + interval_tree.o list_lru.o workingset.o \ + iov_iter.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap.c b/mm/filemap.c index 27ebc0c9571..a82fbe4c9e8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -77,7 +77,7 @@ * ->mmap_sem * ->lock_page (access_process_vm) * - * ->i_mutex (generic_file_buffered_write) + * ->i_mutex (generic_perform_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock @@ -1428,7 +1428,8 @@ static void shrink_readahead_size_eio(struct file *filp, * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position - * @desc: read_descriptor + * @iter: data destination + * @written: already copied * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1436,8 +1437,8 @@ static void shrink_readahead_size_eio(struct file *filp, * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. */ -static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc) +static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, + struct iov_iter *iter, ssize_t written) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1447,12 +1448,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos, pgoff_t prev_index; unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - int error; + int error = 0; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { @@ -1487,7 +1488,7 @@ find_page: if (!page->mapping) goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, - desc, offset)) + offset, iter->count)) goto page_not_up_to_date_locked; unlock_page(page); } @@ -1537,24 +1538,23 @@ page_ok: /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The file_read_actor routine returns how many bytes were - * actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = file_read_actor(desc, page, offset, nr); + + ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; page_cache_release(page); - if (ret == nr && desc->count) - continue; - goto out; + written += ret; + if (!iov_iter_count(iter)) + goto out; + if (ret < nr) { + error = -EFAULT; + goto out; + } + continue; page_not_up_to_date: /* Get exclusive access to the page ... */ @@ -1589,6 +1589,7 @@ readpage: if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); + error = 0; goto find_page; } goto readpage_error; @@ -1619,7 +1620,6 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ - desc->error = error; page_cache_release(page); goto out; @@ -1630,16 +1630,17 @@ no_cached_page: */ page = page_cache_alloc_cold(mapping); if (!page) { - desc->error = -ENOMEM; + error = -ENOMEM; goto out; } error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { page_cache_release(page); - if (error == -EEXIST) + if (error == -EEXIST) { + error = 0; goto find_page; - desc->error = error; + } goto out; } goto readpage; @@ -1652,44 +1653,7 @@ out: *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; file_accessed(filp); -} - -int file_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - - if (size > count) - size = count; - - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. - */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; - } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; + return written ? written : error; } /* @@ -1747,14 +1711,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; + struct iov_iter i; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; + iov_iter_init(&i, iov, nr_segs, count, 0); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { @@ -1776,6 +1741,11 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (retval > 0) { *ppos = pos + retval; count -= retval; + /* + * If we did a short DIO read we need to skip the + * section of the iov that we've already read data into. + */ + iov_iter_advance(&i, retval); } /* @@ -1792,39 +1762,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, } } - count = retval; - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - loff_t offset = 0; - - /* - * If we did a short DIO read we need to skip the section of the - * iov that we've already read data into. - */ - if (count) { - if (count > iov[seg].iov_len) { - count -= iov[seg].iov_len; - continue; - } - offset = count; - count = 0; - } - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base + offset; - desc.count = iov[seg].iov_len - offset; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp, ppos, &desc); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } + retval = do_generic_file_read(filp, ppos, &i, retval); out: return retval; } @@ -2335,150 +2273,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. - */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - -void iov_iter_advance(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - /* * Performs necessary checks before doing a write * @@ -2585,7 +2379,7 @@ EXPORT_SYMBOL(pagecache_write_end); ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, + unsigned long *nr_segs, loff_t pos, size_t count, size_t ocount) { struct file *file = iocb->ki_filp; @@ -2646,7 +2440,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = pos; + iocb->ki_pos = pos; } out: return written; @@ -2692,7 +2486,7 @@ found: } EXPORT_SYMBOL(grab_cache_page_write_begin); -static ssize_t generic_perform_write(struct file *file, +ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; @@ -2742,9 +2536,7 @@ again: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); @@ -2782,27 +2574,7 @@ again: return written ? written : status; } - -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) -{ - struct file *file = iocb->ki_filp; - ssize_t status; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); - status = generic_perform_write(file, &i, pos); - - if (likely(status >= 0)) { - written += status; - *ppos = pos + status; - } - - return written ? written : status; -} -EXPORT_SYMBOL(generic_file_buffered_write); +EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_aio_write - write data to a file @@ -2824,16 +2596,18 @@ EXPORT_SYMBOL(generic_file_buffered_write); * avoid syncing under i_mutex. */ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) + unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - loff_t pos; - ssize_t written; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; ssize_t err; + ssize_t status; + struct iov_iter from; ocount = 0; err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); @@ -2841,12 +2615,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - pos = *ppos; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - written = 0; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; @@ -2862,45 +2633,47 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; + iov_iter_init(&from, iov, nr_segs, count, 0); + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - ssize_t written_buffered; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - ppos, count, ocount); + written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos, + count, ocount); if (written < 0 || written == count) goto out; + iov_iter_advance(&from, written); + /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ pos += written; count -= written; - written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, ppos, count, - written); + + status = generic_perform_write(file, &from, pos); /* - * If generic_file_buffered_write() retuned a synchronous error + * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were * direct-written, or the error code if that was zero. Note * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (written_buffered < 0) { - err = written_buffered; + if (unlikely(status < 0) && !written) { + err = status; goto out; } - + iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. */ - endbyte = pos + written_buffered - written - 1; + endbyte = pos + status - 1; err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { - written = written_buffered; + written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); @@ -2911,8 +2684,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + written = generic_perform_write(file, &from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; } out: current->backing_dev_info = NULL; @@ -2941,7 +2715,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (ret > 0) { diff --git a/mm/iov_iter.c b/mm/iov_iter.c new file mode 100644 index 00000000000..10e46cd721d --- /dev/null +++ b/mm/iov_iter.c @@ -0,0 +1,224 @@ +#include <linux/export.h> +#include <linux/uio.h> +#include <linux/pagemap.h> + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} +EXPORT_SYMBOL(copy_page_to_iter); + +static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. + */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index cb79065c19e..8505c9262b3 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -23,129 +23,44 @@ /** * process_vm_rw_pages - read/write pages from task specified - * @task: task to read/write from - * @mm: mm for task - * @process_pages: struct pages area that can store at least - * nr_pages_to_copy struct page pointers - * @pa: address of page in task to start copying from/to + * @pages: array of pointers to pages we want to copy * @start_offset: offset in page to start copying from/to * @len: number of bytes to copy - * @lvec: iovec array specifying where to copy to/from - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to - * @nr_pages_to_copy: number of pages to copy - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success, error code otherwise */ -static int process_vm_rw_pages(struct task_struct *task, - struct mm_struct *mm, - struct page **process_pages, - unsigned long pa, - unsigned long start_offset, - unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, - int vm_write, - unsigned int nr_pages_to_copy, - ssize_t *bytes_copied) +static int process_vm_rw_pages(struct page **pages, + unsigned offset, + size_t len, + struct iov_iter *iter, + int vm_write) { - int pages_pinned; - void *target_kaddr; - int pgs_copied = 0; - int j; - int ret; - ssize_t bytes_to_copy; - ssize_t rc = 0; - - *bytes_copied = 0; - - /* Get the pages we're interested in */ - down_read(&mm->mmap_sem); - pages_pinned = get_user_pages(task, mm, pa, - nr_pages_to_copy, - vm_write, 0, process_pages, NULL); - up_read(&mm->mmap_sem); - - if (pages_pinned != nr_pages_to_copy) { - rc = -EFAULT; - goto end; - } - /* Do the copy for each page */ - for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); - pgs_copied++) { - /* Make sure we have a non zero length iovec */ - while (*lvec_current < lvec_cnt - && lvec[*lvec_current].iov_len == 0) - (*lvec_current)++; - if (*lvec_current == lvec_cnt) - break; - - /* - * Will copy smallest of: - * - bytes remaining in page - * - bytes remaining in destination iovec - */ - bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, - len - *bytes_copied); - bytes_to_copy = min_t(ssize_t, bytes_to_copy, - lvec[*lvec_current].iov_len - - *lvec_offset); - - target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; - - if (vm_write) - ret = copy_from_user(target_kaddr, - lvec[*lvec_current].iov_base - + *lvec_offset, - bytes_to_copy); - else - ret = copy_to_user(lvec[*lvec_current].iov_base - + *lvec_offset, - target_kaddr, bytes_to_copy); - kunmap(process_pages[pgs_copied]); - if (ret) { - *bytes_copied += bytes_to_copy - ret; - pgs_copied++; - rc = -EFAULT; - goto end; - } - *bytes_copied += bytes_to_copy; - *lvec_offset += bytes_to_copy; - if (*lvec_offset == lvec[*lvec_current].iov_len) { - /* - * Need to copy remaining part of page into the - * next iovec if there are any bytes left in page - */ - (*lvec_current)++; - *lvec_offset = 0; - start_offset = (start_offset + bytes_to_copy) - % PAGE_SIZE; - if (start_offset) - pgs_copied--; + while (len && iov_iter_count(iter)) { + struct page *page = *pages++; + size_t copy = PAGE_SIZE - offset; + size_t copied; + + if (copy > len) + copy = len; + + if (vm_write) { + if (copy > iov_iter_count(iter)) + copy = iov_iter_count(iter); + copied = iov_iter_copy_from_user(page, iter, + offset, copy); + iov_iter_advance(iter, copied); + set_page_dirty_lock(page); } else { - start_offset = 0; - } - } - -end: - if (vm_write) { - for (j = 0; j < pages_pinned; j++) { - if (j < pgs_copied) - set_page_dirty_lock(process_pages[j]); - put_page(process_pages[j]); + copied = copy_page_to_iter(page, offset, copy, iter); } - } else { - for (j = 0; j < pages_pinned; j++) - put_page(process_pages[j]); + len -= copied; + if (copied < copy && iov_iter_count(iter)) + return -EFAULT; + offset = 0; } - - return rc; + return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ @@ -155,67 +70,60 @@ end: * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from - * @lvec: iovec array specifying where to copy to/from locally - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, + struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, - int vm_write, - ssize_t *bytes_copied) + int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; - ssize_t bytes_copied_loop; ssize_t rc = 0; - unsigned long nr_pages_copied = 0; - unsigned long nr_pages_to_copy; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - *bytes_copied = 0; - /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { - nr_pages_to_copy = min(nr_pages - nr_pages_copied, - max_pages_per_loop); + while (!rc && nr_pages && iov_iter_count(iter)) { + int pages = min(nr_pages, max_pages_per_loop); + size_t bytes; - rc = process_vm_rw_pages(task, mm, process_pages, pa, - start_offset, len, - lvec, lvec_cnt, - lvec_current, lvec_offset, - vm_write, nr_pages_to_copy, - &bytes_copied_loop); - start_offset = 0; - *bytes_copied += bytes_copied_loop; + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages = get_user_pages(task, mm, pa, pages, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); - if (rc < 0) { - return rc; - } else { - len -= bytes_copied_loop; - nr_pages_copied += nr_pages_to_copy; - pa += nr_pages_to_copy * PAGE_SIZE; - } + if (pages <= 0) + return -EFAULT; + + bytes = pages * PAGE_SIZE - start_offset; + if (bytes > len) + bytes = len; + + rc = process_vm_rw_pages(process_pages, + start_offset, bytes, iter, + vm_write); + len -= bytes; + start_offset = 0; + nr_pages -= pages; + pa += pages * PAGE_SIZE; + while (pages) + put_page(process_pages[--pages]); } return rc; @@ -228,8 +136,7 @@ static int process_vm_rw_single_vec(unsigned long addr, /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to - * @lvec: iovec array specifying where to copy to/from locally - * @liovcnt: size of lvec array + * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused @@ -238,8 +145,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * return less bytes than expected if an error occurs during the copying * process. */ -static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, - unsigned long liovcnt, +static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) @@ -250,13 +156,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, struct mm_struct *mm; unsigned long i; ssize_t rc = 0; - ssize_t bytes_copied_loop; - ssize_t bytes_copied = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; - unsigned long iov_l_curr_idx = 0; - size_t iov_l_curr_offset = 0; ssize_t iov_len; + size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need @@ -310,24 +213,20 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto put_task_struct; } - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, - process_pages, mm, task, vm_write, &bytes_copied_loop); - bytes_copied += bytes_copied_loop; - if (rc != 0) { - /* If we have managed to copy any data at all then - we return the number of bytes copied. Otherwise - we return the error code */ - if (bytes_copied) - rc = bytes_copied; - goto put_mm; - } - } + iter, process_pages, mm, task, vm_write); + + /* copied = space before - space after */ + total_len -= iov_iter_count(iter); + + /* If we have managed to copy any data at all then + we return the number of bytes copied. Otherwise + we return the error code */ + if (total_len) + rc = total_len; - rc = bytes_copied; -put_mm: mmput(mm); put_task_struct: @@ -363,6 +262,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc; if (flags != 0) @@ -378,13 +278,14 @@ static ssize_t process_vm_rw(pid_t pid, if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, iov_l, liovcnt, rc, 0); + rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) @@ -424,6 +325,7 @@ compat_process_vm_rw(compat_pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc = -EFAULT; if (flags != 0) @@ -439,14 +341,14 @@ compat_process_vm_rw(compat_pid_t pid, &iov_l); if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, iov_l, liovcnt, rc, 0); rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) diff --git a/mm/shmem.c b/mm/shmem.c index 70273f8df58..9f70e02111c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1402,13 +1402,25 @@ shmem_write_end(struct file *file, struct address_space *mapping, return copied; } -static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +static ssize_t shmem_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = file_inode(filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; enum sgp_type sgp = SGP_READ; + int error = 0; + ssize_t retval; + size_t count; + loff_t *ppos = &iocb->ki_pos; + struct iov_iter iter; + + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + iov_iter_init(&iter, iov, nr_segs, count, 0); /* * Might this read be for a stacking filesystem? Then when reading @@ -1436,10 +1448,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ break; } - desc->error = shmem_getpage(inode, index, &page, sgp, NULL); - if (desc->error) { - if (desc->error == -EINVAL) - desc->error = 0; + error = shmem_getpage(inode, index, &page, sgp, NULL); + if (error) { + if (error == -EINVAL) + error = 0; break; } if (page) @@ -1483,61 +1495,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); + ret = copy_page_to_iter(page, offset, nr, &iter); + retval += ret; offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); - if (ret != nr || !desc->count) + if (!iov_iter_count(&iter)) break; - + if (ret < nr) { + error = -EFAULT; + break; + } cond_resched(); } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - file_accessed(filp); -} - -static ssize_t shmem_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; - - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } - return retval; + file_accessed(file); + return retval ? retval : error; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, @@ -1576,7 +1553,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + nr_pages = min(req_pages, spd.nr_pages_max); spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); diff --git a/mm/slab.c b/mm/slab.c index 3db4cb06e32..388cb1ae6fb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -157,6 +157,17 @@ #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif +#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ + <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) + +#if FREELIST_BYTE_INDEX +typedef unsigned char freelist_idx_t; +#else +typedef unsigned short freelist_idx_t; +#endif + +#define SLAB_OBJ_MAX_NUM (1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) + /* * true if a page was allocated from pfmemalloc reserves for network-based * swap @@ -277,8 +288,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. */ -#define REAPTIMEOUT_CPUC (2*HZ) -#define REAPTIMEOUT_LIST3 (4*HZ) +#define REAPTIMEOUT_AC (2*HZ) +#define REAPTIMEOUT_NODE (4*HZ) #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) @@ -565,9 +576,31 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static size_t slab_mgmt_size(size_t nr_objs, size_t align) +static int calculate_nr_objs(size_t slab_size, size_t buffer_size, + size_t idx_size, size_t align) { - return ALIGN(nr_objs * sizeof(unsigned int), align); + int nr_objs; + size_t freelist_size; + + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = slab_size / (buffer_size + idx_size); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + freelist_size = slab_size - nr_objs * buffer_size; + if (freelist_size < ALIGN(nr_objs * idx_size, align)) + nr_objs--; + + return nr_objs; } /* @@ -600,25 +633,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, nr_objs = slab_size / buffer_size; } else { - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) - nr_objs--; - - mgmt_size = slab_mgmt_size(nr_objs, align); + nr_objs = calculate_nr_objs(slab_size, buffer_size, + sizeof(freelist_idx_t), align); + mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -1067,7 +1084,7 @@ static int init_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { /* - * Set up the size64 kmemlist for cpu before we can + * Set up the kmem_cache_node for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ @@ -1076,12 +1093,12 @@ static int init_cache_node_node(int node) if (!n) return -ENOMEM; kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; /* - * The l3s don't come and go as CPUs come and - * go. slab_mutex is sufficient + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient * protection here. */ cachep->node[node] = n; @@ -1406,8 +1423,8 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) for_each_online_node(node) { cachep->node[node] = &init_kmem_cache_node[index + node]; cachep->node[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; } } @@ -2010,6 +2027,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (!num) continue; + /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ + if (num > SLAB_OBJ_MAX_NUM) + break; + if (flags & CFLGS_OFF_SLAB) { /* * Max number of objs-per-slab for caches which @@ -2017,7 +2038,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, * looping condition in cache_grow(). */ offslab_limit = size; - offslab_limit /= sizeof(unsigned int); + offslab_limit /= sizeof(freelist_idx_t); if (num > offslab_limit) break; @@ -2103,8 +2124,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } cachep->node[numa_mem_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; cpu_cache_get(cachep)->avail = 0; cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; @@ -2243,7 +2264,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2252,6 +2273,12 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) flags |= CFLGS_OFF_SLAB; size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); left_over = calculate_slab_order(cachep, size, cachep->align, flags); @@ -2259,7 +2286,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) return -E2BIG; freelist_size = - ALIGN(cachep->num * sizeof(unsigned int), cachep->align); + ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2272,7 +2299,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - freelist_size = cachep->num * sizeof(unsigned int); + freelist_size = cachep->num * sizeof(freelist_idx_t); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2300,10 +2327,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* - * This is a possibility for one of the malloc_sizes caches. + * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, - * this should not happen at all. + * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); @@ -2511,14 +2538,17 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) /* * Get the memory for a slab management obj. - * For a slab cache when the slab descriptor is off-slab, slab descriptors - * always come from malloc_sizes caches. The slab descriptor cannot - * come from the same cache which is getting created because, - * when we are searching for an appropriate cache for these - * descriptors in kmem_cache_create, we search through the malloc_sizes array. - * If we are creating a malloc_sizes cache here it would not be visible to - * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have freelist_cache same as the original cache. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * Because if it is the case, that means we defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init(). */ static void *alloc_slabmgmt(struct kmem_cache *cachep, struct page *page, int colour_off, @@ -2542,9 +2572,15 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, return freelist; } -static inline unsigned int *slab_freelist(struct page *page) +static inline freelist_idx_t get_free_obj(struct page *page, unsigned char idx) { - return (unsigned int *)(page->freelist); + return ((freelist_idx_t *)page->freelist)[idx]; +} + +static inline void set_free_obj(struct page *page, + unsigned char idx, freelist_idx_t val) +{ + ((freelist_idx_t *)(page->freelist))[idx] = val; } static void cache_init_objs(struct kmem_cache *cachep, @@ -2589,7 +2625,7 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_freelist(page)[i] = i; + set_free_obj(page, i, i); } } @@ -2608,7 +2644,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, { void *objp; - objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); + objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; #if DEBUG WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); @@ -2629,7 +2665,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { - if (slab_freelist(page)[i] == objnr) { + if (get_free_obj(page, i) == objnr) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); @@ -2637,7 +2673,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, } #endif page->active--; - slab_freelist(page)[page->active] = objnr; + set_free_obj(page, page->active, objnr); } /* @@ -2886,9 +2922,9 @@ retry: /* move slabp to correct slabp list: */ list_del(&page->lru); if (page->active == cachep->num) - list_add(&page->list, &n->slabs_full); + list_add(&page->lru, &n->slabs_full); else - list_add(&page->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); } must_grow: @@ -3245,11 +3281,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); - if (likely(ptr)) + if (likely(ptr)) { kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && ptr)) - memset(ptr, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(ptr, 0, cachep->object_size); + } return ptr; } @@ -3310,17 +3346,17 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) flags); prefetchw(objp); - if (likely(objp)) + if (likely(objp)) { kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && objp)) - memset(objp, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(objp, 0, cachep->object_size); + } return objp; } /* - * Caller needs to acquire correct kmem_list's list_lock + * Caller needs to acquire correct kmem_cache_node's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) @@ -3574,11 +3610,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; - /* If you want to save a few bytes .text space: replace - * __ with kmem_. - * Then kmalloc uses the uninlined functions instead of the inline - * functions. - */ cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; @@ -3670,7 +3701,7 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) +static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) { int node; struct kmem_cache_node *n; @@ -3726,8 +3757,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; n->shared = new_shared; n->alien = new_alien; n->free_limit = (1 + nr_cpus_node(node)) * @@ -3813,7 +3844,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, kfree(ccold); } kfree(new); - return alloc_kmemlist(cachep, gfp); + return alloc_kmem_cache_node(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -3982,7 +4013,7 @@ static void cache_reap(struct work_struct *w) if (time_after(n->next_reap, jiffies)) goto next; - n->next_reap = jiffies + REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE; drain_array(searchp, n, n->shared, 0, node); @@ -4003,7 +4034,7 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } #ifdef CONFIG_SLABINFO @@ -4210,7 +4241,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, for (j = page->active; j < c->num; j++) { /* Skip freed item */ - if (slab_freelist(page)[j] == i) { + if (get_free_obj(page, j) == i) { active = false; break; } diff --git a/mm/slob.c b/mm/slob.c index 4bf8809dfcc..730cad45d4b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -111,13 +111,13 @@ static inline int slob_page_free(struct page *sp) static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->list, list); + list_add(&sp->lru, list); __SetPageSlobFree(sp); } static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->list); + list_del(&sp->lru); __ClearPageSlobFree(sp); } @@ -282,7 +282,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, list) { + list_for_each_entry(sp, slob_list, lru) { #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -296,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) continue; /* Attempt to alloc */ - prev = sp->list.prev; + prev = sp->lru.prev; b = slob_page_alloc(sp, size, align); if (!b) continue; @@ -322,7 +322,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); sp->freelist = b; - INIT_LIST_HEAD(&sp->list); + INIT_LIST_HEAD(&sp->lru); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align); diff --git a/mm/slub.c b/mm/slub.c index f620bbf4054..5e234f1f885 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1352,11 +1352,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; + alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(flags, node, oo); + page = alloc_slab_page(alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); @@ -1366,7 +1367,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); /* * Objects from caches that have a constructor don't get diff --git a/mm/util.c b/mm/util.c index d7813e6d4cc..f380af7ea77 100644 --- a/mm/util.c +++ b/mm/util.c @@ -446,6 +446,54 @@ unsigned long vm_commit_limit(void) return allowed; } +/** + * get_cmdline() - copy the cmdline value to a buffer. + * @task: the task whose cmdline value to copy. + * @buffer: the buffer to copy to. + * @buflen: the length of the buffer. Larger cmdline values are truncated + * to this length. + * Returns the size of the cmdline field copied. Note that the copy does + * not guarantee an ending NULL byte. + */ +int get_cmdline(struct task_struct *task, char *buffer, int buflen) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + + len = mm->arg_end - mm->arg_start; + + if (len > buflen) + len = buflen; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + /* + * If the nul at the end of args has been overwritten, then + * assume application is using setproctitle(3). + */ + if (res > 0 && buffer[res-1] != '\0' && len < buflen) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > buflen - res) + len = buflen - res; + res += access_process_vm(task, mm->env_start, + buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); |