Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c  |  16
-rw-r--r-- | mm/bounce.c       |   2
-rw-r--r-- | mm/filemap.c      |  46
-rw-r--r-- | mm/filemap_xip.c  |  48
-rw-r--r-- | mm/madvise.c      |  22
-rw-r--r-- | mm/migrate.c      |  15
-rw-r--r-- | mm/nommu.c        |  29
-rw-r--r-- | mm/oom_kill.c     |   6
-rw-r--r-- | mm/rmap.c         |   4
-rw-r--r-- | mm/shmem.c        | 132
-rw-r--r-- | mm/slab.c         |   4
11 files changed, 257 insertions, 67 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9d..e5de3781d3f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+long congestion_wait_interruptible(int rw, long timeout)
+{
+	long ret;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
+	if (signal_pending(current))
+		ret = -ERESTARTSYS;
+	else
+		ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+	return ret;
+}
+EXPORT_SYMBOL(congestion_wait_interruptible);
+
 /**
  * congestion_end - wake up sleepers on a congested backing_dev_info
  * @rw: READ or WRITE
diff --git a/mm/bounce.c b/mm/bounce.c
index 643efbe8240..ad401fc5744 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,7 +204,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
 		/*
 		 * is destination page below bounce pfn?
 		 */
-		if (page_to_pfn(page) < q->bounce_pfn)
+		if (page_to_pfn(page) <= q->bounce_pfn)
 			continue;
 
 		/*
diff --git a/mm/filemap.c b/mm/filemap.c
index d1060b8d3cd..5dfc093ceb3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2379,7 +2379,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	ssize_t retval;
-	size_t write_len = 0;
+	size_t write_len;
+	pgoff_t end = 0; /* silence gcc */
 
 	/*
 	 * If it's a write, unmap all mmappings of the file up-front. This
@@ -2388,23 +2389,46 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	 */
 	if (rw == WRITE) {
 		write_len = iov_length(iov, nr_segs);
+		end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
 		if (mapping_mapped(mapping))
 			unmap_mapping_range(mapping, offset, write_len, 0);
 	}
 
 	retval = filemap_write_and_wait(mapping);
-	if (retval == 0) {
-		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
-					offset, nr_segs);
-		if (rw == WRITE && mapping->nrpages) {
-			pgoff_t end = (offset + write_len - 1)
-						>> PAGE_CACHE_SHIFT;
-			int err = invalidate_inode_pages2_range(mapping,
+	if (retval)
+		goto out;
+
+	/*
+	 * After a write we want buffered reads to be sure to go to disk to get
+	 * the new data. We invalidate clean cached page from the region we're
+	 * about to write. We do this *before* the write so that we can return
+	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+	 */
+	if (rw == WRITE && mapping->nrpages) {
+		retval = invalidate_inode_pages2_range(mapping,
 					offset >> PAGE_CACHE_SHIFT, end);
-			if (err)
-				retval = err;
-		}
+		if (retval)
+			goto out;
 	}
+
+	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
+	if (retval)
+		goto out;
+
+	/*
+	 * Finally, try again to invalidate clean pages which might have been
+	 * faulted in by get_user_pages() if the source of the write was an
+	 * mmap()ed region of the file we're writing. That's a pretty crazy
+	 * thing to do, so we don't support it 100%. If this invalidation
+	 * fails and we have -EIOCBQUEUED we ignore the failure.
+	 */
+	if (rw == WRITE && mapping->nrpages) {
+		int err = invalidate_inode_pages2_range(mapping,
+					offset >> PAGE_CACHE_SHIFT, end);
+		if (err && retval >= 0)
+			retval = err;
+	}
+out:
 	return retval;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9dd9fbb7513..cbb335813ec 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,29 @@
 #include "filemap.h"
 
 /*
+ * We do use our own empty page to avoid interference with other users
+ * of ZERO_PAGE(), such as /dev/zero
+ */
+static struct page *__xip_sparse_page;
+
+static struct page *xip_sparse_page(void)
+{
+	if (!__xip_sparse_page) {
+		unsigned long zeroes = get_zeroed_page(GFP_HIGHUSER);
+		if (zeroes) {
+			static DEFINE_SPINLOCK(xip_alloc_lock);
+			spin_lock(&xip_alloc_lock);
+			if (!__xip_sparse_page)
+				__xip_sparse_page = virt_to_page(zeroes);
+			else
+				free_page(zeroes);
+			spin_unlock(&xip_alloc_lock);
+		}
+	}
+	return __xip_sparse_page;
+}
+
+/*
  * This is a file read routine for execute in place files, and uses
  * the mapping->a_ops->get_xip_page() function for the actual low-level
  * stuff.
@@ -162,7 +185,7 @@ EXPORT_SYMBOL_GPL(xip_file_sendfile);
  * xip_write
  *
  * This function walks all vmas of the address_space and unmaps the
- * ZERO_PAGE when found at pgoff. Should it go in rmap.c?
+ * __xip_sparse_page when found at pgoff.
  */
 static void
 __xip_unmap (struct address_space * mapping,
@@ -177,13 +200,16 @@ __xip_unmap (struct address_space * mapping,
 	spinlock_t *ptl;
 	struct page *page;
 
+	page = __xip_sparse_page;
+	if (!page)
+		return;
+
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		page = ZERO_PAGE(0);
 		pte = page_check_address(page, mm, address, &ptl);
 		if (pte) {
 			/* Nuke the page table entry. */
@@ -222,16 +248,14 @@ xip_file_nopage(struct vm_area_struct * area,
 		+ area->vm_pgoff;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff >= size) {
-		return NULL;
-	}
+	if (pgoff >= size)
+		return NOPAGE_SIGBUS;
 
 	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
-	if (!IS_ERR(page)) {
+	if (!IS_ERR(page))
 		goto out;
-	}
 	if (PTR_ERR(page) != -ENODATA)
-		return NULL;
+		return NOPAGE_SIGBUS;
 
 	/* sparse block */
 	if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
@@ -241,12 +265,14 @@ xip_file_nopage(struct vm_area_struct * area,
 		page = mapping->a_ops->get_xip_page (mapping,
 			pgoff*(PAGE_SIZE/512), 1);
 		if (IS_ERR(page))
-			return NULL;
+			return NOPAGE_SIGBUS;
 		/* unmap page at pgoff from all other vmas */
 		__xip_unmap(mapping, pgoff);
 	} else {
-		/* not shared and writable, use ZERO_PAGE() */
-		page = ZERO_PAGE(0);
+		/* not shared and writable, use xip_sparse_page() */
+		page = xip_sparse_page();
+		if (!page)
+			return NOPAGE_OOM;
 	}
 
 out:
diff --git a/mm/madvise.c b/mm/madvise.c
index 4e196155a0c..603c5257ed6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -155,10 +155,14 @@ static long madvise_dontneed(struct vm_area_struct * vma,
  * Other filesystems return -ENOSYS.
  */
 static long madvise_remove(struct vm_area_struct *vma,
+				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end)
 {
 	struct address_space *mapping;
-	loff_t offset, endoff;
+	loff_t offset, endoff;
+	int error;
+
+	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
 	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
 		return -EINVAL;
@@ -177,7 +181,12 @@ static long madvise_remove(struct vm_area_struct *vma,
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 	endoff = (loff_t)(end - vma->vm_start - 1)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-	return vmtruncate_range(mapping->host, offset, endoff);
+
+	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+	up_write(&current->mm->mmap_sem);
+	error = vmtruncate_range(mapping->host, offset, endoff);
+	down_write(&current->mm->mmap_sem);
+	return error;
 }
 
 static long
@@ -199,7 +208,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
 	case MADV_REMOVE:
-		error = madvise_remove(vma, start, end);
+		error = madvise_remove(vma, prev, start, end);
 		break;
 
 	case MADV_WILLNEED:
@@ -312,12 +321,15 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 		if (error)
 			goto out;
 		start = tmp;
-		if (start < prev->vm_end)
+		if (prev && start < prev->vm_end)
 			start = prev->vm_end;
 		error = unmapped_error;
 		if (start >= end)
 			goto out;
-		vma = prev->vm_next;
+		if (prev)
+			vma = prev->vm_next;
+		else	/* madvise_remove dropped mmap_sem */
+			vma = find_vma(current->mm, start);
 	}
 out:
 	up_write(&current->mm->mmap_sem);
diff --git a/mm/migrate.c b/mm/migrate.c
index 7a66ca25dc8..a91ca00abeb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -297,7 +297,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	void **pslot;
 
 	if (!mapping) {
-		/* Anonymous page */
+		/* Anonymous page without mapping */
 		if (page_count(page) != 1)
 			return -EAGAIN;
 		return 0;
@@ -333,6 +333,19 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	__put_page(page);
 
+	/*
+	 * If moved to a different zone then also account
+	 * the page for that zone. Other VM counters will be
+	 * taken care of when we establish references to the
+	 * new page and drop references to the old page.
+	 *
+	 * Note that anonymous pages are accounted for
+	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
+	 * are mapped to swap space.
+	 */
+	__dec_zone_page_state(page, NR_FILE_PAGES);
+	__inc_zone_page_state(newpage, NR_FILE_PAGES);
+
 	write_unlock_irq(&mapping->tree_lock);
 
 	return 0;
diff --git a/mm/nommu.c b/mm/nommu.c
index 23fb033e596..1f60194d9b9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -45,6 +45,7 @@ int heap_stack_gap = 0;
 
 EXPORT_SYMBOL(mem_map);
 EXPORT_SYMBOL(__vm_enough_memory);
+EXPORT_SYMBOL(num_physpages);
 
 /* list of shareable VMAs */
 struct rb_root nommu_vma_tree = RB_ROOT;
@@ -826,6 +827,11 @@ unsigned long do_mmap_pgoff(struct file *file,
 		unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		unsigned long vmpglen;
 
+		/* suppress VMA sharing for shared regions */
+		if (vm_flags & VM_SHARED &&
+		    capabilities & BDI_CAP_MAP_DIRECT)
+			goto dont_share_VMAs;
+
 		for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
 			vma = rb_entry(rb, struct vm_area_struct, vm_rb);
 
@@ -859,6 +865,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 			goto shared;
 		}
 
+	dont_share_VMAs:
 		vma = NULL;
 
 		/* obtain the address at which to make a shared mapping
@@ -1193,6 +1200,28 @@ void unmap_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL(unmap_mapping_range);
 
 /*
+ * ask for an unmapped area at which to create a mapping on a file
+ */
+unsigned long get_unmapped_area(struct file *file, unsigned long addr,
+				unsigned long len, unsigned long pgoff,
+				unsigned long flags)
+{
+	unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
+				  unsigned long, unsigned long);
+
+	get_area = current->mm->get_unmapped_area;
+	if (file && file->f_op && file->f_op->get_unmapped_area)
+		get_area = file->f_op->get_unmapped_area;
+
+	if (!get_area)
+		return -ENOSYS;
+
+	return get_area(file, addr, len, pgoff, flags);
+}
+
+EXPORT_SYMBOL(get_unmapped_area);
+
+/*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to
  * succeed and -ENOMEM implies there is not.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b278b8d60ee..3791edfffee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -176,6 +176,8 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 	struct zone **z;
 	nodemask_t nodes;
 	int node;
+
+	nodes_clear(nodes);
 	/* node has memory ? */
 	for_each_online_node(node)
 		if (NODE_DATA(node)->node_present_pages)
@@ -320,7 +322,7 @@ static int oom_kill_task(struct task_struct *p)
 	 * Don't kill the process if any threads are set to OOM_DISABLE
 	 */
 	do_each_thread(g, q) {
-		if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
+		if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
 			return 1;
 	} while_each_thread(g, q);
 
@@ -333,7 +335,7 @@ static int oom_kill_task(struct task_struct *p)
 	 */
 	do_each_thread(g, q) {
 		if (q->mm == mm && q->tgid != p->tgid)
-			force_sig(SIGKILL, p);
+			force_sig(SIGKILL, q);
 	} while_each_thread(g, q);
 
 	return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index 22ed3f71a67..b82146e6dfc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -498,9 +498,9 @@ int page_mkclean(struct page *page)
 		struct address_space *mapping = page_mapping(page);
 		if (mapping)
 			ret = page_mkclean_file(mapping, page);
+		if (page_test_and_clear_dirty(page))
+			ret = 1;
 	}
-	if (page_test_and_clear_dirty(page))
-		ret = 1;
 
 	return ret;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index b8c429a2d27..b2a35ebf071 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -402,26 +402,38 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 /*
  * shmem_free_swp - free some swap entries in a directory
  *
- * @dir:   pointer to the directory
- * @edir:  pointer after last entry of the directory
+ * @dir:        pointer to the directory
+ * @edir:       pointer after last entry of the directory
+ * @punch_lock: pointer to spinlock when needed for the holepunch case
  */
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
+						spinlock_t *punch_lock)
 {
+	spinlock_t *punch_unlock = NULL;
 	swp_entry_t *ptr;
 	int freed = 0;
 
 	for (ptr = dir; ptr < edir; ptr++) {
 		if (ptr->val) {
+			if (unlikely(punch_lock)) {
+				punch_unlock = punch_lock;
+				punch_lock = NULL;
+				spin_lock(punch_unlock);
+				if (!ptr->val)
+					continue;
+			}
 			free_swap_and_cache(*ptr);
 			*ptr = (swp_entry_t){0};
 			freed++;
 		}
 	}
+	if (punch_unlock)
+		spin_unlock(punch_unlock);
 	return freed;
 }
 
-static int shmem_map_and_free_swp(struct page *subdir,
-		int offset, int limit, struct page ***dir)
+static int shmem_map_and_free_swp(struct page *subdir, int offset,
+		int limit, struct page ***dir, spinlock_t *punch_lock)
 {
 	swp_entry_t *ptr;
 	int freed = 0;
@@ -431,7 +443,8 @@ static int shmem_map_and_free_swp(struct page *subdir,
 		int size = limit - offset;
 		if (size > LATENCY_LIMIT)
 			size = LATENCY_LIMIT;
-		freed += shmem_free_swp(ptr+offset, ptr+offset+size);
+		freed += shmem_free_swp(ptr+offset, ptr+offset+size,
+							punch_lock);
 		if (need_resched()) {
 			shmem_swp_unmap(ptr);
 			if (*dir) {
@@ -481,7 +494,10 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 	long nr_swaps_freed = 0;
 	int offset;
 	int freed;
-	int punch_hole = 0;
+	int punch_hole;
+	spinlock_t *needs_lock;
+	spinlock_t *punch_lock;
+	unsigned long upper_limit;
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -492,11 +508,20 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 	info->flags |= SHMEM_TRUNCATE;
 	if (likely(end == (loff_t) -1)) {
 		limit = info->next_index;
+		upper_limit = SHMEM_MAX_INDEX;
 		info->next_index = idx;
+		needs_lock = NULL;
+		punch_hole = 0;
 	} else {
-		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-		if (limit > info->next_index)
-			limit = info->next_index;
+		if (end + 1 >= inode->i_size) {	/* we may free a little more */
+			limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
+							PAGE_CACHE_SHIFT;
+			upper_limit = SHMEM_MAX_INDEX;
+		} else {
+			limit = (end + 1) >> PAGE_CACHE_SHIFT;
+			upper_limit = limit;
+		}
+		needs_lock = &info->lock;
 		punch_hole = 1;
 	}
 
@@ -513,17 +538,30 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 		size = limit;
 		if (size > SHMEM_NR_DIRECT)
 			size = SHMEM_NR_DIRECT;
-		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
+		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
 	}
 
 	/*
 	 * If there are no indirect blocks or we are punching a hole
 	 * below indirect blocks, nothing to be done.
 	 */
-	if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
+	if (!topdir || limit <= SHMEM_NR_DIRECT)
 		goto done2;
 
-	BUG_ON(limit <= SHMEM_NR_DIRECT);
+	/*
+	 * The truncation case has already dropped info->lock, and we're safe
+	 * because i_size and next_index have already been lowered, preventing
+	 * access beyond. But in the punch_hole case, we still need to take
+	 * the lock when updating the swap directory, because there might be
+	 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
+	 * shmem_writepage. However, whenever we find we can remove a whole
+	 * directory page (not at the misaligned start or end of the range),
+	 * we first NULLify its pointer in the level above, and then have no
+	 * need to take the lock when updating its contents: needs_lock and
+	 * punch_lock (either pointing to info->lock or NULL) manage this.
+	 */
+
+	upper_limit -= SHMEM_NR_DIRECT;
 	limit -= SHMEM_NR_DIRECT;
 	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
 	offset = idx % ENTRIES_PER_PAGE;
@@ -543,8 +581,14 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 		if (*dir) {
 			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
 				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
-			if (!diroff && !offset) {
-				*dir = NULL;
+			if (!diroff && !offset && upper_limit >= stage) {
+				if (needs_lock) {
+					spin_lock(needs_lock);
+					*dir = NULL;
+					spin_unlock(needs_lock);
+					needs_lock = NULL;
+				} else
+					*dir = NULL;
 				nr_pages_to_free++;
 				list_add(&middir->lru, &pages_to_free);
 			}
@@ -570,39 +614,55 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 			}
 			stage = idx + ENTRIES_PER_PAGEPAGE;
 			middir = *dir;
-			*dir = NULL;
-			nr_pages_to_free++;
-			list_add(&middir->lru, &pages_to_free);
+			if (punch_hole)
+				needs_lock = &info->lock;
+			if (upper_limit >= stage) {
+				if (needs_lock) {
+					spin_lock(needs_lock);
+					*dir = NULL;
+					spin_unlock(needs_lock);
+					needs_lock = NULL;
+				} else
+					*dir = NULL;
+				nr_pages_to_free++;
+				list_add(&middir->lru, &pages_to_free);
+			}
 			shmem_dir_unmap(dir);
 			cond_resched();
 			dir = shmem_dir_map(middir);
 			diroff = 0;
 		}
+		punch_lock = needs_lock;
 		subdir = dir[diroff];
-		if (subdir && page_private(subdir)) {
+		if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
+			if (needs_lock) {
+				spin_lock(needs_lock);
+				dir[diroff] = NULL;
+				spin_unlock(needs_lock);
+				punch_lock = NULL;
+			} else
+				dir[diroff] = NULL;
+			nr_pages_to_free++;
+			list_add(&subdir->lru, &pages_to_free);
+		}
+		if (subdir && page_private(subdir) /* has swap entries */) {
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
 				size = ENTRIES_PER_PAGE;
 			freed = shmem_map_and_free_swp(subdir,
-						offset, size, &dir);
+					offset, size, &dir, punch_lock);
 			if (!dir)
 				dir = shmem_dir_map(middir);
 			nr_swaps_freed += freed;
-			if (offset)
+			if (offset || punch_lock) {
 				spin_lock(&info->lock);
-			set_page_private(subdir, page_private(subdir) - freed);
-			if (offset)
+				set_page_private(subdir,
+					page_private(subdir) - freed);
 				spin_unlock(&info->lock);
-			if (!punch_hole)
-				BUG_ON(page_private(subdir) > offset);
-		}
-		if (offset)
-			offset = 0;
-		else if (subdir && !page_private(subdir)) {
-			dir[diroff] = NULL;
-			nr_pages_to_free++;
-			list_add(&subdir->lru, &pages_to_free);
+			} else
+				BUG_ON(page_private(subdir) != freed);
 		}
+		offset = 0;
 	}
 done1:
 	shmem_dir_unmap(dir);
@@ -614,8 +674,16 @@ done2:
 		 * generic_delete_inode did it, before we lowered next_index.
 		 * Also, though shmem_getpage checks i_size before adding to
 		 * cache, no recheck after: so fix the narrow window there too.
+		 *
+		 * Recalling truncate_inode_pages_range and unmap_mapping_range
+		 * every time for punch_hole (which never got a chance to clear
+		 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
+		 * yet hardly ever necessary: try to optimize them out later.
 		 */
 		truncate_inode_pages_range(inode->i_mapping, start, end);
+		if (punch_hole)
+			unmap_mapping_range(inode->i_mapping, start,
+						end - start, 1);
 	}
 
 	spin_lock(&info->lock);
diff --git a/mm/slab.c b/mm/slab.c
index 57f7aa42006..4cbac24ae2f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1802,8 +1802,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 			/* Print header */
 			if (lines == 0) {
 				printk(KERN_ERR
-					"Slab corruption: start=%p, len=%d\n",
-					realobj, size);
+					"Slab corruption: %s start=%p, len=%d\n",
+					cachep->name, realobj, size);
 				print_objinfo(cachep, objp, 0);
 			}
 			/* Hexdump the affected line */