diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/allocpercpu.c | 9 | ||||
-rw-r--r-- | mm/bootmem.c | 6 | ||||
-rw-r--r-- | mm/bounce.c | 4 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 13 | ||||
-rw-r--r-- | mm/filemap_xip.c | 4 | ||||
-rw-r--r-- | mm/fremap.c | 4 | ||||
-rw-r--r-- | mm/hugetlb.c | 30 | ||||
-rw-r--r-- | mm/memory.c | 51 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 7 | ||||
-rw-r--r-- | mm/mempolicy.c | 12 | ||||
-rw-r--r-- | mm/migrate.c | 19 | ||||
-rw-r--r-- | mm/mincore.c | 183 | ||||
-rw-r--r-- | mm/mlock.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 14 | ||||
-rw-r--r-- | mm/mmzone.c | 5 | ||||
-rw-r--r-- | mm/nommu.c | 30 | ||||
-rw-r--r-- | mm/oom_kill.c | 62 | ||||
-rw-r--r-- | mm/page-writeback.c | 106 | ||||
-rw-r--r-- | mm/page_alloc.c | 406 | ||||
-rw-r--r-- | mm/page_io.c | 45 | ||||
-rw-r--r-- | mm/pdflush.c | 1 | ||||
-rw-r--r-- | mm/readahead.c | 12 | ||||
-rw-r--r-- | mm/rmap.c | 36 | ||||
-rw-r--r-- | mm/shmem.c | 35 | ||||
-rw-r--r-- | mm/slab.c | 411 | ||||
-rw-r--r-- | mm/slob.c | 27 | ||||
-rw-r--r-- | mm/sparse.c | 23 | ||||
-rw-r--r-- | mm/swap.c | 10 | ||||
-rw-r--r-- | mm/swapfile.c | 102 | ||||
-rw-r--r-- | mm/thrash.c | 116 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 4 | ||||
-rw-r--r-- | mm/truncate.c | 41 | ||||
-rw-r--r-- | mm/vmscan.c | 60 | ||||
-rw-r--r-- | mm/vmstat.c | 22 |
35 files changed, 1224 insertions, 690 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index eaa9abeea53..b2486cf887a 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -17,10 +17,9 @@ void percpu_depopulate(void *__pdata, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); - if (pdata->ptrs[cpu]) { - kfree(pdata->ptrs[cpu]); - pdata->ptrs[cpu] = NULL; - } + + kfree(pdata->ptrs[cpu]); + pdata->ptrs[cpu] = NULL; } EXPORT_SYMBOL_GPL(percpu_depopulate); @@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask); */ void percpu_free(void *__pdata) { + if (unlikely(!__pdata)) + return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } diff --git a/mm/bootmem.c b/mm/bootmem.c index d53112fcb40..00a96970b23 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -27,8 +27,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ - static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP /* @@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, if (limit && bdata->node_boot_start >= limit) return NULL; + /* on nodes without memory - bootmem_map is NULL */ + if (!bdata->node_bootmem_map) + return NULL; + end_pfn = bdata->node_low_pfn; limit = PFN_DOWN(limit); if (limit && end_pfn > limit) diff --git a/mm/bounce.c b/mm/bounce.c index e4b62d2a402..643efbe8240 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, if (!bio) return; + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + /* * at least one page was bounced, fill in possible non-highmem * pages @@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) pool = isa_page_pool; } - blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); - /* * slow path */ diff --git a/mm/fadvise.c b/mm/fadvise.c index 168c78a121b..0df4c899e97 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (!file) return -EBADF; - if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { + if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { ret = -ESPIPE; goto out; } diff --git a/mm/filemap.c b/mm/filemap.c index 7b84dc81434..8332c77b1bd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1181,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (pos < size) { retval = generic_file_direct_IO(READ, iocb, iov, pos, nr_segs); - if (retval > 0 && !is_sync_kiocb(iocb)) - retval = -EIOCBQUEUED; if (retval > 0) *ppos = pos + retval; } @@ -1445,7 +1443,6 @@ no_cached_page: * effect. */ error = page_cache_read(file, pgoff); - grab_swap_token(); /* * The page we want has now been added to the page cache. @@ -1893,6 +1890,7 @@ int should_remove_suid(struct dentry *dentry) return 0; } +EXPORT_SYMBOL(should_remove_suid); int __remove_suid(struct dentry *dentry, int kill) { @@ -2047,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. * i_mutex is held, which protects generic_osync_inode() from - * livelocking. + * livelocking. AIO O_DIRECT ops attempt to sync metadata here. */ - if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if ((written >= 0 || written == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (err < 0) written = err; } - if (written == count && !is_sync_kiocb(iocb)) - written = -EIOCBQUEUED; return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -2269,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_dentry); + err = remove_suid(file->f_path.dentry); if (err) goto out; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b4fd0d7c9bf..45b3553865c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -189,7 +189,7 @@ __xip_unmap (struct address_space * mapping, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); - page_remove_rmap(page); + page_remove_rmap(page, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); @@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (count == 0) goto out_backing; - ret = remove_suid(filp->f_dentry); + ret = remove_suid(filp->f_path.dentry); if (ret) goto out_backing; diff --git a/mm/fremap.c b/mm/fremap.c index 7a9d0f5d246..4e3f53dd5fd 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (page) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page); + page_remove_rmap(page, vma); page_cache_release(page); } } else { @@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, { int err = -ENOMEM; pte_t *pte; - pte_t pte_val; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); @@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); - pte_val = *pte; /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a088f593a80..cb362f761f1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr) } static void copy_huge_page(struct page *dst, struct page *src, - unsigned long addr) + unsigned long addr, struct vm_area_struct *vma) { int i; might_sleep(); for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } @@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, for (z = zonelist->zones; *z; z++) { nid = zone_to_nid(*z); - if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && + if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) && !list_empty(&hugepage_freelists[nid])) break; } @@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void) if (nid == MAX_NUMNODES) nid = first_node(node_online_map); if (page) { - page[1].lru.next = (void *)free_huge_page; /* dtor */ + set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; @@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); - add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t pte; struct page *page; struct page *tmp; + /* + * A page gathering list, protected by per file i_mmap_lock. The + * lock is used to avoid list corruption from multiple unmapping + * of the same page since we are using page->lru. + */ LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); @@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(end & ~HPAGE_MASK); spin_lock(&mm->page_table_lock); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + if (huge_pmd_unshare(mm, &address, ptep)) + continue; + pte = huge_ptep_get_and_clear(mm, address, ptep); if (pte_none(pte)) continue; page = pte_page(pte); list_add(&page->lru, &page_list); - add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); } - spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { @@ -441,7 +442,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } spin_unlock(&mm->page_table_lock); - copy_huge_page(new_page, old_page, address); + copy_huge_page(new_page, old_page, address, vma); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -515,7 +516,6 @@ retry: if (!pte_none(*ptep)) goto backout; - add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); @@ -653,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); spin_lock(&mm->page_table_lock); for (; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + if (huge_pmd_unshare(mm, &address, ptep)) + continue; if (!pte_none(*ptep)) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); @@ -666,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); flush_tlb_range(vma, start, end); } diff --git a/mm/memory.c b/mm/memory.c index 156861fcac4..af227d26e10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); file_rss--; } - page_remove_rmap(page); + page_remove_rmap(page, vma); tlb_remove_page(tlb, page); continue; } @@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) { pages[i] = page; - flush_anon_page(page, start); + flush_anon_page(vma, page, start); flush_dcache_page(page); } if (vmas) @@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; spinlock_t *ptl; + int err = 0; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - return -ENOMEM; + return -EAGAIN; arch_enter_lazy_mmu_mode(); do { struct page *page = ZERO_PAGE(addr); pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); + + if (unlikely(!pte_none(*pte))) { + err = -EEXIST; + pte++; + break; + } page_cache_get(page); page_add_file_rmap(page); inc_mm_counter(mm, file_rss); - BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - return 0; + return err; } static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, { pmd_t *pmd; unsigned long next; + int err; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - return -ENOMEM; + return -EAGAIN; do { next = pmd_addr_end(addr, end); - if (zeromap_pte_range(mm, pmd, addr, next, prot)) - return -ENOMEM; + err = zeromap_pte_range(mm, pmd, addr, next, prot); + if (err) + break; } while (pmd++, addr = next, addr != end); - return 0; + return err; } static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, @@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, { pud_t *pud; unsigned long next; + int err; pud = pud_alloc(mm, pgd, addr); if (!pud) - return -ENOMEM; + return -EAGAIN; do { next = pud_addr_end(addr, end); - if (zeromap_pmd_range(mm, pud, addr, next, prot)) - return -ENOMEM; + err = zeromap_pmd_range(mm, pud, addr, next, prot); + if (err) + break; } while (pud++, addr = next, addr != end); - return 0; + return err; } int zeromap_page_range(struct vm_area_struct *vma, @@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* * If the source page was a PFN mapping, we don't have @@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); return; - + } - copy_user_highpage(dst, src, va); + copy_user_highpage(dst, src, va, vma); } /* @@ -1567,7 +1577,7 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, address); + cow_user_page(new_page, old_page, address, vma); } /* @@ -1576,7 +1586,7 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page); + page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1902,7 +1912,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return 0; } -EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ /** * swapin_readahead - swap in pages in hope we need them soon @@ -1991,6 +2000,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { + grab_swap_token(); /* Contend for token _before_ read-in */ swapin_readahead(entry, address, vma); page = read_swap_cache_async(entry, vma, address); if (!page) { @@ -2008,7 +2018,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - grab_swap_token(); } delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -2191,7 +2200,7 @@ retry: page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; - copy_user_highpage(page, new_page, address); + copy_user_highpage(page, new_page, address, vma); page_cache_release(new_page); new_page = page; anon = 1; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd678a662ea..84279127fcd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,12 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) zone_type = zone - pgdat->node_zones; if (!populated_zone(zone)) { int ret = 0; - ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); + ret = init_currently_empty_zone(zone, phys_start_pfn, + nr_pages, MEMMAP_HOTPLUG); if (ret < 0) return ret; } - memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); - zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); + memmap_init_zone(nr_pages, nid, zone_type, + phys_start_pfn, MEMMAP_HOTPLUG); return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 617fb31086e..da946394655 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) enum zone_type k; max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); + max++; /* space for zlcache_ptr (see mmzone.h) */ zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); if (!zl) return NULL; + zl->zlcache_ptr = NULL; num = 0; /* First put in the highest zones from all nodes, then all the next lower zones etc. Avoid empty zones because the memory allocator @@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { struct page *page; - unsigned int nid; + int nid; if (!pte_present(*pte)) continue; @@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { int sz = ksize(old->v.zonelist); - new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); + new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); if (!new->v.zonelist) { kmem_cache_free(policy_cache, new); return ERR_PTR(-ENOMEM); @@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) * Display pages allocated per node and memory policy via /proc. */ -static const char *policy_types[] = { "default", "prefer", "bind", - "interleave" }; +static const char * const policy_types[] = + { "default", "prefer", "bind", "interleave" }; /* * Convert a mempolicy into a string. @@ -1855,7 +1857,7 @@ int show_numa_map(struct seq_file *m, void *v) if (file) { seq_printf(m, " file="); - seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); + seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_printf(m, " heap"); } else if (vma->vm_start <= mm->start_stack && diff --git a/mm/migrate.c b/mm/migrate.c index b4979d423d2..e9b161bde95 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -294,7 +294,7 @@ out: static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { - struct page **radix_pointer; + void **pslot; if (!mapping) { /* Anonymous page */ @@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping, write_lock_irq(&mapping->tree_lock); - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); if (page_count(page) != 2 + !!PagePrivate(page) || - *radix_pointer != page) { + (struct page *)radix_tree_deref_slot(pslot) != page) { write_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, /* * Now we know that no one else is looking at the page. */ - get_page(newpage); + get_page(newpage); /* add cache reference */ #ifdef CONFIG_SWAP if (PageSwapCache(page)) { SetPageSwapCache(newpage); @@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping, } #endif - *radix_pointer = newpage; + radix_tree_replace_slot(pslot, newpage); + + /* + * Drop cache reference from old page. + * We know this isn't the last reference. + */ __put_page(page); + write_unlock_irq(&mapping->tree_lock); return 0; diff --git a/mm/mincore.c b/mm/mincore.c index 72890780c1c..8aca6f7167b 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -1,7 +1,7 @@ /* * linux/mm/mincore.c * - * Copyright (C) 1994-1999 Linus Torvalds + * Copyright (C) 1994-2006 Linus Torvalds */ /* @@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma, return present; } -static long mincore_vma(struct vm_area_struct * vma, - unsigned long start, unsigned long end, unsigned char __user * vec) +/* + * Do a chunk of "sys_mincore()". We've already checked + * all the arguments, we hold the mmap semaphore: we should + * just return the amount of info we're asked for. + */ +static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) { - long error, i, remaining; - unsigned char * tmp; - - error = -ENOMEM; - if (!vma->vm_file) - return error; - - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - if (end > vma->vm_end) - end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + unsigned long i, nr, pgoff; + struct vm_area_struct *vma = find_vma(current->mm, addr); - error = -EAGAIN; - tmp = (unsigned char *) __get_free_page(GFP_KERNEL); - if (!tmp) - return error; + /* + * find_vma() didn't find anything above us, or we're + * in an unmapped hole in the address space: ENOMEM. + */ + if (!vma || addr < vma->vm_start) + return -ENOMEM; - /* (end - start) is # of pages, and also # of bytes in "vec */ - remaining = (end - start), + /* + * Ok, got it. But check whether it's a segment we support + * mincore() on. Right now, we don't do any anonymous mappings. + * + * FIXME: This is just stupid. And returning ENOMEM is + * stupid too. We should just look at the page tables. But + * this is what we've traditionally done, so we'll just + * continue doing it. + */ + if (!vma->vm_file) + return -ENOMEM; - error = 0; - for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { - int j = 0; - long thispiece = (remaining < PAGE_SIZE) ? - remaining : PAGE_SIZE; + /* + * Calculate how many pages there are left in the vma, and + * what the pgoff is for our address. + */ + nr = (vma->vm_end - addr) >> PAGE_SHIFT; + if (nr > pages) + nr = pages; - while (j < thispiece) - tmp[j++] = mincore_page(vma, start++); + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; - if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { - error = -EFAULT; - break; - } - } + /* And then we just fill the sucker in.. */ + for (i = 0 ; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma, pgoff); - free_page((unsigned long) tmp); - return error; + return nr; } /* @@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma, asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char __user * vec) { - int index = 0; - unsigned long end, limit; - struct vm_area_struct * vma; - size_t max; - int unmapped_error = 0; - long error; - - /* check the arguments */ - if (start & ~PAGE_CACHE_MASK) - goto einval; - - limit = TASK_SIZE; - if (start >= limit) - goto enomem; - - if (!len) - return 0; - - max = limit - start; - len = PAGE_CACHE_ALIGN(len); - if (len > max || !len) - goto enomem; + long retval; + unsigned long pages; + unsigned char *tmp; - end = start + len; + /* Check the start address: needs to be page-aligned.. */ + if (start & ~PAGE_CACHE_MASK) + return -EINVAL; - /* check the output buffer whilst holding the lock */ - error = -EFAULT; - down_read(¤t->mm->mmap_sem); + /* ..and we need to be passed a valid user-space range */ + if (!access_ok(VERIFY_READ, (void __user *) start, len)) + return -ENOMEM; - if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT)) - goto out; + /* This also avoids any overflows on PAGE_CACHE_ALIGN */ + pages = len >> PAGE_SHIFT; + pages += (len & ~PAGE_MASK) != 0; - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - */ - error = 0; - - vma = find_vma(current->mm, start); - while (vma) { - /* Here start < vma->vm_end. */ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - } + if (!access_ok(VERIFY_WRITE, vec, pages)) + return -EFAULT; - /* Here vma->vm_start <= start < vma->vm_end. */ - if (end <= vma->vm_end) { - if (start < end) { - error = mincore_vma(vma, start, end, - &vec[index]); - if (error) - goto out; - } - error = unmapped_error; - goto out; + tmp = (void *) __get_free_page(GFP_USER); + if (!tmp) + return -EAGAIN; + + retval = 0; + while (pages) { + /* + * Do at most PAGE_SIZE entries per iteration, due to + * the temporary buffer size. + */ + down_read(¤t->mm->mmap_sem); + retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); + up_read(¤t->mm->mmap_sem); + + if (retval <= 0) + break; + if (copy_to_user(vec, tmp, retval)) { + retval = -EFAULT; + break; } - - /* Here vma->vm_start <= start < vma->vm_end < end. */ - error = mincore_vma(vma, start, vma->vm_end, &vec[index]); - if (error) - goto out; - index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; - start = vma->vm_end; - vma = vma->vm_next; + pages -= retval; + vec += retval; + start += retval << PAGE_SHIFT; + retval = 0; } - - /* we found a hole in the area queried if we arrive here */ - error = -ENOMEM; - -out: - up_read(¤t->mm->mmap_sem); - return error; - -einval: - return -EINVAL; -enomem: - return -ENOMEM; + free_page((unsigned long) tmp); + return retval; } diff --git a/mm/mlock.c b/mm/mlock.c index b90c59573ab..3446b7ef731 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -65,7 +65,7 @@ success: ret = make_pages_present(start, end); } - vma->vm_mm->locked_vm -= pages; + mm->locked_vm -= pages; out: if (ret == -ENOMEM) ret = -EAGAIN; diff --git a/mm/mmap.c b/mm/mmap.c index 7b40abd7cba..9717337293c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file->f_dentry->d_inode->i_writecount); + atomic_inc(&file->f_path.dentry->d_inode->i_writecount); if (vma->vm_flags & VM_SHARED) mapping->i_mmap_writable--; @@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma) struct address_space *mapping = file->f_mapping; if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file->f_dentry->d_inode->i_writecount); + atomic_dec(&file->f_path.dentry->d_inode->i_writecount); if (vma->vm_flags & VM_SHARED) mapping->i_mmap_writable++; @@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, * mounted, in which case we dont add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))) + if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) prot |= PROT_EXEC; if (!len) @@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, return -EAGAIN; } - inode = file ? file->f_dentry->d_inode : NULL; + inode = file ? file->f_path.dentry->d_inode : NULL; if (file) { switch (flags & MAP_TYPE) { @@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; - if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; @@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return -ENOMEM; @@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_start < new_vma->vm_end) *vmap = new_vma; } else { - new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; pol = mpol_copy(vma_policy(vma)); diff --git a/mm/mmzone.c b/mm/mmzone.c index febea1c9816..eb5838634f1 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void) return NODE_DATA(first_online_node); } -EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ - struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) { int nid = next_online_node(pgdat->node_id); @@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) return NULL; return NODE_DATA(nid); } -EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ - /* * next_zone - helper magic for for_each_zone() @@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone) } return zone; } -EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ diff --git a/mm/nommu.c b/mm/nommu.c index 8bdde9508f3..23fb033e596 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -497,15 +497,17 @@ static int validate_mmap_request(struct file *file, (flags & MAP_TYPE) != MAP_SHARED) return -EINVAL; - if (PAGE_ALIGN(len) == 0) - return addr; - - if (len > TASK_SIZE) + if (!len) return -EINVAL; + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); + if (!len || len > TASK_SIZE) + return -ENOMEM; + /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) - return -EINVAL; + return -EOVERFLOW; if (file) { /* validate file mapping requests */ @@ -521,7 +523,7 @@ static int validate_mmap_request(struct file *file, */ mapping = file->f_mapping; if (!mapping) - mapping = file->f_dentry->d_inode->i_mapping; + mapping = file->f_path.dentry->d_inode->i_mapping; capabilities = 0; if (mapping && mapping->backing_dev_info) @@ -530,7 +532,7 @@ static int validate_mmap_request(struct file *file, if (!capabilities) { /* no explicit capabilities set, so assume some * defaults */ - switch (file->f_dentry->d_inode->i_mode & S_IFMT) { + switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { case S_IFREG: case S_IFBLK: capabilities = BDI_CAP_MAP_COPY; @@ -561,11 +563,11 @@ static int validate_mmap_request(struct file *file, !(file->f_mode & FMODE_WRITE)) return -EACCES; - if (IS_APPEND(file->f_dentry->d_inode) && + if (IS_APPEND(file->f_path.dentry->d_inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file->f_dentry->d_inode)) + if (locks_verify_locked(file->f_path.dentry->d_inode)) return -EAGAIN; if (!(capabilities & BDI_CAP_MAP_DIRECT)) @@ -596,7 +598,7 @@ static int validate_mmap_request(struct file *file, /* handle executable mappings and implied executable * mappings */ - if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (prot & PROT_EXEC) return -EPERM; } @@ -806,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file, vm_flags = determine_vm_flags(file, prot, flags, capabilities); /* we're going to need to record the mapping if it works */ - vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL); + vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); if (!vml) goto error_getting_vml; - memset(vml, 0, sizeof(*vml)); down_write(&nommu_vma_sem); @@ -832,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file, continue; /* search for overlapping mappings on the same file */ - if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode) + if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) continue; if (vma->vm_pgoff >= pgoff + pglen) @@ -885,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file, } /* we're going to need a VMA struct as well */ - vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); if (!vma) goto error_getting_vma; - memset(vma, 0, sizeof(*vma)); INIT_LIST_HEAD(&vma->anon_vma_node); atomic_set(&vma->vm_usage, 1); if (file) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2e3ce3a928b..b278b8d60ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) } /* - * swapoff can easily use up all memory, so kill those first. - */ - if (p->flags & PF_SWAPOFF) - return ULONG_MAX; - - /* * The memory size of the process is the basis for the badness. */ points = mm->total_vm; @@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); /* + * swapoff can easily use up all memory, so kill those first. + */ + if (p->flags & PF_SWAPOFF) + return ULONG_MAX; + + /* * Processes which fork a lot of child processes are likely * a good choice. We add half the vmsize of the children if they * have an own mm. This prevents forking servers to flood the @@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) { #ifdef CONFIG_NUMA struct zone **z; - nodemask_t nodes = node_online_map; + nodemask_t nodes; + int node; + /* node has memory ? */ + for_each_online_node(node) + if (NODE_DATA(node)->node_present_pages) + node_set(node, nodes); for (z = zonelist->zones; *z; z++) - if (cpuset_zone_allowed(*z, gfp_mask)) + if (cpuset_zone_allowed_softwall(*z, gfp_mask)) node_clear(zone_to_nid(*z), nodes); else return CONSTRAINT_CPUSET; @@ -264,7 +269,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO * set. */ -static void __oom_kill_task(struct task_struct *p, const char *message) +static void __oom_kill_task(struct task_struct *p, int verbose) { if (is_init(p)) { WARN_ON(1); @@ -278,10 +283,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message) return; } - if (message) { - printk(KERN_ERR "%s: Killed process %d (%s).\n", - message, p->pid, p->comm); - } + if (verbose) + printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm); /* * We give our sacrificial lamb high priority and access to @@ -294,7 +297,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message) force_sig(SIGKILL, p); } -static int oom_kill_task(struct task_struct *p, const char *message) +static int oom_kill_task(struct task_struct *p) { struct mm_struct *mm; struct task_struct *g, *q; @@ -313,15 +316,25 @@ static int oom_kill_task(struct task_struct *p, const char *message) if (mm == NULL) return 1; - __oom_kill_task(p, message); + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && p->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + + __oom_kill_task(p, 1); + /* * kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group + * but are in a different thread group. Don't let them have access + * to memory reserves though, otherwise we might deplete all memory. */ - do_each_thread(g, q) + do_each_thread(g, q) { if (q->mm == mm && q->tgid != p->tgid) - __oom_kill_task(q, message); - while_each_thread(g, q); + force_sig(SIGKILL, p); + } while_each_thread(g, q); return 0; } @@ -337,21 +350,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, * its children or threads, just set TIF_MEMDIE so it can die quickly */ if (p->flags & PF_EXITING) { - __oom_kill_task(p, NULL); + __oom_kill_task(p, 0); return 0; } - printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" - " and children.\n", p->pid, p->comm, points); + printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", + message, p->pid, p->comm, points); + /* Try to kill a child first */ list_for_each(tsk, &p->children) { c = list_entry(tsk, struct task_struct, sibling); if (c->mm == p->mm) continue; - if (!oom_kill_task(c, message)) + if (!oom_kill_task(c)) return 0; } - return oom_kill_task(p, message); + return oom_kill_task(p); } static BLOCKING_NOTIFIER_HEAD(oom_notify_list); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8d9b19f239c..1d2fc89ca56 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -21,6 +21,7 @@ #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> +#include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/mpage.h> #include <linux/rmap.h> @@ -761,23 +762,24 @@ int __set_page_dirty_nobuffers(struct page *page) struct address_space *mapping = page_mapping(page); struct address_space *mapping2; - if (mapping) { - write_lock_irq(&mapping->tree_lock); - mapping2 = page_mapping(page); - if (mapping2) { /* Race with truncate? */ - BUG_ON(mapping2 != mapping); - if (mapping_cap_account_dirty(mapping)) - __inc_zone_page_state(page, - NR_FILE_DIRTY); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - write_unlock_irq(&mapping->tree_lock); - if (mapping->host) { - /* !PageAnon && !swapper_space */ - __mark_inode_dirty(mapping->host, - I_DIRTY_PAGES); + if (!mapping) + return 1; + + write_lock_irq(&mapping->tree_lock); + mapping2 = page_mapping(page); + if (mapping2) { /* Race with truncate? */ + BUG_ON(mapping2 != mapping); + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + task_io_account_write(PAGE_CACHE_SIZE); } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&mapping->tree_lock); + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } @@ -843,39 +845,6 @@ int set_page_dirty_lock(struct page *page) EXPORT_SYMBOL(set_page_dirty_lock); /* - * Clear a page's dirty flag, while caring for dirty memory accounting. - * Returns true if the page was previously dirty. - */ -int test_clear_page_dirty(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - unsigned long flags; - - if (mapping) { - write_lock_irqsave(&mapping->tree_lock, flags); - if (TestClearPageDirty(page)) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); - /* - * We can continue to use `mapping' here because the - * page is locked, which pins the address_space - */ - if (mapping_cap_account_dirty(mapping)) { - page_mkclean(page); - dec_zone_page_state(page, NR_FILE_DIRTY); - } - return 1; - } - write_unlock_irqrestore(&mapping->tree_lock, flags); - return 0; - } - return TestClearPageDirty(page); -} -EXPORT_SYMBOL(test_clear_page_dirty); - -/* * Clear a page's dirty flag, while caring for dirty memory accounting. * Returns true if the page was previously dirty. * @@ -893,12 +862,41 @@ int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); - if (mapping) { + if (mapping && mapping_cap_account_dirty(mapping)) { + /* + * Yes, Virginia, this is indeed insane. + * + * We use this sequence to make sure that + * (a) we account for dirty stats properly + * (b) we tell the low-level filesystem to + * mark the whole page dirty if it was + * dirty in a pagetable. Only to then + * (c) clean the page again and return 1 to + * cause the writeback. + * + * This way we avoid all nasty races with the + * dirty bit in multiple places and clearing + * them concurrently from different threads. + * + * Note! Normally the "set_page_dirty(page)" + * has no effect on the actual dirty bit - since + * that will already usually be set. But we + * need the side effects, and it can help us + * avoid races. + * + * We basically use the page "master dirty bit" + * as a serialization point for all the different + * threads doing their things. + * + * FIXME! We still have a race here: if somebody + * adds the page back to the page tables in + * between the "page_mkclean()" and the "TestClearPageDirty()", + * we might have it mapped without the dirty bit set. + */ + if (page_mkclean(page)) + set_page_dirty(page); if (TestClearPageDirty(page)) { - if (mapping_cap_account_dirty(mapping)) { - page_mkclean(page); - dec_zone_page_state(page, NR_FILE_DIRTY); - } + dec_zone_page_state(page, NR_FILE_DIRTY); return 1; } return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa6fcc7ca66..fc5b5442e94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -40,6 +40,7 @@ #include <linux/sort.h> #include <linux/pfn.h> #include <linux/backing-dev.h> +#include <linux/fault-inject.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -83,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { EXPORT_SYMBOL(totalram_pages); -/* - * Used by page_zone() to look up the address of the struct zone whose - * id is encoded in the upper bits of page->flags - */ -struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; -EXPORT_SYMBOL(zone_table); - -static char *zone_names[MAX_NR_ZONES] = { +static char * const zone_names[MAX_NR_ZONES] = { "DMA", #ifdef CONFIG_ZONE_DMA32 "DMA32", @@ -237,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - page[1].lru.next = (void *)free_compound_page; /* set dtor */ + set_compound_page_dtor(page, free_compound_page); page[1].lru.prev = (void *)order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; @@ -486,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order) spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __free_one_page(page, zone ,order); + __free_one_page(page, zone, order); spin_unlock(&zone->lock); } @@ -605,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); set_page_refcounted(page); + + arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); if (gfp_flags & __GFP_ZERO) @@ -690,9 +686,15 @@ void drain_node_pages(int nodeid) pcp = &pset->pcp[i]; if (pcp->count) { + int to_drain; + local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; + if (pcp->count >= pcp->batch) + to_drain = pcp->batch; + else + to_drain = pcp->count; + free_pages_bulk(zone, to_drain, &pcp->list, 0); + pcp->count -= to_drain; local_irq_restore(flags); } } @@ -700,7 +702,6 @@ void drain_node_pages(int nodeid) } #endif -#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { unsigned long flags; @@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset; + if (!populated_zone(zone)) + continue; + pset = zone_pcp(zone, cpu); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -722,7 +726,6 @@ static void __drain_pages(unsigned int cpu) } } } -#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_PM @@ -893,6 +896,91 @@ failed: #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#ifdef CONFIG_FAIL_PAGE_ALLOC + +static struct fail_page_alloc_attr { + struct fault_attr attr; + + u32 ignore_gfp_highmem; + u32 ignore_gfp_wait; + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + + struct dentry *ignore_gfp_highmem_file; + struct dentry *ignore_gfp_wait_file; + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_wait = 1, + .ignore_gfp_highmem = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ + return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + if (gfp_mask & __GFP_NOFAIL) + return 0; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return 0; + if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + return 0; + + return should_fail(&fail_page_alloc.attr, 1 << order); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + int err; + + err = init_fault_attr_dentries(&fail_page_alloc.attr, + "fail_page_alloc"); + if (err) + return err; + dir = fail_page_alloc.attr.dentries.dir; + + fail_page_alloc.ignore_gfp_wait_file = + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_wait); + + fail_page_alloc.ignore_gfp_highmem_file = + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + + if (!fail_page_alloc.ignore_gfp_wait_file || + !fail_page_alloc.ignore_gfp_highmem_file) { + err = -ENOMEM; + debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); + debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); + cleanup_fault_attr_dentries(&fail_page_alloc.attr); + } + + return err; +} + +late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAIL_PAGE_ALLOC */ + +static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return 0; +} + +#endif /* CONFIG_FAIL_PAGE_ALLOC */ + /* * Return 1 if free pages are above 'mark'. This takes into account the order * of the allocation. @@ -925,31 +1013,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, return 1; } +#ifdef CONFIG_NUMA +/* + * zlc_setup - Setup for "zonelist cache". Uses cached zone data to + * skip over zones that are not allowed by the cpuset, or that have + * been recently (in last second) found to be nearly full. See further + * comments in mmzone.h. Reduces cache footprint of zonelist scans + * that have to skip over alot of full or unallowed zones. + * + * If the zonelist cache is present in the passed in zonelist, then + * returns a pointer to the allowed node mask (either the current + * tasks mems_allowed, or node_online_map.) + * + * If the zonelist cache is not available for this zonelist, does + * nothing and returns NULL. + * + * If the fullzones BITMAP in the zonelist cache is stale (more than + * a second since last zap'd) then we zap it out (clear its bits.) + * + * We hold off even calling zlc_setup, until after we've checked the + * first zone in the zonelist, on the theory that most allocations will + * be satisfied from that first zone, so best to examine that zone as + * quickly as we can. + */ +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + nodemask_t *allowednodes; /* zonelist_cache approximation */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return NULL; + + if (jiffies - zlc->last_full_zap > 1 * HZ) { + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + zlc->last_full_zap = jiffies; + } + + allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? + &cpuset_current_mems_allowed : + &node_online_map; + return allowednodes; +} + +/* + * Given 'z' scanning a zonelist, run a couple of quick checks to see + * if it is worth looking at further for free memory: + * 1) Check that the zone isn't thought to be full (doesn't have its + * bit set in the zonelist_cache fullzones BITMAP). + * 2) Check that the zones node (obtained from the zonelist_cache + * z_to_n[] mapping) is allowed in the passed in allowednodes mask. + * Return true (non-zero) if zone is worth looking at further, or + * else return false (zero) if it is not. + * + * This check -ignores- the distinction between various watermarks, + * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is + * found to be full for any variation of these watermarks, it will + * be considered full for up to one second by all requests, unless + * we are so low on memory on all allowed nodes that we are forced + * into the second scan of the zonelist. + * + * In the second scan we ignore this zonelist cache and exactly + * apply the watermarks to all zones, even it is slower to do so. + * We are low on memory in the second scan, and should leave no stone + * unturned looking for a free page. + */ +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, + nodemask_t *allowednodes) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + int n; /* node that zone *z is on */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return 1; + + i = z - zonelist->zones; + n = zlc->z_to_n[i]; + + /* This zone is worth trying if it is allowed but not full */ + return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); +} + +/* + * Given 'z' scanning a zonelist, set the corresponding bit in + * zlc->fullzones, so that subsequent attempts to allocate a page + * from that zone don't waste time re-examining it. + */ +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + i = z - zonelist->zones; + + set_bit(i, zlc->fullzones); +} + +#else /* CONFIG_NUMA */ + +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + return NULL; +} + +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, + nodemask_t *allowednodes) +{ + return 1; +} + +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +{ +} +#endif /* CONFIG_NUMA */ + /* - * get_page_from_freeliest goes through the zonelist trying to allocate + * get_page_from_freelist goes through the zonelist trying to allocate * a page. */ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, int alloc_flags) { - struct zone **z = zonelist->zones; + struct zone **z; struct page *page = NULL; - int classzone_idx = zone_idx(*z); + int classzone_idx = zone_idx(zonelist->zones[0]); struct zone *zone; + nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ + int zlc_active = 0; /* set if using zonelist_cache */ + int did_zlc_setup = 0; /* just call zlc_setup() one time */ +zonelist_scan: /* - * Go through the zonelist once, looking for a zone with enough free. + * Scan zonelist, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ + z = zonelist->zones; + do { + if (NUMA_BUILD && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; zone = *z; if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) break; if ((alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) - continue; + !cpuset_zone_allowed_softwall(zone, gfp_mask)) + goto try_next_zone; if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; @@ -959,18 +1176,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, mark = zone->pages_low; else mark = zone->pages_high; - if (!zone_watermark_ok(zone , order, mark, - classzone_idx, alloc_flags)) + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { if (!zone_reclaim_mode || !zone_reclaim(zone, gfp_mask, order)) - continue; + goto this_zone_full; + } } page = buffered_rmqueue(zonelist, zone, order, gfp_mask); - if (page) { + if (page) break; +this_zone_full: + if (NUMA_BUILD) + zlc_mark_zone_full(zonelist, z); +try_next_zone: + if (NUMA_BUILD && !did_zlc_setup) { + /* we do zlc_setup after the first zone is tried */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; } } while (*(++z) != NULL); + + if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + goto zonelist_scan; + } return page; } @@ -992,6 +1225,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, might_sleep_if(wait); + if (should_fail_alloc_page(gfp_mask, order)) + return NULL; + restart: z = zonelist->zones; /* the list of zones suitable for gfp_mask */ @@ -1005,9 +1241,19 @@ restart: if (page) goto got_pg; - do { + /* + * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and + * __GFP_NOWARN set) should not cause reclaim since the subsystem + * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim + * using a larger set of nodes after it has established that the + * allowed per node queues are empty and that nodes are + * over allocated. + */ + if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + goto nopage; + + for (z = zonelist->zones; *z; z++) wakeup_kswapd(*z, order); - } while (*(++z)); /* * OK, we're below the kswapd watermark and have kicked background @@ -1041,6 +1287,7 @@ restart: /* This allocation should allow future memory freeing. */ +rebalance: if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { if (!(gfp_mask & __GFP_NOMEMALLOC)) { @@ -1062,7 +1309,6 @@ nofail_alloc: if (!wait) goto nopage; -rebalance: cond_resched(); /* We now go into synchronous reclaim */ @@ -1262,7 +1508,7 @@ unsigned int nr_free_pagecache_pages(void) static inline void show_node(struct zone *zone) { if (NUMA_BUILD) - printk("Node %ld ", zone_to_nid(zone)); + printk("Node %d ", zone_to_nid(zone)); } void si_meminfo(struct sysinfo *val) @@ -1542,6 +1788,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat) } } +/* Construct the zonelist performance cache - see further mmzone.h */ +static void __meminit build_zonelist_cache(pg_data_t *pgdat) +{ + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zonelist *zonelist; + struct zonelist_cache *zlc; + struct zone **z; + + zonelist = pgdat->node_zonelists + i; + zonelist->zlcache_ptr = zlc = &zonelist->zlcache; + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + for (z = zonelist->zones; *z; z++) + zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); + } +} + #else /* CONFIG_NUMA */ static void __meminit build_zonelists(pg_data_t *pgdat) @@ -1579,14 +1843,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat) } } +/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ +static void __meminit build_zonelist_cache(pg_data_t *pgdat) +{ + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + pgdat->node_zonelists[i].zlcache_ptr = NULL; +} + #endif /* CONFIG_NUMA */ /* return values int ....just for stop_machine_run() */ static int __meminit __build_all_zonelists(void *dummy) { int nid; - for_each_online_node(nid) + + for_each_online_node(nid) { build_zonelists(NODE_DATA(nid)); + build_zonelist_cache(NODE_DATA(nid)); + } return 0; } @@ -1680,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn) + unsigned long start_pfn, enum memmap_context context) { struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; + /* + * There can be holes in boot-time mem_map[]s + * handed to this function. They do not + * exist on hotplugged memory. + */ + if (context == MEMMAP_EARLY) { + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + } page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); init_page_count(page); @@ -1715,23 +1998,9 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, } } -#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) -void zonetable_add(struct zone *zone, int nid, enum zone_type zid, - unsigned long pfn, unsigned long size) -{ - unsigned long snum = pfn_to_section_nr(pfn); - unsigned long end = pfn_to_section_nr(pfn + size); - - if (FLAGS_HAS_NODE) - zone_table[ZONETABLE_INDEX(nid, zid)] = zone; - else - for (; snum <= end; snum++) - zone_table[ZONETABLE_INDEX(snum, zid)] = zone; -} - #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn)) + memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif static int __cpuinit zone_batchsize(struct zone *zone) @@ -1881,16 +2150,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, int ret = NOTIFY_OK; switch (action) { - case CPU_UP_PREPARE: - if (process_zones(cpu)) - ret = NOTIFY_BAD; - break; - case CPU_UP_CANCELED: - case CPU_DEAD: - free_zone_pagesets(cpu); - break; - default: - break; + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; + case CPU_UP_CANCELED: + case CPU_DEAD: + free_zone_pagesets(cpu); + break; + default: + break; } return ret; } @@ -1977,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone) __meminit int init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size) + unsigned long size, + enum memmap_context context) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -2421,8 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - zonetable_add(zone, nid, j, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, + size, MEMMAP_EARLY); BUG_ON(ret); zone_start_pfn += size; } @@ -2736,7 +3006,6 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#ifdef CONFIG_HOTPLUG_CPU static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -2751,7 +3020,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ void __init page_alloc_init(void) { @@ -3055,7 +3323,7 @@ void *__init alloc_large_system_hash(const char *tablename, /* allow the kernel cmdline to have a say */ if (!numentries) { /* round applicable memory size up to nearest megabyte */ - numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; + numentries = nr_kernel_pages; numentries += (1UL << (20 - PAGE_SHIFT)) - 1; numentries >>= 20 - PAGE_SHIFT; numentries <<= 20 - PAGE_SHIFT; @@ -3065,6 +3333,10 @@ void *__init alloc_large_system_hash(const char *tablename, numentries >>= (scale - PAGE_SHIFT); else numentries <<= (PAGE_SHIFT - scale); + + /* Make sure we've got at least a 0-order allocation.. */ + if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); @@ -3077,7 +3349,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (numentries > max) numentries = max; - log2qty = long_log2(numentries); + log2qty = ilog2(numentries); do { size = bucketsize << log2qty; @@ -3099,7 +3371,7 @@ void *__init alloc_large_system_hash(const char *tablename, printk("%s hash table entries: %d (order: %d, %lu bytes)\n", tablename, (1U << log2qty), - long_log2(size) - PAGE_SHIFT, + ilog2(size) - PAGE_SHIFT, size); if (_hash_shift) diff --git a/mm/page_io.c b/mm/page_io.c index d4840ecbf8f..dbffec0d78c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page) out: return ret; } - -#ifdef CONFIG_SOFTWARE_SUSPEND -/* - * A scruffy utility function to read or write an arbitrary swap page - * and wait on the I/O. The caller must have a ref on the page. - * - * We use end_swap_bio_read() even for writes, because it happens to do what - * we want. - */ -int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, - struct bio **bio_chain) -{ - struct bio *bio; - int ret = 0; - int bio_rw; - - lock_page(page); - - bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read); - if (bio == NULL) { - unlock_page(page); - ret = -ENOMEM; - goto out; - } - - bio_rw = rw; - if (!bio_chain) - bio_rw |= (1 << BIO_RW_SYNC); - if (bio_chain) - bio_get(bio); - submit_bio(bio_rw, bio); - if (bio_chain == NULL) { - wait_on_page_locked(page); - - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - } - if (bio_chain) { - bio->bi_private = *bio_chain; - *bio_chain = bio; - } -out: - return ret; -} -#endif diff --git a/mm/pdflush.c b/mm/pdflush.c index b02102feeb4..8ce0900dc95 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -21,6 +21,7 @@ #include <linux/writeback.h> // Prototypes pdflush_operation() #include <linux/kthread.h> #include <linux/cpuset.h> +#include <linux/freezer.h> /* diff --git a/mm/readahead.c b/mm/readahead.c index 23cb61a01c6..0f539e8e827 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) @@ -148,15 +149,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, if (!pagevec_add(&lru_pvec, page)) __pagevec_lru_add(&lru_pvec); if (ret) { - while (!list_empty(pages)) { - struct page *victim; - - victim = list_to_page(pages); - list_del(&victim->lru); - page_cache_release(victim); - } + put_pages_list(pages); break; } + task_io_account_read(PAGE_CACHE_SIZE); } pagevec_lru_add(&lru_pvec); return ret; @@ -456,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, * * Note that @filp is purely used for passing on to the ->readpage[s]() * handler: it may refer to a different file from @mapping (so we may not use - * @filp->f_mapping or @filp->f_dentry->d_inode here). + * @filp->f_mapping or @filp->f_path.dentry->d_inode here). * Also, @ra may not be equal to &@filp->f_ra. * */ diff --git a/mm/rmap.c b/mm/rmap.c index d8a842a586d..669acb22b57 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -47,6 +47,7 @@ #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/module.h> +#include <linux/kallsyms.h> #include <asm/tlbflush.h> @@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; unsigned long address; - pte_t *pte, entry; + pte_t *pte; spinlock_t *ptl; int ret = 0; @@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) if (!pte) goto out; - if (!pte_dirty(*pte) && !pte_write(*pte)) - goto unlock; + if (pte_dirty(*pte) || pte_write(*pte)) { + pte_t entry; - entry = ptep_get_and_clear(mm, address, pte); - entry = pte_mkclean(entry); - entry = pte_wrprotect(entry); - ptep_establish(vma, address, pte, entry); - lazy_mmu_prot_update(entry); - ret = 1; + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush(vma, address, pte); + entry = pte_wrprotect(entry); + entry = pte_mkclean(entry); + set_pte_at(mm, address, pte, entry); + lazy_mmu_prot_update(entry); + ret = 1; + } -unlock: pte_unmap_unlock(pte, ptl); out: return ret; @@ -489,6 +491,8 @@ int page_mkclean(struct page *page) if (mapping) ret = page_mkclean_file(mapping, page); } + if (page_test_and_clear_dirty(page)) + ret = 1; return ret; } @@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page) * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page) +void page_remove_rmap(struct page *page, struct vm_area_struct *vma) { if (atomic_add_negative(-1, &page->_mapcount)) { if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); + printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); + print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); + if (vma->vm_ops) + print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); + if (vma->vm_file && vma->vm_file->f_op) + print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); BUG(); } @@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, dec_mm_counter(mm, file_rss); - page_remove_rmap(page); + page_remove_rmap(page, vma); page_cache_release(page); out_unmap: @@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor, if (pte_dirty(pteval)) set_page_dirty(page); - page_remove_rmap(page); + page_remove_rmap(page, vma); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff --git a/mm/shmem.c b/mm/shmem.c index 4959535fc14..70da7a0981b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) static struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; -static struct file_operations shmem_file_operations; +static const struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; static struct inode_operations shmem_special_inode_operations; @@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) size = SHMEM_NR_DIRECT; nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); } - if (!topdir) + + /* + * If there are no indirect blocks or we are punching a hole + * below indirect blocks, nothing to be done. + */ + if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT))) goto done2; BUG_ON(limit <= SHMEM_NR_DIRECT); @@ -1225,7 +1230,7 @@ failed: struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct page *page = NULL; unsigned long idx; int error; @@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma, unsigned long addr, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct mm_struct *mm = vma->vm_mm; enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; unsigned long size; @@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma, #ifdef CONFIG_NUMA int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { - struct inode *i = vma->vm_file->f_dentry->d_inode; + struct inode *i = vma->vm_file->f_path.dentry->d_inode; return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); } struct mempolicy * shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *i = vma->vm_file->f_dentry->d_inode; + struct inode *i = vma->vm_file->f_path.dentry->d_inode; unsigned long idx; idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) int shmem_lock(struct file *file, int lock, struct user_struct *user) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; @@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig static ssize_t shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; loff_t pos; unsigned long written; ssize_t err; @@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t if (err || !count) goto out; - err = remove_suid(file->f_dentry); + err = remove_suid(file->f_path.dentry); if (err) goto out; @@ -1524,7 +1529,7 @@ out: static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; unsigned long index, offset; @@ -1943,7 +1948,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name, return security_inode_setsecurity(inode, name, value, size, flags); } -struct xattr_handler shmem_xattr_security_handler = { +static struct xattr_handler shmem_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = shmem_xattr_security_list, .get = shmem_xattr_security_get, @@ -2263,7 +2268,7 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *p; - p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); + p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); if (!p) return NULL; return &p->vfs_inode; @@ -2319,7 +2324,7 @@ static const struct address_space_operations shmem_aops = { .migratepage = migrate_page, }; -static struct file_operations shmem_file_operations = { +static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, @@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ - file->f_vfsmnt = mntget(shm_mnt); - file->f_dentry = dentry; + file->f_path.mnt = mntget(shm_mnt); + file->f_path.dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = &shmem_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; diff --git a/mm/slab.c b/mm/slab.c index 3c4a7e34edd..c6100628a6e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -103,12 +103,14 @@ #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <linux/nodemask.h> #include <linux/mempolicy.h> #include <linux/mutex.h> +#include <linux/fault-inject.h> #include <linux/rtmutex.h> +#include <linux/reciprocal_div.h> -#include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/page.h> @@ -313,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache, static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); static int enable_cpucache(struct kmem_cache *cachep); -static void cache_reap(void *unused); +static void cache_reap(struct work_struct *unused); /* * This function must be completely optimized away if a constant is passed to @@ -385,6 +387,7 @@ struct kmem_cache { unsigned int shared; unsigned int buffer_size; + u32 reciprocal_buffer_size; /* 3) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; @@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, return slab->s_mem + cache->buffer_size * idx; } -static inline unsigned int obj_to_index(struct kmem_cache *cache, - struct slab *slab, void *obj) +/* + * We want to avoid an expensive divide : (offset / cache->buffer_size) + * Using the fact that buffer_size is a constant for a particular cache, + * we can replace (offset / cache->buffer_size) by + * reciprocal_divide(offset, cache->reciprocal_buffer_size) + */ +static inline unsigned int obj_to_index(const struct kmem_cache *cache, + const struct slab *slab, void *obj) { - return (unsigned)(obj - slab->s_mem) / cache->buffer_size; + u32 offset = (obj - slab->s_mem); + return reciprocal_divide(offset, cache->reciprocal_buffer_size); } /* @@ -730,7 +740,10 @@ static inline void init_lock_keys(void) } #endif -/* Guard access to the cache-chain. */ +/* + * 1. Guard access to the cache-chain. + * 2. Protect sanity of cpu_online_map against cpu hotplug events + */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -753,7 +766,7 @@ int slab_is_available(void) return g_cpucache_up == FULL; } -static DEFINE_PER_CPU(struct work_struct, reap_work); +static DEFINE_PER_CPU(struct delayed_work, reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { @@ -866,6 +879,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, dump_stack(); } +/* + * By default on NUMA we use alien caches to stage the freeing of + * objects allocated from other nodes. This causes massive memory + * inefficiencies when using fake NUMA setup to split memory into a + * large number of small nodes, so it can be disabled on the command + * line + */ + +static int use_alien_caches __read_mostly = 1; +static int __init noaliencache_setup(char *s) +{ + use_alien_caches = 0; + return 1; +} +__setup("noaliencache", noaliencache_setup); + #ifdef CONFIG_NUMA /* * Special reaping functions for NUMA systems called from cache_reap(). @@ -916,17 +945,18 @@ static void next_reap_node(void) */ static void __devinit start_cpu_timer(int cpu) { - struct work_struct *reap_work = &per_cpu(reap_work, cpu); + struct delayed_work *reap_work = &per_cpu(reap_work, cpu); /* * When this gets called from do_initcalls via cpucache_init(), * init_workqueues() has already run, so keventd will be setup * at that time. */ - if (keventd_up() && reap_work->func == NULL) { + if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_WORK(reap_work, cache_reap, NULL); - schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); + INIT_DELAYED_WORK(reap_work, cache_reap); + schedule_delayed_work_on(cpu, reap_work, + __round_jiffies_relative(HZ, cpu)); } } @@ -996,7 +1026,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep, return NULL; } -static inline void *__cache_alloc_node(struct kmem_cache *cachep, +static inline void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { return NULL; @@ -1004,7 +1034,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep, #else /* CONFIG_NUMA */ -static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache **alloc_alien_cache(int node, int limit) @@ -1114,7 +1144,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(slabp->nodeid == node)) + if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) return 0; l3 = cachep->nodelists[node]; @@ -1192,7 +1222,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; - struct array_cache **alien; + struct array_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount); @@ -1204,9 +1234,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, if (!shared) goto bad; - alien = alloc_alien_cache(node, cachep->limit); - if (!alien) - goto bad; + if (use_alien_caches) { + alien = alloc_alien_cache(node, cachep->limit); + if (!alien) + goto bad; + } cachep->array[cpu] = nc; l3 = cachep->nodelists[node]; BUG_ON(!l3); @@ -1230,12 +1262,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, kfree(shared); free_alien_cache(alien); } - mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: + mutex_unlock(&cache_chain_mutex); start_cpu_timer(cpu); break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DOWN_PREPARE: + mutex_lock(&cache_chain_mutex); + break; + case CPU_DOWN_FAILED: + mutex_unlock(&cache_chain_mutex); + break; case CPU_DEAD: /* * Even if all the cpus of a node are down, we don't free the @@ -1246,8 +1284,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * gets destroyed at kmem_cache_destroy(). */ /* fall thru */ +#endif case CPU_UP_CANCELED: - mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1308,11 +1346,9 @@ free_array_cache: } mutex_unlock(&cache_chain_mutex); break; -#endif } return NOTIFY_OK; bad: - mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1400,6 +1436,8 @@ void __init kmem_cache_init(void) cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); + cache_cache.reciprocal_buffer_size = + reciprocal_value(cache_cache.buffer_size); for (order = 0; order < MAX_ORDER; order++) { cache_estimate(order, cache_cache.buffer_size, @@ -1580,12 +1618,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) flags |= __GFP_COMP; #endif - /* - * Under NUMA we want memory on the indicated node. We will handle - * the needed fallback ourselves since we want to serve from our - * per node object lists first for other nodes. - */ - flags |= cachep->gfpflags | GFP_THISNODE; + flags |= cachep->gfpflags; page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) @@ -2098,15 +2131,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, } /* - * Prevent CPUs from coming and going. - * lock_cpu_hotplug() nests outside cache_chain_mutex + * We use cache_chain_mutex to ensure a consistent view of + * cpu_online_map as well. Please see cpuup_callback */ - lock_cpu_hotplug(); - mutex_lock(&cache_chain_mutex); list_for_each_entry(pc, &cache_chain, next) { - mm_segment_t old_fs = get_fs(); char tmp; int res; @@ -2115,9 +2145,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * destroy its slab cache and no-one else reuses the vmalloc * area of the module. Print a warning. */ - set_fs(KERNEL_DS); - res = __get_user(tmp, pc->name); - set_fs(old_fs); + res = probe_kernel_address(pc->name, tmp); if (res) { printk("SLAB: cache with size %d has lost its name\n", pc->buffer_size); @@ -2197,25 +2225,24 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) ralign = BYTES_PER_WORD; - /* 2) arch mandated alignment: disables debug if necessary */ + /* 2) arch mandated alignment */ if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; - if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } - /* 3) caller mandated alignment: disables debug if necessary */ + /* 3) caller mandated alignment */ if (ralign < align) { ralign = align; - if (ralign > BYTES_PER_WORD) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } + /* disable debug if necessary */ + if (ralign > BYTES_PER_WORD) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. */ align = ralign; /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); if (!cachep) goto oops; @@ -2297,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & SLAB_CACHE_DMA) cachep->gfpflags |= GFP_DMA; cachep->buffer_size = size; + cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); @@ -2326,7 +2354,6 @@ oops: panic("kmem_cache_create(): failed to create slab `%s'\n", name); mutex_unlock(&cache_chain_mutex); - unlock_cpu_hotplug(); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2444,6 +2471,7 @@ out: return nr_freed; } +/* Called with cache_chain_mutex held to protect against cpu hotplug */ static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; @@ -2474,9 +2502,13 @@ static int __cache_shrink(struct kmem_cache *cachep) */ int kmem_cache_shrink(struct kmem_cache *cachep) { + int ret; BUG_ON(!cachep || in_interrupt()); - return __cache_shrink(cachep); + mutex_lock(&cache_chain_mutex); + ret = __cache_shrink(cachep); + mutex_unlock(&cache_chain_mutex); + return ret; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -2500,23 +2532,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep) { BUG_ON(!cachep || in_interrupt()); - /* Don't let CPUs to come and go */ - lock_cpu_hotplug(); - /* Find the cache in the chain of caches. */ mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed */ list_del(&cachep->next); - mutex_unlock(&cache_chain_mutex); - if (__cache_shrink(cachep)) { slab_error(cachep, "Can't free all objects"); - mutex_lock(&cache_chain_mutex); list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); - unlock_cpu_hotplug(); return; } @@ -2524,7 +2549,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) synchronize_rcu(); __kmem_cache_destroy(cachep); - unlock_cpu_hotplug(); + mutex_unlock(&cache_chain_mutex); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2548,7 +2573,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, - local_flags, nodeid); + local_flags & ~GFP_THISNODE, nodeid); if (!slabp) return NULL; } else { @@ -2618,7 +2643,7 @@ static void cache_init_objs(struct kmem_cache *cachep, static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { - if (flags & SLAB_DMA) + if (flags & GFP_DMA) BUG_ON(!(cachep->gfpflags & GFP_DMA)); else BUG_ON(cachep->gfpflags & GFP_DMA); @@ -2689,10 +2714,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static int cache_grow(struct kmem_cache *cachep, + gfp_t flags, int nodeid, void *objp) { struct slab *slabp; - void *objp; size_t offset; gfp_t local_flags; unsigned long ctor_flags; @@ -2702,12 +2727,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); - if (flags & SLAB_NO_GROW) + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); + if (flags & __GFP_NO_GROW) return 0; ctor_flags = SLAB_CTOR_CONSTRUCTOR; - local_flags = (flags & SLAB_LEVEL_MASK); + local_flags = (flags & GFP_LEVEL_MASK); if (!(local_flags & __GFP_WAIT)) /* * Not allowed to sleep. Need to tell a constructor about @@ -2744,12 +2769,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - objp = kmem_getpages(cachep, flags, nodeid); + if (!objp) + objp = kmem_getpages(cachep, flags, nodeid); if (!objp) goto failed; /* Get slab management. */ - slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); + slabp = alloc_slabmgmt(cachep, objp, offset, + local_flags & ~GFP_THISNODE, nodeid); if (!slabp) goto opps1; @@ -2987,7 +3014,7 @@ alloc_done: if (unlikely(!ac->avail)) { int x; - x = cache_grow(cachep, flags, node); + x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); @@ -3063,18 +3090,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp, cachep, ctor_flags); } +#if ARCH_SLAB_MINALIGN + if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { + printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + objp, ARCH_SLAB_MINALIGN); + } +#endif return objp; } #else #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif +#ifdef CONFIG_FAILSLAB + +static struct failslab_attr { + + struct fault_attr attr; + + u32 ignore_gfp_wait; +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + struct dentry *ignore_gfp_wait_file; +#endif + +} failslab = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_wait = 1, +}; + +static int __init setup_failslab(char *str) +{ + return setup_fault_attr(&failslab.attr, str); +} +__setup("failslab=", setup_failslab); + +static int should_failslab(struct kmem_cache *cachep, gfp_t flags) +{ + if (cachep == &cache_cache) + return 0; + if (flags & __GFP_NOFAIL) + return 0; + if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) + return 0; + + return should_fail(&failslab.attr, obj_size(cachep)); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init failslab_debugfs(void) +{ + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + int err; + + err = init_fault_attr_dentries(&failslab.attr, "failslab"); + if (err) + return err; + dir = failslab.attr.dentries.dir; + + failslab.ignore_gfp_wait_file = + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_wait); + + if (!failslab.ignore_gfp_wait_file) { + err = -ENOMEM; + debugfs_remove(failslab.ignore_gfp_wait_file); + cleanup_fault_attr_dentries(&failslab.attr); + } + + return err; +} + +late_initcall(failslab_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAILSLAB */ + +static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) +{ + return 0; +} + +#endif /* CONFIG_FAILSLAB */ + static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; check_irq_off(); + + if (should_failslab(cachep, flags)) + return NULL; + ac = cpu_cache_get(cachep); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); @@ -3105,10 +3215,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, objp = ____cache_alloc(cachep, flags); /* * We may just have run out of memory on the local node. - * __cache_alloc_node() knows how to locate memory on other nodes + * ____cache_alloc_node() knows how to locate memory on other nodes */ if (NUMA_BUILD && !objp) - objp = __cache_alloc_node(cachep, flags, numa_node_id()); + objp = ____cache_alloc_node(cachep, flags, numa_node_id()); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); @@ -3135,15 +3245,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); if (nid_alloc != nid_here) - return __cache_alloc_node(cachep, flags, nid_alloc); + return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; } /* * Fallback function if there was no memory available and no objects on a - * certain node and we are allowed to fall back. We mimick the behavior of - * the page allocator. We fall back according to a zonelist determined by - * the policy layer while obeying cpuset constraints. + * certain node and fall back is permitted. First we scan all the + * available nodelists for available objects. If that fails then we + * perform an allocation without specifying a node. This allows the page + * allocator to do its reclaim / fallback magic. We then insert the + * slab into the proper nodelist and then allocate from it. */ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { @@ -3151,15 +3263,57 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) ->node_zonelists[gfp_zone(flags)]; struct zone **z; void *obj = NULL; + int nid; + gfp_t local_flags = (flags & GFP_LEVEL_MASK); +retry: + /* + * Look through allowed nodes for objects available + * from existing per node queues. + */ for (z = zonelist->zones; *z && !obj; z++) { - int nid = zone_to_nid(*z); + nid = zone_to_nid(*z); - if (zone_idx(*z) <= ZONE_NORMAL && - cpuset_zone_allowed(*z, flags) && - cache->nodelists[nid]) - obj = __cache_alloc_node(cache, - flags | __GFP_THISNODE, nid); + if (cpuset_zone_allowed_hardwall(*z, flags) && + cache->nodelists[nid] && + cache->nodelists[nid]->free_objects) + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid); + } + + if (!obj && !(flags & __GFP_NO_GROW)) { + /* + * This allocation will be performed within the constraints + * of the current cpuset / memory policy requirements. + * We may trigger various forms of reclaim on the allowed + * set and go into memory reserves if necessary. + */ + if (local_flags & __GFP_WAIT) + local_irq_enable(); + kmem_flagcheck(cache, flags); + obj = kmem_getpages(cache, flags, -1); + if (local_flags & __GFP_WAIT) + local_irq_disable(); + if (obj) { + /* + * Insert into the appropriate per node queues + */ + nid = page_to_nid(virt_to_page(obj)); + if (cache_grow(cache, flags, nid, obj)) { + obj = ____cache_alloc_node(cache, + flags | GFP_THISNODE, nid); + if (!obj) + /* + * Another processor may allocate the + * objects in the slab since we are + * not holding any locks. + */ + goto retry; + } else { + /* cache_grow already freed obj */ + obj = NULL; + } + } } return obj; } @@ -3167,7 +3321,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; @@ -3216,7 +3370,7 @@ retry: must_grow: spin_unlock(&l3->list_lock); - x = cache_grow(cachep, flags, nodeid); + x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); if (x) goto retry; @@ -3399,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc); * * Currently only used for dentry validation. */ -int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) +int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) { unsigned long addr = (unsigned long)ptr; unsigned long min_addr = PAGE_OFFSET; @@ -3433,36 +3587,61 @@ out: * @cachep: The cache to allocate from. * @flags: See kmalloc(). * @nodeid: node number of the target node. + * @caller: return address of caller, used for debug information + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. * - * Identical to kmem_cache_alloc, except that this function is slow - * and can sleep. And it will allocate memory on the given node, which - * can improve the performance for cpu bound structures. - * New and improved: it will now make sure that the object gets - * put on the correct node list so that there is no false sharing. + * Fallback to other node is possible if __GFP_THISNODE is not set. */ -void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static __always_inline void * +__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, void *caller) { unsigned long save_flags; - void *ptr; + void *ptr = NULL; cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (nodeid == -1 || nodeid == numa_node_id() || - !cachep->nodelists[nodeid]) - ptr = ____cache_alloc(cachep, flags); - else - ptr = __cache_alloc_node(cachep, flags, nodeid); - local_irq_restore(save_flags); + if (unlikely(nodeid == -1)) + nodeid = numa_node_id(); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, - __builtin_return_address(0)); + if (likely(cachep->nodelists[nodeid])) { + if (nodeid == numa_node_id()) { + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. + */ + ptr = ____cache_alloc(cachep, flags); + } + if (!ptr) { + /* ___cache_alloc_node can fall back to other nodes */ + ptr = ____cache_alloc_node(cachep, flags, nodeid); + } + } else { + /* Node not bootstrapped yet */ + if (!(flags & __GFP_THISNODE)) + ptr = fallback_alloc(cachep, flags); + } + + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); return ptr; } + +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} EXPORT_SYMBOL(kmem_cache_alloc_node); -void *__kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; @@ -3471,8 +3650,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return NULL; return kmem_cache_alloc_node(cachep, flags, node); } + +#ifdef CONFIG_DEBUG_SLAB +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, + __builtin_return_address(0)); +} EXPORT_SYMBOL(__kmalloc_node); -#endif + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, void *caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); +#else +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, NULL); +} +EXPORT_SYMBOL(__kmalloc_node); +#endif /* CONFIG_DEBUG_SLAB */ +#endif /* CONFIG_NUMA */ /** * __do_kmalloc - allocate memory @@ -3583,13 +3783,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) int node; struct kmem_list3 *l3; struct array_cache *new_shared; - struct array_cache **new_alien; + struct array_cache **new_alien = NULL; for_each_online_node(node) { - new_alien = alloc_alien_cache(node, cachep->limit); - if (!new_alien) - goto fail; + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit); + if (!new_alien) + goto fail; + } new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, @@ -3815,7 +4017,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, * If we cannot acquire the cache chain mutex then just give up - we'll try * again on the next iteration. */ -static void cache_reap(void *unused) +static void cache_reap(struct work_struct *unused) { struct kmem_cache *searchp; struct kmem_list3 *l3; @@ -3824,7 +4026,7 @@ static void cache_reap(void *unused) if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. */ schedule_delayed_work(&__get_cpu_var(reap_work), - REAPTIMEOUT_CPUC); + round_jiffies_relative(REAPTIMEOUT_CPUC)); return; } @@ -3870,7 +4072,8 @@ next: next_reap_node(); refresh_cpu_vm_stats(smp_processor_id()); /* Set up the next iteration */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); + schedule_delayed_work(&__get_cpu_var(reap_work), + round_jiffies_relative(REAPTIMEOUT_CPUC)); } #ifdef CONFIG_PROC_FS @@ -4038,7 +4241,7 @@ static int s_show(struct seq_file *m, void *p) * + further values on SMP and with statistics enabled */ -struct seq_operations slabinfo_op = { +const struct seq_operations slabinfo_op = { .start = s_start, .next = s_next, .stop = s_stop, @@ -4236,7 +4439,7 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } -struct seq_operations slabstats_op = { +const struct seq_operations slabstats_op = { .start = leaks_start, .next = s_next, .stop = s_stop, diff --git a/mm/slob.c b/mm/slob.c index 542394184a5..5adc29cb58d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock); static DEFINE_SPINLOCK(block_lock); static void slob_free(void *b, int size); +static void slob_timer_cbk(void); + static void *slob_alloc(size_t size, gfp_t gfp, int align) { @@ -157,7 +159,7 @@ static int fastcall find_order(int size) return order; } -void *kmalloc(size_t size, gfp_t gfp) +void *__kmalloc(size_t size, gfp_t gfp) { slob_t *m; bigblock_t *bb; @@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp) slob_free(bb, sizeof(bigblock_t)); return 0; } - -EXPORT_SYMBOL(kmalloc); +EXPORT_SYMBOL(__kmalloc); void kfree(const void *block) { @@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c) EXPORT_SYMBOL(kmem_cache_name); static struct timer_list slob_timer = TIMER_INITIALIZER( - (void (*)(unsigned long))kmem_cache_init, 0, 0); + (void (*)(unsigned long))slob_timer_cbk, 0, 0); + +int kmem_cache_shrink(struct kmem_cache *d) +{ + return 0; +} +EXPORT_SYMBOL(kmem_cache_shrink); + +int kmem_ptr_validate(struct kmem_cache *a, const void *b) +{ + return 0; +} + +void __init kmem_cache_init(void) +{ + slob_timer_cbk(); +} -void kmem_cache_init(void) +static void slob_timer_cbk(void) { void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); diff --git a/mm/sparse.c b/mm/sparse.c index b3c82ba3001..ac26eb0d73c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] #endif EXPORT_SYMBOL(mem_section); +#ifdef NODE_NOT_IN_PAGE_FLAGS +/* + * If we did not store the node number in the page then we have to + * do a lookup in the section_to_node_table in order to find which + * node the page belongs to. + */ +#if MAX_NUMNODES <= 256 +static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#else +static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#endif + +int page_to_nid(struct page *page) +{ + return section_to_node_table[page_to_section(page)]; +} +EXPORT_SYMBOL(page_to_nid); +#endif + #ifdef CONFIG_SPARSEMEM_EXTREME static struct mem_section *sparse_index_alloc(int nid) { @@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid) struct mem_section *section; int ret = 0; +#ifdef NODE_NOT_IN_PAGE_FLAGS + section_to_node_table[section_nr] = nid; +#endif + if (mem_section[root]) return -EEXIST; diff --git a/mm/swap.c b/mm/swap.c index 2e0e871f542..2ed7be39795 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -57,9 +57,9 @@ static void put_compound_page(struct page *page) { page = (struct page *)page_private(page); if (put_page_testzero(page)) { - void (*dtor)(struct page *page); + compound_page_dtor *dtor; - dtor = (void (*)(struct page *))page[1].lru.next; + dtor = get_compound_page_dtor(page); (*dtor)(page); } } @@ -216,7 +216,7 @@ void lru_add_drain(void) } #ifdef CONFIG_NUMA -static void lru_add_drain_per_cpu(void *dummy) +static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); } @@ -226,7 +226,7 @@ static void lru_add_drain_per_cpu(void *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); + return schedule_on_each_cpu(lru_add_drain_per_cpu); } #else @@ -514,5 +514,7 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#ifdef CONFIG_HOTPLUG_CPU hotcpu_notifier(cpu_swap_callback, 0); +#endif } diff --git a/mm/swapfile.c b/mm/swapfile.c index a15def63f28..a2d9bb4e80d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -427,34 +427,54 @@ void free_swap_and_cache(swp_entry_t entry) #ifdef CONFIG_SOFTWARE_SUSPEND /* - * Find the swap type that corresponds to given device (if any) + * Find the swap type that corresponds to given device (if any). * - * This is needed for software suspend and is done in such a way that inode - * aliasing is allowed. + * @offset - number of the PAGE_SIZE-sized block of the device, starting + * from 0, in which the swap header is expected to be located. + * + * This is needed for the suspend to disk (aka swsusp). */ -int swap_type_of(dev_t device) +int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) { + struct block_device *bdev = NULL; int i; + if (device) + bdev = bdget(device); + spin_lock(&swap_lock); for (i = 0; i < nr_swapfiles; i++) { - struct inode *inode; + struct swap_info_struct *sis = swap_info + i; - if (!(swap_info[i].flags & SWP_WRITEOK)) + if (!(sis->flags & SWP_WRITEOK)) continue; - if (!device) { + if (!bdev) { + if (bdev_p) + *bdev_p = sis->bdev; + spin_unlock(&swap_lock); return i; } - inode = swap_info[i].swap_file->f_dentry->d_inode; - if (S_ISBLK(inode->i_mode) && - device == MKDEV(imajor(inode), iminor(inode))) { - spin_unlock(&swap_lock); - return i; + if (bdev == sis->bdev) { + struct swap_extent *se; + + se = list_entry(sis->extent_list.next, + struct swap_extent, list); + if (se->start_block == offset) { + if (bdev_p) + *bdev_p = sis->bdev; + + spin_unlock(&swap_lock); + bdput(bdev); + return i; + } } } spin_unlock(&swap_lock); + if (bdev) + bdput(bdev); + return -ENODEV; } @@ -931,6 +951,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) } } +#ifdef CONFIG_SOFTWARE_SUSPEND +/* + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev + * corresponding to given index in swap_info (swap type). + */ +sector_t swapdev_block(int swap_type, pgoff_t offset) +{ + struct swap_info_struct *sis; + + if (swap_type >= nr_swapfiles) + return 0; + + sis = swap_info + swap_type; + return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; +} +#endif /* CONFIG_SOFTWARE_SUSPEND */ + /* * Free all of a swapdev's extent information */ @@ -1274,10 +1311,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) mutex_lock(&swapon_mutex); + if (!l) + return SEQ_START_TOKEN; + for (i = 0; i < nr_swapfiles; i++, ptr++) { if (!(ptr->flags & SWP_USED) || !ptr->swap_map) continue; - if (!l--) + if (!--l) return ptr; } @@ -1286,10 +1326,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { - struct swap_info_struct *ptr = v; + struct swap_info_struct *ptr; struct swap_info_struct *endptr = swap_info + nr_swapfiles; - for (++ptr; ptr < endptr; ptr++) { + if (v == SEQ_START_TOKEN) + ptr = swap_info; + else { + ptr = v; + ptr++; + } + + for (; ptr < endptr; ptr++) { if (!(ptr->flags & SWP_USED) || !ptr->swap_map) continue; ++*pos; @@ -1310,14 +1357,16 @@ static int swap_show(struct seq_file *swap, void *v) struct file *file; int len; - if (v == swap_info) - seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + if (ptr == SEQ_START_TOKEN) { + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + return 0; + } file = ptr->swap_file; - len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); + len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", - S_ISBLK(file->f_dentry->d_inode->i_mode) ? + S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? "partition" : "file\t", ptr->pages << (PAGE_SHIFT - 10), ptr->inuse_pages << (PAGE_SHIFT - 10), @@ -1325,7 +1374,7 @@ static int swap_show(struct seq_file *swap, void *v) return 0; } -static struct seq_operations swaps_op = { +static const struct seq_operations swaps_op = { .start = swap_start, .next = swap_next, .stop = swap_stop, @@ -1337,7 +1386,7 @@ static int swaps_open(struct inode *inode, struct file *file) return seq_open(file, &swaps_op); } -static struct file_operations proc_swaps_operations = { +static const struct file_operations proc_swaps_operations = { .open = swaps_open, .read = seq_read, .llseek = seq_lseek, @@ -1540,6 +1589,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = -EINVAL; if (!maxpages) goto bad_swap; + if (swapfilesize && maxpages > swapfilesize) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + goto bad_swap; + } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) goto bad_swap; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) @@ -1567,12 +1621,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } - if (swapfilesize && maxpages > swapfilesize) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - error = -EINVAL; - goto bad_swap; - } if (nr_good_pages) { p->swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; diff --git a/mm/thrash.c b/mm/thrash.c index f4c560b4a2b..9ef9071f99b 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -7,100 +7,74 @@ * * Simple token based thrashing protection, using the algorithm * described in: http://www.cs.wm.edu/~sjiang/token.pdf + * + * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> + * Improved algorithm to pass token: + * Each task has a priority which is incremented if it contended + * for the token in an interval less than its previous attempt. + * If the token is acquired, that task's priority is boosted to prevent + * the token from bouncing around too often and to let the task make + * some progress in its execution. */ + #include <linux/jiffies.h> #include <linux/mm.h> #include <linux/sched.h> #include <linux/swap.h> static DEFINE_SPINLOCK(swap_token_lock); -static unsigned long swap_token_timeout; -static unsigned long swap_token_check; -struct mm_struct * swap_token_mm = &init_mm; - -#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) -#define SWAP_TOKEN_TIMEOUT (300 * HZ) -/* - * Currently disabled; Needs further code to work at HZ * 300. - */ -unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT; - -/* - * Take the token away if the process had no page faults - * in the last interval, or if it has held the token for - * too long. - */ -#define SWAP_TOKEN_ENOUGH_RSS 1 -#define SWAP_TOKEN_TIMED_OUT 2 -static int should_release_swap_token(struct mm_struct *mm) -{ - int ret = 0; - if (!mm->recent_pagein) - ret = SWAP_TOKEN_ENOUGH_RSS; - else if (time_after(jiffies, swap_token_timeout)) - ret = SWAP_TOKEN_TIMED_OUT; - mm->recent_pagein = 0; - return ret; -} +struct mm_struct *swap_token_mm; +static unsigned int global_faults; -/* - * Try to grab the swapout protection token. We only try to - * grab it once every TOKEN_CHECK_INTERVAL, both to prevent - * SMP lock contention and to check that the process that held - * the token before is no longer thrashing. - */ void grab_swap_token(void) { - struct mm_struct *mm; - int reason; + int current_interval; - /* We have the token. Let others know we still need it. */ - if (has_swap_token(current->mm)) { - current->mm->recent_pagein = 1; - if (unlikely(!swap_token_default_timeout)) - disable_swap_token(); - return; - } - - if (time_after(jiffies, swap_token_check)) { + global_faults++; - if (!swap_token_default_timeout) { - swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; - return; - } - - /* ... or if we recently held the token. */ - if (time_before(jiffies, current->mm->swap_token_time)) - return; + current_interval = global_faults - current->mm->faultstamp; - if (!spin_trylock(&swap_token_lock)) - return; + if (!spin_trylock(&swap_token_lock)) + return; - swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; + /* First come first served */ + if (swap_token_mm == NULL) { + current->mm->token_priority = current->mm->token_priority + 2; + swap_token_mm = current->mm; + goto out; + } - mm = swap_token_mm; - if ((reason = should_release_swap_token(mm))) { - unsigned long eligible = jiffies; - if (reason == SWAP_TOKEN_TIMED_OUT) { - eligible += swap_token_default_timeout; - } - mm->swap_token_time = eligible; - swap_token_timeout = jiffies + swap_token_default_timeout; + if (current->mm != swap_token_mm) { + if (current_interval < current->mm->last_interval) + current->mm->token_priority++; + else { + current->mm->token_priority--; + if (unlikely(current->mm->token_priority < 0)) + current->mm->token_priority = 0; + } + /* Check if we deserve the token */ + if (current->mm->token_priority > + swap_token_mm->token_priority) { + current->mm->token_priority += 2; swap_token_mm = current->mm; } - spin_unlock(&swap_token_lock); + } else { + /* Token holder came in again! */ + current->mm->token_priority += 2; } - return; + +out: + current->mm->faultstamp = global_faults; + current->mm->last_interval = current_interval; + spin_unlock(&swap_token_lock); +return; } /* Called on process exit. */ void __put_swap_token(struct mm_struct *mm) { spin_lock(&swap_token_lock); - if (likely(mm == swap_token_mm)) { - mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; - swap_token_mm = &init_mm; - swap_token_check = jiffies; - } + if (likely(mm == swap_token_mm)) + swap_token_mm = NULL; spin_unlock(&swap_token_lock); } diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 5f2cbf0f153..c7f6e1914bc 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) d_instantiate(dentry, inode); inode->i_nlink = 0; /* It is unlinked */ - file->f_vfsmnt = mntget(shm_mnt); - file->f_dentry = dentry; + file->f_path.mnt = mntget(shm_mnt); + file->f_path.dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = &ramfs_file_operations; file->f_mode = FMODE_WRITE | FMODE_READ; diff --git a/mm/truncate.c b/mm/truncate.c index e07b1e682c3..6c79ca4a1ca 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/pagemap.h> #include <linux/pagevec.h> +#include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ @@ -50,6 +51,26 @@ static inline void truncate_partial_page(struct page *page, unsigned partial) do_invalidatepage(page, partial); } +void cancel_dirty_page(struct page *page, unsigned int account_size) +{ + /* If we're cancelling the page, it had better not be mapped any more */ + if (page_mapped(page)) { + static unsigned int warncount; + + WARN_ON(++warncount < 5); + } + + if (TestClearPageDirty(page)) { + struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + if (account_size) + task_io_account_cancelled_write(account_size); + } + } +} +EXPORT_SYMBOL(cancel_dirty_page); + /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes anonymous. It will be left on the LRU and may even be mapped into @@ -66,10 +87,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return; + cancel_dirty_page(page, PAGE_CACHE_SIZE); + if (PagePrivate(page)) do_invalidatepage(page, 0); - clear_page_dirty(page); ClearPageUptodate(page); ClearPageMappedToDisk(page); remove_from_page_cache(page); @@ -319,6 +341,15 @@ failed: return 0; } +static int do_launder_page(struct address_space *mapping, struct page *page) +{ + if (!PageDirty(page)) + return 0; + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + return 0; + return mapping->a_ops->launder_page(page); +} + /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space @@ -348,7 +379,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, for (i = 0; !ret && i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index; - int was_dirty; lock_page(page); if (page->mapping != mapping) { @@ -384,12 +414,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, PAGE_CACHE_SIZE, 0); } } - was_dirty = test_clear_page_dirty(page); - if (!invalidate_complete_page2(mapping, page)) { - if (was_dirty) - set_page_dirty(page); + ret = do_launder_page(mapping, page); + if (ret == 0 && !invalidate_complete_page2(mapping, page)) ret = -EIO; - } unlock_page(page); } pagevec_release(&pvec); diff --git a/mm/vmscan.c b/mm/vmscan.c index 518540a4a2a..7430df68cb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -36,6 +36,7 @@ #include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/freezer.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -691,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __count_vm_events(KSWAPD_STEAL, nr_freed); } else __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); - __count_vm_events(PGACTIVATE, nr_freed); + __count_zone_vm_events(PGSTEAL, zone, nr_freed); if (nr_taken == 0) goto done; @@ -983,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, if (!populated_zone(zone)) continue; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; note_zone_scanning_priority(zone, priority); @@ -1033,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; lru_pages += zone->nr_active + zone->nr_inactive; @@ -1088,7 +1089,7 @@ out: for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; zone->prev_priority = priority; @@ -1172,11 +1173,12 @@ loop_again: if (!zone_watermark_ok(zone, order, zone->pages_high, 0, 0)) { end_zone = i; - goto scan; + break; } } - goto out; -scan: + if (i < 0) + goto out; + for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -1259,6 +1261,9 @@ out: } if (!all_zones_ok) { cond_resched(); + + try_to_freeze(); + goto loop_again; } @@ -1349,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; - if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; if (!waitqueue_active(&pgdat->kswapd_wait)) return; @@ -1364,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order) * * For pass > 3 we also try to shrink the LRU lists that contain a few pages */ -static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, - int prio, struct scan_control *sc) +static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, + int pass, struct scan_control *sc) { struct zone *zone; unsigned long nr_to_scan, ret = 0; @@ -1401,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, return ret; } +static unsigned long count_lru_pages(void) +{ + struct zone *zone; + unsigned long ret = 0; + + for_each_zone(zone) + ret += zone->nr_active + zone->nr_inactive; + return ret; +} + /* * Try to free `nr_pages' of memory, system-wide, and return the number of * freed pages. @@ -1415,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) unsigned long ret = 0; int pass; struct reclaim_state reclaim_state; - struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_swap = 0, @@ -1426,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = 0; - for_each_zone(zone) - lru_pages += zone->nr_active + zone->nr_inactive; - + lru_pages = count_lru_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -1456,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for (pass = 0; pass < 5; pass++) { int prio; - /* Needed for shrinking slab caches later on */ - if (!lru_pages) - for_each_zone(zone) { - lru_pages += zone->nr_active; - lru_pages += zone->nr_inactive; - } - /* Force reclaiming mapped pages in the passes #3 and #4 */ if (pass > 2) { sc.may_swap = 1; @@ -1478,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) goto out; reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); + shrink_slab(sc.nr_scanned, sc.gfp_mask, + count_lru_pages()); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -1486,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (sc.nr_scanned && prio < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ / 10); } - - lru_pages = 0; } /* * If ret = 0, we could not shrink LRUs, but there may be something * in slab caches */ - if (!ret) + if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + } out: current->reclaim_state = NULL; @@ -1508,7 +1512,6 @@ out: } #endif -#ifdef CONFIG_HOTPLUG_CPU /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, @@ -1529,7 +1532,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ /* * This kswapd start function will be called by init and node-hot-add. diff --git a/mm/vmstat.c b/mm/vmstat.c index 8614e8f6743..dc005a0c96a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg) return 0; } -struct seq_operations fragmentation_op = { +const struct seq_operations fragmentation_op = { .start = frag_start, .next = frag_next, .stop = frag_stop, @@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = { #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ TEXT_FOR_HIGHMEM(xx) -static char *vmstat_text[] = { +static const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_anon_pages", "nr_mapped", @@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) return 0; } -struct seq_operations zoneinfo_op = { +const struct seq_operations zoneinfo_op = { .start = frag_start, /* iterate over all zones. The same as in * fragmentation. */ .next = frag_next, @@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg) m->private = NULL; } -struct seq_operations vmstat_op = { +const struct seq_operations vmstat_op = { .start = vmstat_start, .next = vmstat_next, .stop = vmstat_stop, @@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, void *hcpu) { switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_CANCELED: - case CPU_DEAD: - refresh_zone_stat_thresholds(); - break; - default: - break; + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DEAD: + refresh_zone_stat_thresholds(); + break; + default: + break; } return NOTIFY_OK; } |