diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/discontig_32.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/gup.c | 298 | ||||
-rw-r--r-- | arch/x86/mm/hugetlbpage.c | 78 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 85 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 18 | ||||
-rw-r--r-- | arch/x86/mm/mmio-mod.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/pageattr-test.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 27 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 50 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/pgtable_32.c | 47 | ||||
-rw-r--r-- | arch/x86/mm/srat_32.c | 12 |
15 files changed, 475 insertions, 163 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 1fbb844c3d7..dfb932dcf13 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o + pat.o pgtable.o gup.o obj-$(CONFIG_X86_32) += pgtable_32.o diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 5dfef9fa061..62fa440678d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -42,7 +42,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static bootmem_data_t node0_bdata; /* * numa interface - we expect the numa architecture specific code to have @@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn, for_each_online_node(nid) memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(0)->bdata = &node0_bdata; + NODE_DATA(0)->bdata = &bootmem_node_data[0]; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c new file mode 100644 index 00000000000..007bb06c750 --- /dev/null +++ b/arch/x86/mm/gup.c @@ -0,0 +1,298 @@ +/* + * Lockless get_user_pages_fast for x86 + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmstat.h> +#include <linux/highmem.h> + +#include <asm/pgtable.h> + +static inline pte_t gup_get_pte(pte_t *ptep) +{ +#ifndef CONFIG_X86_PAE + return *ptep; +#else + /* + * With get_user_pages_fast, we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atoimcally, + * but that is not possible (without expensive cmpxchg8b) on PAE. What + * we do have is the guarantee that a pte will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees l iff pte_high + * sees h. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have got rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. And get_user_pages_fast only operates on present ptes, so + * we're safe. + * + * gup_get_pte should not be used or copied outside gup.c without being + * very careful -- it does not atomically load the pte or anything that + * is likely to be useful for you. + */ + pte_t pte; + +retry: + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + if (unlikely(pte.pte_low != ptep->pte_low)) + goto retry; + + return pte; +#endif +} + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t *ptep; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + + ptep = pte_offset_map(&pmd, addr); + do { + pte_t pte = gup_get_pte(ptep); + struct page *page; + + if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { + pte_unmap(ptep); + return 0; + } + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + get_page(page); + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + pte_unmap(ptep - 1); + + return 1; +} + +static inline void get_head_page_multiple(struct page *page, int nr) +{ + VM_BUG_ON(page != compound_head(page)); + VM_BUG_ON(page_count(page) == 0); + atomic_add(nr, &page->_count); +} + +static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pmd; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_val(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { + if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static noinline int gup_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pud; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_val(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (unlikely(pud_large(pud))) { + if (!gup_huge_pud(pud, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } + } while (pudp++, addr = next, addr != end); + + return 1; +} + +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + goto slow_irqon; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_disable(); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 0b3d567e686..8f307d914c2 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 1; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; @@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (pud) { - if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else { + BUG_ON(sz != PMD_SIZE); + if (pud_none(*pud)) + huge_pmd_share(mm, addr, pud); + pte = (pte_t *) pmd_alloc(mm, pud, addr); + } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); @@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pgd = pgd_offset(mm, addr); if (pgd_present(*pgd)) { pud = pud_offset(pgd, addr); - if (pud_present(*pud)) + if (pud_present(*pud)) { + if (pud_large(*pud)) + return (pte_t *)pud; pmd = pmd_offset(pud, addr); + } } return (pte_t *) pmd; } @@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd) return 0; } +int pud_huge(pud_t pud) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd) return !!(pmd_val(pmd) & _PAGE_PSE); } +int pud_huge(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_PSE); +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, page = pte_page(*(pte_t *)pmd); if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); + page += ((address & ~PMD_MASK) >> PAGE_SHIFT); + return page; +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pud); + if (page) + page += ((address & ~PUD_MASK) >> PAGE_SHIFT); return page; } + #endif /* x86_64 also uses this file */ @@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; @@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, } full_search: - addr = ALIGN(start_addr, HPAGE_SIZE); + addr = ALIGN(start_addr, huge_page_size(h)); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ @@ -262,7 +295,7 @@ full_search: } if (addr + mm->cached_hole_size < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } @@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr0, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev_vma; unsigned long base = mm->mmap_base, addr = addr0; @@ -290,7 +324,7 @@ try_again: goto fail; /* either no address requested or cant fit in requested address hole */ - addr = (mm->free_area_cache - len) & HPAGE_MASK; + addr = (mm->free_area_cache - len) & huge_page_mask(h); do { /* * Lookup failure means no vma is above this address, @@ -321,7 +355,7 @@ try_again: largest_hole = vma->vm_start - addr; /* try just below the current vma->vm_start */ - addr = (vma->vm_start - len) & HPAGE_MASK; + addr = (vma->vm_start - len) & huge_page_mask(h); } while (len <= vma->vm_start); fail: @@ -359,22 +393,23 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#ifdef CONFIG_X86_64 +static __init int setup_hugepagesz(char *opt) +{ + unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + } else if (ps == PUD_SIZE && cpu_has_gbpages) { + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else { + printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", + ps >> 20); + return 0; + } + return 1; +} +__setup("hugepagesz=", setup_hugepagesz); +#endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0..60ec1d08ff2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -458,11 +458,7 @@ static void __init pagetable_init(void) { pgd_t *pgd_base = swapper_pg_dir; - paravirt_pagetable_setup_start(pgd_base); - permanent_kmaps_init(pgd_base); - - paravirt_pagetable_setup_done(pgd_base); } #ifdef CONFIG_ACPI_SLEEP diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec37121f670..d3746efb060 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -60,7 +60,7 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -int direct_gbpages __meminitdata +int direct_gbpages #ifdef CONFIG_DIRECT_GBPAGES = 1 #endif @@ -86,46 +86,13 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -void show_mem(void) -{ - long i, total = 0, reserved = 0; - long shared = 0, cached = 0; - struct page *page; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* - * This loop can take a while with 256 GB and - * 4k pages so defer the NMI watchdog: - */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; - - page = pfn_to_page(pgdat->node_start_pfn + i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n", reserved); - printk(KERN_INFO "%lu pages shared\n", shared); - printk(KERN_INFO "%lu pages swap cached\n", cached); -} - int after_bootmem; -static __init void *spp_getpage(void) +/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + */ +static __ref void *spp_getpage(void) { void *ptr; @@ -274,7 +241,7 @@ static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; -static __meminit void *alloc_low_page(unsigned long *phys) +static __ref void *alloc_low_page(unsigned long *phys) { unsigned long pfn = table_end++; void *adr; @@ -295,7 +262,7 @@ static __meminit void *alloc_low_page(unsigned long *phys) return adr; } -static __meminit void unmap_low_page(void *adr) +static __ref void unmap_low_page(void *adr) { if (after_bootmem) return; @@ -351,6 +318,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, { unsigned long pages = 0; unsigned long last_map_addr = end; + unsigned long start = address; int i = pmd_index(address); @@ -368,16 +336,24 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, } if (pmd_val(*pmd)) { - if (!pmd_large(*pmd)) + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pte_update(pmd, address, - end); + end); + spin_unlock(&init_mm.page_table_lock); + } + /* Count entries we're using from level2_ident_pgt */ + if (start == 0) + pages++; continue; } if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; + spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } @@ -386,7 +362,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, last_map_addr = phys_pte_init(pte, address, end); unmap_low_page(pte); + spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); return last_map_addr; @@ -399,9 +377,7 @@ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; - spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); - spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); return last_map_addr; } @@ -437,20 +413,21 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; + spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); last_map_addr = (addr & PUD_MASK) + PUD_SIZE; continue; } pmd = alloc_low_page(&pmd_phys); - - spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); unmap_low_page(pmd); + + spin_lock(&init_mm.page_table_lock); pud_populate(&init_mm, pud, __va(pmd_phys)); spin_unlock(&init_mm.page_table_lock); - } __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); @@ -542,16 +519,14 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, continue; } - if (after_bootmem) - pud = pud_offset(pgd, start & PGDIR_MASK); - else - pud = alloc_low_page(&pud_phys); - + pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); unmap_low_page(pud); - pgd_populate(&init_mm, pgd_offset_k(start), - __va(pud_phys)); + + spin_lock(&init_mm.page_table_lock); + pgd_populate(&init_mm, pgd, __va(pud_phys)); + spin_unlock(&init_mm.page_table_lock); } return last_map_addr; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 24c1d3c3018..d4b6e6a29ae 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -170,7 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; - retval = reserve_memtype(phys_addr, phys_addr + size, + retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { pr_debug("Warning: reserve_memtype returned %d\n", retval); @@ -330,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr, return (void __iomem *)ret; } +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + /** * iounmap - Free a IO remapping * @addr: virtual address from ioremap_* @@ -545,13 +553,11 @@ static int __init check_early_ioremap_leak(void) { if (!early_ioremap_nested) return 0; - - printk(KERN_WARNING + WARN(1, KERN_WARNING "Debug warning: early ioremap leak of %d areas detected.\n", - early_ioremap_nested); + early_ioremap_nested); printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - WARN_ON(1); + "please boot with early_ioremap_debug and report the dmesg.\n"); return 1; } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index e7397e108be..635b50e8558 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -430,7 +430,9 @@ static void enter_uniprocessor(void) "may miss events.\n"); } -static void leave_uniprocessor(void) +/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, + but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ +static void __ref leave_uniprocessor(void) { int cpu; int err; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9782f42dd31..a4dd793d600 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -23,8 +23,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static bootmem_data_t plat_node_bdata[MAX_NUMNODES]; - struct memnode memnode; s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { @@ -198,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, nodedata_phys + pgdat_size - 1); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; + NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 0dcd42eb94e..d4aa503caaa 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -221,8 +221,7 @@ static int pageattr_test(void) failed += print_split(&sc); if (failed) { - printk(KERN_ERR "NOT PASSED. Please report.\n"); - WARN_ON(1); + WARN(1, KERN_ERR "NOT PASSED. Please report.\n"); return -EINVAL; } else { if (print) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 65c6e46bf05..43e2f8483e4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -55,13 +55,19 @@ static void split_page_count(int level) int arch_report_meminfo(char *page) { - int n = sprintf(page, "DirectMap4k: %8lu\n" - "DirectMap2M: %8lu\n", - direct_pages_count[PG_LEVEL_4K], - direct_pages_count[PG_LEVEL_2M]); + int n = sprintf(page, "DirectMap4k: %8lu kB\n", + direct_pages_count[PG_LEVEL_4K] << 2); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + n += sprintf(page + n, "DirectMap2M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 11); +#else + n += sprintf(page + n, "DirectMap4M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 12); +#endif #ifdef CONFIG_X86_64 - n += sprintf(page + n, "DirectMap1G: %8lu\n", - direct_pages_count[PG_LEVEL_1G]); + if (direct_gbpages) + n += sprintf(page + n, "DirectMap1G: %8lu kB\n", + direct_pages_count[PG_LEVEL_1G] << 20); #endif return n; } @@ -592,10 +598,9 @@ repeat: if (!pte_val(old_pte)) { if (!primary) return 0; - printk(KERN_WARNING "CPA: called for zero pte. " + WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", address, cpa->vaddr); - WARN_ON(1); return -EINVAL; } @@ -844,7 +849,7 @@ int set_memory_uc(unsigned long addr, int numpages) /* * for now UC MINUS. see comments in ioremap_nocache() */ - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_UC_MINUS, NULL)) return -EINVAL; @@ -863,7 +868,7 @@ int set_memory_wc(unsigned long addr, int numpages) if (!pat_enabled) return set_memory_uc(addr, numpages); - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_WC, NULL)) return -EINVAL; @@ -879,7 +884,7 @@ int _set_memory_wb(unsigned long addr, int numpages) int set_memory_wb(unsigned long addr, int numpages) { - free_memtype(addr, addr + numpages * PAGE_SIZE); + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return _set_memory_wb(addr, numpages); } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 2fe30916d4b..2a50e0fa64a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -207,6 +207,9 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, return -EBUSY; } +static struct memtype *cached_entry; +static u64 cached_start; + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -280,11 +283,17 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); + if (cached_entry && start >= cached_start) + entry = cached_entry; + else + entry = list_entry(&memtype_list, struct memtype, nd); + /* Search for existing mapping that overlaps the current range */ where = NULL; - list_for_each_entry(entry, &memtype_list, nd) { + list_for_each_entry_continue(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; + cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -292,6 +301,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; + cached_entry = list_entry(where, + struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -299,7 +310,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - where = &entry->nd; + cached_entry = list_entry(entry->nd.prev, + struct memtype, nd); + + /* + * Move to right position in the linked + * list to add this new entry + */ + list_for_each_entry_continue(entry, + &memtype_list, nd) { + if (start <= entry->start) { + where = entry->nd.prev; + break; + } + } } break; } @@ -314,6 +338,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } + cached_start = start; + if (where) list_add(&new->nd, where); else @@ -343,6 +369,9 @@ int free_memtype(u64 start, u64 end) spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { + if (cached_entry == entry || cached_start == start) + cached_entry = NULL; + list_del(&entry->nd); kfree(entry); err = 0; @@ -361,14 +390,6 @@ int free_memtype(u64 start, u64 end) } -/* - * /dev/mem mmap interface. The memtype used for mapping varies: - * - Use UC for mappings with O_SYNC flag - * - Without O_SYNC flag, if there is any conflict in reserve_memtype, - * inherit the memtype from existing mapping. - * - Else use UC_MINUS memtype (for backward compatibility with existing - * X drivers. - */ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -406,14 +427,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { u64 offset = ((u64) pfn) << PAGE_SHIFT; - unsigned long flags = _PAGE_CACHE_UC_MINUS; + unsigned long flags = -1; int retval; if (!range_is_allowed(pfn, size)) return 0; if (file->f_flags & O_SYNC) { - flags = _PAGE_CACHE_UC; + flags = _PAGE_CACHE_UC_MINUS; } #ifdef CONFIG_X86_32 @@ -436,13 +457,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, #endif /* - * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. + * * Without O_SYNC, we want to get * - WB for WB-able memory and no other conflicting mappings * - UC_MINUS for non-WB-able memory with no other conflicting mappings * - Inherit from confliting mappings otherwise */ - if (flags != _PAGE_CACHE_UC_MINUS) { + if (flags != -1) { retval = reserve_memtype(offset, offset + size, flags, NULL); } else { retval = reserve_memtype(offset, offset + size, -1, &flags); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 557b2abceef..d50302774fe 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -207,6 +207,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) unsigned long addr; int i; + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ + return; + pud = pud_offset(pgd, 0); for (addr = i = 0; i < PREALLOCATED_PMDS; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b4becbf8c57..cab0abbd1eb 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -20,53 +20,6 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> -void show_mem(void) -{ - int total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - unsigned long flags; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - for_each_online_pgdat(pgdat) { - pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - page = pgdat_page_nr(pgdat, i); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - pgdat_resize_unlock(pgdat, &flags); - } - printk(KERN_INFO "%d pages of RAM\n", total); - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); - printk(KERN_INFO "%d reserved pages\n", reserved); - printk(KERN_INFO "%d pages shared\n", shared); - printk(KERN_INFO "%d pages swap cached\n", cached); - - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); - printk(KERN_INFO "%lu pages writeback\n", - global_page_state(NR_WRITEBACK)); - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); - printk(KERN_INFO "%lu pages slab\n", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); - printk(KERN_INFO "%lu pages pagetables\n", - global_page_state(NR_PAGETABLE)); -} - /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 1eb2973a301..16ae70fc57e 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void) * start of the node, and that the current "end" address is after * the previous one. */ -static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) +static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) { /* * Only add present memory as told by the e820. @@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c if (memory_chunk->start_pfn >= max_pfn) { printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", memory_chunk->start_pfn, memory_chunk->end_pfn); - return; + return -1; } if (memory_chunk->nid != nid) - return; + return -1; if (!node_has_online_mem(nid)) node_start_pfn[nid] = memory_chunk->start_pfn; @@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c if (node_end_pfn[nid] < memory_chunk->end_pfn) node_end_pfn[nid] = memory_chunk->end_pfn; + + return 0; } int __init get_memcfg_from_srat(void) @@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void) printk(KERN_DEBUG "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - node_read_chunk(chunk->nid, chunk); + if (node_read_chunk(chunk->nid, chunk)) + continue; + e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } |