diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/fault.c | 32 | ||||
-rw-r--r-- | arch/x86/mm/hugetlbpage.c | 24 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 13 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 87 | ||||
-rw-r--r-- | arch/x86/mm/pat_rbtree.c | 34 | ||||
-rw-r--r-- | arch/x86/mm/srat.c | 15 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 4 |
9 files changed, 144 insertions, 77 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 76dcd9d8e0b..8e13ecb41be 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,6 +18,7 @@ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_START */ +#include <asm/rcu.h> /* exception_enter(), ... */ /* * Page fault error code bits: @@ -995,13 +996,24 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } +static inline bool smap_violation(int error_code, struct pt_regs *regs) +{ + if (error_code & PF_USER) + return false; + + if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) + return false; + + return true; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. */ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static void __kprobes +__do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -1088,6 +1100,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + if (static_cpu_has(X86_FEATURE_SMAP)) { + if (unlikely(smap_violation(error_code, regs))) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* @@ -1201,6 +1220,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } @@ -1209,3 +1229,11 @@ good_area: up_read(&mm->mmap_sem); } + +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + exception_enter(regs); + __do_page_fault(regs, error_code); + exception_exit(regs); +} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f6679a7fb8c..937bff5cdaa 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -56,24 +56,31 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) } /* - * search for a shareable pmd page for hugetlb. + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_mutex section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +static pte_t * +huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - struct prio_tree_iter iter; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; + pte_t *pte; if (!vma_shareable(vma, addr)) - return; + return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { + vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -97,7 +104,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: + pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); + return pte; } /* @@ -142,8 +151,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else { BUG_ON(sz != PMD_SIZE); if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e0e6990723e..ab1f6a93b52 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -319,7 +319,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, */ int devmem_is_allowed(unsigned long pagenr) { - if (pagenr <= 256) + if (pagenr < 256) return 1; if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 575d86f85ce..11a58001b4c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -445,10 +445,10 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(pgd_t *base) +void __init native_pagetable_init(void) { unsigned long pfn, va; - pgd_t *pgd; + pgd_t *pgd, *base = swapper_pg_dir; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -475,10 +475,7 @@ void __init native_pagetable_setup_start(pgd_t *base) pte_clear(NULL, va, pte); } paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); -} - -void __init native_pagetable_setup_done(pgd_t *base) -{ + paging_init(); } /* @@ -493,7 +490,7 @@ void __init native_pagetable_setup_done(pgd_t *base) * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may * or may not be based in swapper_pg_dir. In any case, - * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * paravirt_pagetable_init() will set up swapper_pg_dir * appropriately for the rest of the initialization to work. * * In general, pagetable_init() assumes that the pagetable may already @@ -712,7 +709,7 @@ static void __init test_wp_bit(void) "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); boot_cpu_data.wp_works_ok = do_test_wp_bit(); clear_fixmap(FIX_WP_TEST); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 931930a9616..a718e0d2350 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,13 +919,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it, in the - * error case, and during early boot (for EFI) we fall back - * to cpa_flush_all (which uses wbinvd): + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): */ - if (early_boot_irqs_disabled) - __cpa_flush_all((void *)(long)cache); - else if (!ret && cpu_has_clflush) { + if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3d68ef6d226..0eb572eda40 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -664,20 +664,20 @@ static void free_pfn_range(u64 paddr, unsigned long size) } /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. */ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long prot; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (is_linear_pfn_mapping(vma)) { + if (vma->vm_flags & VM_PAT) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. @@ -694,31 +694,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) } /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - * * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long addr, unsigned long size) { + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; - resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; - if (is_linear_pfn_mapping(vma)) { - /* reserve the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot, 0); + /* reserve the whole chunk starting from paddr */ + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; } if (!pat_enabled) return 0; - /* for vm_insert_pfn and friends, we set prot based on lookup */ - flags = lookup_memtype(pfn << PAGE_SHIFT); + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ + flags = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; + + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -726,22 +754,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long prot; - if (is_linear_pfn_mapping(vma)) { - /* free the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - free_pfn_range(paddr, vma_size); + if (!(vma->vm_flags & VM_PAT)) return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; } + free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd0fb2..415f6c4ced3 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -12,7 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/sched.h> #include <linux/gfp.h> @@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -/* Update 'subtree_max_end' for a node, based on node and its children */ -static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +static u64 compute_subtree_max_end(struct memtype *data) { - struct memtype *data; - u64 max_end, child_max_end; - - if (!node) - return; + u64 max_end = data->end, child_max_end; - data = container_of(node, struct memtype, rb); - max_end = data->end; - - child_max_end = get_subtree_max_end(node->rb_right); + child_max_end = get_subtree_max_end(data->rb.rb_right); if (child_max_end > max_end) max_end = child_max_end; - child_max_end = get_subtree_max_end(node->rb_left); + child_max_end = get_subtree_max_end(data->rb.rb_left); if (child_max_end > max_end) max_end = child_max_end; - data->subtree_max_end = max_end; + return max_end; } +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, + u64, subtree_max_end, compute_subtree_max_end) + /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, u64 start, u64 end) @@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct memtype *data = container_of(*node, struct memtype, rb); parent = *node; + if (data->subtree_max_end < newdata->end) + data->subtree_max_end = newdata->end; if (newdata->start <= data->start) node = &((*node)->rb_left); else if (newdata->start > data->start) node = &((*node)->rb_right); } + newdata->subtree_max_end = newdata->end; rb_link_node(&newdata->rb, parent, node); - rb_insert_color(&newdata->rb, root); - rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); + rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) @@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) struct memtype *rbt_memtype_erase(u64 start, u64 end) { - struct rb_node *deepest; struct memtype *data; data = memtype_rb_exact_match(&memtype_rbroot, start, end); if (!data) goto out; - deepest = rb_augment_erase_begin(&data->rb); - rb_erase(&data->rb, &memtype_rbroot); - rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); + rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); out: return data; } diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 4599c3e8bcb..4ddf497ca65 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -142,23 +142,23 @@ static inline int save_add_info(void) {return 0;} #endif /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; int node, pxm; if (srat_disabled()) - return; + return -1; if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { bad_srat(); - return; + return -1; } if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; + return -1; if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; + return -1; start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; @@ -168,12 +168,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); bad_srat(); - return; + return -1; } if (numa_add_memblk(node, start, end) < 0) { bad_srat(); - return; + return -1; } node_set(node, numa_nodes_parsed); @@ -181,6 +181,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); + return 0; } void __init acpi_numa_arch_fixup(void) {} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 613cd83e8c0..0777f042e40 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -98,6 +98,8 @@ static void flush_tlb_func(void *info) { struct flush_tlb_info *f = info; + inc_irq_stat(irq_tlb_count); + if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; @@ -320,7 +322,7 @@ static ssize_t tlbflush_write_file(struct file *file, if (kstrtos8(buf, 0, &shift)) return -EINVAL; - if (shift > 64) + if (shift < -1 || shift >= BITS_PER_LONG) return -EINVAL; tlb_flushall_shift = shift; |