Diffstat (limited to 'arch/x86/mm')
-rw-r--r--   arch/x86/mm/Makefile            |    2
-rw-r--r--   arch/x86/mm/dump_pagetables.c   |   27
-rw-r--r--   arch/x86/mm/fault.c             |   64
-rw-r--r--   arch/x86/mm/init.c              |   41
-rw-r--r--   arch/x86/mm/init_64.c           |   76
-rw-r--r--   arch/x86/mm/iomap_32.c          |   12
-rw-r--r--   arch/x86/mm/ioremap.c           |   67
-rw-r--r--   arch/x86/mm/mm_internal.h       |    2
-rw-r--r--   arch/x86/mm/mpx.c               |  928
-rw-r--r--   arch/x86/mm/pageattr.c          |  106
-rw-r--r--   arch/x86/mm/pat.c               |  247
-rw-r--r--   arch/x86/mm/pat_internal.h      |   22
-rw-r--r--   arch/x86/mm/pat_rbtree.c        |    8
13 files changed, 1346 insertions, 256 deletions
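
Before the per-file diffs: the common thread of this series is translating between enum page_cache_mode values and the PWT/PCD/PAT cache-attribute bits of a PTE, set up by the new tables in the arch/x86/mm/init.c hunk below. The following stand-alone sketch is illustrative only, not kernel code; all demo_* names are hypothetical. It assumes the architectural 4K-PTE bit positions (PWT = bit 3, PCD = bit 4, PAT = bit 7) and the default table contents shown in the diff.

/*
 * Minimal sketch of the cache-mode <-> PTE-bit translation that the
 * init.c hunk introduces (__cachemode2pte_tbl / __pte2cachemode_tbl).
 * All demo_* identifiers are hypothetical and exist only for this example.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_PWT (1u << 3)	/* page write-through  */
#define DEMO_PCD (1u << 4)	/* page cache disable  */
#define DEMO_PAT (1u << 7)	/* page attribute table (4K PTE position) */

enum demo_cache_mode {
	DEMO_WB, DEMO_WC, DEMO_UC_MINUS, DEMO_UC, DEMO_WT, DEMO_WP, DEMO_NUM
};

/* cache mode -> PTE cache bits, mirroring the default __cachemode2pte_tbl */
static const uint16_t demo_cm2pte[DEMO_NUM] = {
	[DEMO_WB]       = 0,
	[DEMO_WC]       = DEMO_PWT,
	[DEMO_UC_MINUS] = DEMO_PCD,
	[DEMO_UC]       = DEMO_PCD | DEMO_PWT,
	[DEMO_WT]       = DEMO_PCD,	/* no WT support in the default setup */
	[DEMO_WP]       = DEMO_PCD,	/* no WP support in the default setup */
};

/*
 * Collapse PWT/PCD/PAT into a 3-bit index (PWT=bit 0, PCD=bit 1, PAT=bit 2),
 * the same indexing scheme the diff describes for __pte2cachemode_tbl.
 */
static unsigned demo_pte2idx(uint16_t ptebits)
{
	return ((ptebits & DEMO_PWT) ? 1 : 0) |
	       ((ptebits & DEMO_PCD) ? 2 : 0) |
	       ((ptebits & DEMO_PAT) ? 4 : 0);
}

/* PTE cache-bit index -> cache mode, matching the default __pte2cachemode_tbl */
static const char *demo_pte2cm[8] = {
	[0] = "WB", [1] = "WC", [2] = "UC-", [3] = "UC",
	[4] = "WB", [5] = "WC", [6] = "UC-", [7] = "UC",
};

int main(void)
{
	uint16_t bits = demo_cm2pte[DEMO_WC];

	printf("WC encodes as pte bits 0x%x, decodes back to %s\n",
	       (unsigned)bits, demo_pte2cm[demo_pte2idx(bits)]);
	return 0;
}

Built with any C compiler, this prints that WC encodes as the PWT bit and decodes back to WC. In the kernel, the firmware/boot-programmed PAT MSR later rewrites these tables through pat_init_cache_modes() and update_cache_mode_entry(), as the pat.c and init.c hunks below show.
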
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 6a19ad9f370..ecfdc46a024 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -30,3 +30,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_MEMTEST) += memtest.o + +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 95a427e5788..f0cedf3395a 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -76,6 +76,9 @@ static struct addr_marker address_markers[] = { # ifdef CONFIG_X86_ESPFIX64 { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, # endif +# ifdef CONFIG_EFI + { EFI_VA_END, "EFI Runtime Services" }, +# endif { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -126,7 +129,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) if (!pgprot_val(prot)) { /* Not present */ - pt_dump_cont_printf(m, dmsg, " "); + pt_dump_cont_printf(m, dmsg, " "); } else { if (pr & _PAGE_USER) pt_dump_cont_printf(m, dmsg, "USR "); @@ -145,18 +148,16 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) else pt_dump_cont_printf(m, dmsg, " "); - /* Bit 9 has a different meaning on level 3 vs 4 */ - if (level <= 3) { - if (pr & _PAGE_PSE) - pt_dump_cont_printf(m, dmsg, "PSE "); - else - pt_dump_cont_printf(m, dmsg, " "); - } else { - if (pr & _PAGE_PAT) - pt_dump_cont_printf(m, dmsg, "pat "); - else - pt_dump_cont_printf(m, dmsg, " "); - } + /* Bit 7 has a different meaning on level 3 vs 4 */ + if (level <= 3 && pr & _PAGE_PSE) + pt_dump_cont_printf(m, dmsg, "PSE "); + else + pt_dump_cont_printf(m, dmsg, " "); + if ((level == 4 && pr & _PAGE_PAT) || + ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + pt_dump_cont_printf(m, dmsg, "pat "); + else + pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) pt_dump_cont_printf(m, dmsg, "GLB "); else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d973e61e450..38dcec403b4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -844,11 +844,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; int code = BUS_ADRERR; - up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); @@ -879,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { - up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, 0, 0); return; } @@ -887,14 +883,11 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { /* Kernel mode? 
Handle exceptions or die: */ if (!(error_code & PF_USER)) { - up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); return; } - up_read(¤t->mm->mmap_sem); - /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got @@ -1062,7 +1055,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - int fault; + int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; @@ -1237,47 +1230,50 @@ good_area: * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); + major |= fault & VM_FAULT_MAJOR; /* - * If we need to retry but a fatal signal is pending, handle the - * signal first. We do not need to release the mmap_sem because it - * would already be released in __lock_page_or_retry in mm/filemap.c. + * If we need to retry the mmap_sem has already been released, + * and if there is a fatal signal pending there is no guarantee + * that we made any progress. Handle this case first. */ - if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + if (unlikely(fault & VM_FAULT_RETRY)) { + /* Retry at most once */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + if (!fatal_signal_pending(tsk)) + goto retry; + } + + /* User mode? Just return to handle the fatal exception */ + if (flags & FAULT_FLAG_USER) + return; + + /* Not returning to user mode? Handle exceptions or die: */ + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; + } + up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. + * Major/minor page fault accounting. If any of the events + * returned VM_FAULT_MAJOR, we account it as a major fault. */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (major) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } check_v8086_mode(regs, address, tsk); - - up_read(&mm->mmap_sem); } NOKPROBE_SYMBOL(__do_page_fault); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 66dba36f234..a97ee080147 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -27,6 +27,35 @@ #include "mm_internal.h" +/* + * Tables translating between page_cache_type_t and pte encoding. + * Minimal supported modes are defined statically, modified if more supported + * cache modes are available. + * Index into __cachemode2pte_tbl is the cachemode. + * Index into __pte2cachemode_tbl are the caching attribute bits of the pte + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 
+ */ +uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { + [_PAGE_CACHE_MODE_WB] = 0, + [_PAGE_CACHE_MODE_WC] = _PAGE_PWT, + [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD, + [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT, + [_PAGE_CACHE_MODE_WT] = _PAGE_PCD, + [_PAGE_CACHE_MODE_WP] = _PAGE_PCD, +}; +EXPORT_SYMBOL_GPL(__cachemode2pte_tbl); +uint8_t __pte2cachemode_tbl[8] = { + [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC, + [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, +}; +EXPORT_SYMBOL_GPL(__pte2cachemode_tbl); + static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; @@ -674,10 +703,10 @@ void __init zone_sizes_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; + max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM @@ -687,3 +716,11 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) +{ + /* entry 0 MUST be WB (hardwired to speed up translations) */ + BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); + + __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); + __pte2cachemode_tbl[entry] = cache; +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4cb8763868f..30eb05ae706 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,7 +52,6 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> -#include <asm/uv/uv.h> #include <asm/setup.h> #include "mm_internal.h" @@ -338,12 +337,15 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) * Create large page table mappings for a range of physical addresses. 
*/ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, - pgprot_t prot) + enum page_cache_mode cache) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; + pgprot_t prot; + pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | + pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); @@ -366,12 +368,12 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB); } void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) { - __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC); } /* @@ -1123,7 +1125,7 @@ void mark_rodata_ro(void) unsigned long end = (unsigned long) &__end_rodata_hpage_align; unsigned long text_end = PFN_ALIGN(&__stop___ex_table); unsigned long rodata_end = PFN_ALIGN(&__end_rodata); - unsigned long all_end = PFN_ALIGN(&_end); + unsigned long all_end; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -1134,7 +1136,16 @@ void mark_rodata_ro(void) /* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -1193,66 +1204,15 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. 
- */ -int in_gate_area_no_mm(unsigned long addr) -{ - return (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - static unsigned long probe_memory_block_size(void) { /* start from 2g */ unsigned long bz = 1UL<<31; -#ifdef CONFIG_X86_UV - if (is_uv_system()) { - printk(KERN_INFO "UV: memory block size 2GB\n"); + if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { + pr_info("Using 2GB memory block size for large-memory system\n"); return 2UL * 1024 * 1024 * 1024; } -#endif /* less than 64g installed */ if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 7b179b499fa..9ca35fc60cf 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -33,17 +33,17 @@ static int is_io_mapping_possible(resource_size_t base, unsigned long size) int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) { - unsigned long flag = _PAGE_CACHE_WC; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC; int ret; if (!is_io_mapping_possible(base, size)) return -EINVAL; - ret = io_reserve_memtype(base, base + size, &flag); + ret = io_reserve_memtype(base, base + size, &pcm); if (ret) return ret; - *prot = __pgprot(__PAGE_KERNEL | flag); + *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm)); return 0; } EXPORT_SYMBOL_GPL(iomap_create_wc); @@ -82,8 +82,10 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) * MTRR is UC or WC. UC_MINUS gets the real intention, of the * user, which is "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) - prot = PAGE_KERNEL_UC_MINUS; + if (!pat_enabled && pgprot_val(prot) == + (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) + prot = __pgprot(__PAGE_KERNEL | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index af78e50ca6c..fdf617c00e2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -29,20 +29,20 @@ * conflicts. */ int ioremap_change_attr(unsigned long vaddr, unsigned long size, - unsigned long prot_val) + enum page_cache_mode pcm) { unsigned long nrpages = size >> PAGE_SHIFT; int err; - switch (prot_val) { - case _PAGE_CACHE_UC: + switch (pcm) { + case _PAGE_CACHE_MODE_UC: default: err = _set_memory_uc(vaddr, nrpages); break; - case _PAGE_CACHE_WC: + case _PAGE_CACHE_MODE_WC: err = _set_memory_wc(vaddr, nrpages); break; - case _PAGE_CACHE_WB: + case _PAGE_CACHE_MODE_WB: err = _set_memory_wb(vaddr, nrpages); break; } @@ -75,14 +75,14 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, * caller shouldn't need to know that small detail. 
*/ static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, unsigned long prot_val, void *caller) + unsigned long size, enum page_cache_mode pcm, void *caller) { unsigned long offset, vaddr; resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; - unsigned long new_prot_val; + enum page_cache_mode new_pcm; pgprot_t prot; int retval; void __iomem *ret_addr; @@ -134,38 +134,40 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, - prot_val, &new_prot_val); + pcm, &new_pcm); if (retval) { printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } - if (prot_val != new_prot_val) { - if (!is_new_memtype_allowed(phys_addr, size, - prot_val, new_prot_val)) { + if (pcm != new_pcm) { + if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) { printk(KERN_ERR - "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), - prot_val, new_prot_val); + pcm, new_pcm); goto err_free_memtype; } - prot_val = new_prot_val; + pcm = new_pcm; } - switch (prot_val) { - case _PAGE_CACHE_UC: + prot = PAGE_KERNEL_IO; + switch (pcm) { + case _PAGE_CACHE_MODE_UC: default: - prot = PAGE_KERNEL_IO_NOCACHE; + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC)); break; - case _PAGE_CACHE_UC_MINUS: - prot = PAGE_KERNEL_IO_UC_MINUS; + case _PAGE_CACHE_MODE_UC_MINUS: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); break; - case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_IO_WC; + case _PAGE_CACHE_MODE_WC: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); break; - case _PAGE_CACHE_WB: - prot = PAGE_KERNEL_IO; + case _PAGE_CACHE_MODE_WB: break; } @@ -178,7 +180,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (kernel_map_sync_memtype(phys_addr, size, prot_val)) + if (kernel_map_sync_memtype(phys_addr, size, pcm)) goto err_free_area; if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) @@ -227,14 +229,14 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; + * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use * UC MINUS. 
*/ - unsigned long val = _PAGE_CACHE_UC_MINUS; + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; - return __ioremap_caller(phys_addr, size, val, + return __ioremap_caller(phys_addr, size, pcm, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_nocache); @@ -252,7 +254,7 @@ EXPORT_SYMBOL(ioremap_nocache); void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); else return ioremap_nocache(phys_addr, size); @@ -261,7 +263,7 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); @@ -269,7 +271,8 @@ EXPORT_SYMBOL(ioremap_cache); void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, unsigned long prot_val) { - return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + return __ioremap_caller(phys_addr, size, + pgprot2cachemode(__pgprot(prot_val)), __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_prot); @@ -327,7 +330,7 @@ EXPORT_SYMBOL(iounmap); * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access */ -void *xlate_dev_mem_ptr(unsigned long phys) +void *xlate_dev_mem_ptr(phys_addr_t phys) { void *addr; unsigned long start = phys & PAGE_MASK; @@ -343,7 +346,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) return addr; } -void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { if (page_is_ram(phys >> PAGE_SHIFT)) return; diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 6b563a11889..62474ba66c8 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -16,4 +16,6 @@ void zone_sizes_init(void); extern int after_bootmem; +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c new file mode 100644 index 00000000000..67ebf575122 --- /dev/null +++ b/arch/x86/mm/mpx.c @@ -0,0 +1,928 @@ +/* + * mpx.c - Memory Protection eXtensions + * + * Copyright (c) 2014, Intel Corporation. + * Qiaowei Ren <qiaowei.ren@intel.com> + * Dave Hansen <dave.hansen@intel.com> + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/sched/sysctl.h> + +#include <asm/i387.h> +#include <asm/insn.h> +#include <asm/mman.h> +#include <asm/mmu_context.h> +#include <asm/mpx.h> +#include <asm/processor.h> +#include <asm/fpu-internal.h> + +static const char *mpx_mapping_name(struct vm_area_struct *vma) +{ + return "[mpx]"; +} + +static struct vm_operations_struct mpx_vma_ops = { + .name = mpx_mapping_name, +}; + +static int is_mpx_vma(struct vm_area_struct *vma) +{ + return (vma->vm_ops == &mpx_vma_ops); +} + +/* + * This is really a simplified "vm_mmap". it only handles MPX + * bounds tables (the bounds directory is user-allocated). + * + * Later on, we use the vma->vm_ops to uniquely identify these + * VMAs. 
+ */ +static unsigned long mpx_mmap(unsigned long len) +{ + unsigned long ret; + unsigned long addr, pgoff; + struct mm_struct *mm = current->mm; + vm_flags_t vm_flags; + struct vm_area_struct *vma; + + /* Only bounds table and bounds directory can be allocated here */ + if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES) + return -EINVAL; + + down_write(&mm->mmap_sem); + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) { + ret = -ENOMEM; + goto out; + } + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); + if (addr & ~PAGE_MASK) { + ret = addr; + goto out; + } + + vm_flags = VM_READ | VM_WRITE | VM_MPX | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* Set pgoff according to addr for anon_vma */ + pgoff = addr >> PAGE_SHIFT; + + ret = mmap_region(NULL, addr, len, vm_flags, pgoff); + if (IS_ERR_VALUE(ret)) + goto out; + + vma = find_vma(mm, ret); + if (!vma) { + ret = -ENOMEM; + goto out; + } + vma->vm_ops = &mpx_vma_ops; + + if (vm_flags & VM_LOCKED) { + up_write(&mm->mmap_sem); + mm_populate(ret, len); + return ret; + } + +out: + up_write(&mm->mmap_sem); + return ret; +} + +enum reg_type { + REG_TYPE_RM = 0, + REG_TYPE_INDEX, + REG_TYPE_BASE, +}; + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = 0; + + static const int regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif + }; + int nr_registers = ARRAY_SIZE(regoff); + /* + * Don't possibly decode a 32-bit instructions as + * reading a 64-bit-only register. 
+ */ + if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) + nr_registers -= 8; + + switch (type) { + case REG_TYPE_RM: + regno = X86_MODRM_RM(insn->modrm.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_INDEX: + regno = X86_SIB_INDEX(insn->sib.value); + if (X86_REX_X(insn->rex_prefix.value) == 1) + regno += 8; + break; + + case REG_TYPE_BASE: + regno = X86_SIB_BASE(insn->sib.value); + if (X86_REX_B(insn->rex_prefix.value) == 1) + regno += 8; + break; + + default: + pr_err("invalid register type"); + BUG(); + break; + } + + if (regno > nr_registers) { + WARN_ONCE(1, "decoded an instruction with an invalid register"); + return -EINVAL; + } + return regoff[regno]; +} + +/* + * return the address being referenced be instruction + * for rm=3 returning the content of the rm reg + * for rm!=3 calculates the address using SIB and Disp + */ +static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + unsigned long addr, base, indx; + int addr_offset, base_offset, indx_offset; + insn_byte_t sib; + + insn_get_modrm(insn); + insn_get_sib(insn); + sib = insn->sib.value; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } else { + if (insn->sib.nbytes) { + base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + if (base_offset < 0) + goto out_err; + + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + if (indx_offset < 0) + goto out_err; + + base = regs_get_register(regs, base_offset); + indx = regs_get_register(regs, indx_offset); + addr = base + indx * (1 << X86_SIB_SCALE(sib)); + } else { + addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); + if (addr_offset < 0) + goto out_err; + addr = regs_get_register(regs, addr_offset); + } + addr += insn->displacement.value; + } + return (void __user *)addr; +out_err: + return (void __user *)-1; +} + +static int mpx_insn_decode(struct insn *insn, + struct pt_regs *regs) +{ + unsigned char buf[MAX_INSN_SIZE]; + int x86_64 = !test_thread_flag(TIF_IA32); + int not_copied; + int nr_copied; + + not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + /* + * The decoder _should_ fail nicely if we pass it a short buffer. + * But, let's not depend on that implementation detail. If we + * did not get anything, just error out now. + */ + if (!nr_copied) + return -EFAULT; + insn_init(insn, buf, nr_copied, x86_64); + insn_get_length(insn); + /* + * copy_from_user() tries to get as many bytes as we could see in + * the largest possible instruction. If the instruction we are + * after is shorter than that _and_ we attempt to copy from + * something unreadable, we might get a short read. This is OK + * as long as the read did not stop in the middle of the + * instruction. Check to see if we got a partial instruction. + */ + if (nr_copied < insn->length) + return -EFAULT; + + insn_get_opcode(insn); + /* + * We only _really_ need to decode bndcl/bndcn/bndcu + * Error out on anything else. + */ + if (insn->opcode.bytes[0] != 0x0f) + goto bad_opcode; + if ((insn->opcode.bytes[1] != 0x1a) && + (insn->opcode.bytes[1] != 0x1b)) + goto bad_opcode; + + return 0; +bad_opcode: + return -EINVAL; +} + +/* + * If a bounds overflow occurs then a #BR is generated. This + * function decodes MPX instructions to get violation address + * and set this address into extended struct siginfo. 
+ * + * Note that this is not a super precise way of doing this. + * Userspace could have, by the time we get here, written + * anything it wants in to the instructions. We can not + * trust anything about it. They might not be valid + * instructions or might encode invalid registers, etc... + * + * The caller is expected to kfree() the returned siginfo_t. + */ +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + struct bndreg *bndregs, *bndreg; + siginfo_t *info = NULL; + struct insn insn; + uint8_t bndregno; + int err; + + err = mpx_insn_decode(&insn, regs); + if (err) + goto err_out; + + /* + * We know at this point that we are only dealing with + * MPX instructions. + */ + insn_get_modrm(&insn); + bndregno = X86_MODRM_REG(insn.modrm.value); + if (bndregno > 3) { + err = -EINVAL; + goto err_out; + } + /* get the bndregs _area_ of the xsave structure */ + bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); + if (!bndregs) { + err = -EINVAL; + goto err_out; + } + /* now go select the individual register in the set of 4 */ + bndreg = &bndregs[bndregno]; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto err_out; + } + /* + * The registers are always 64-bit, but the upper 32 + * bits are ignored in 32-bit mode. Also, note that the + * upper bounds are architecturally represented in 1's + * complement form. + * + * The 'unsigned long' cast is because the compiler + * complains when casting from integers to different-size + * pointers. + */ + info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; + info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; + info->si_addr_lsb = 0; + info->si_signo = SIGSEGV; + info->si_errno = 0; + info->si_code = SEGV_BNDERR; + info->si_addr = mpx_get_addr_ref(&insn, regs); + /* + * We were not able to extract an address from the instruction, + * probably because there was something invalid in it. + */ + if (info->si_addr == (void *)-1) { + err = -EINVAL; + goto err_out; + } + return info; +err_out: + /* info might be NULL, but kfree() handles that */ + kfree(info); + return ERR_PTR(err); +} + +static __user void *task_get_bounds_dir(struct task_struct *tsk) +{ + struct bndcsr *bndcsr; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * The bounds directory pointer is stored in a register + * only accessible if we first do an xsave. + */ + fpu_save_init(&tsk->thread.fpu); + bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + if (!bndcsr) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Make sure the register looks valid by checking the + * enable bit. + */ + if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Lastly, mask off the low bits used for configuration + * flags, and return the address of the bounds table. + */ + return (void __user *)(unsigned long) + (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); +} + +int mpx_enable_management(struct task_struct *tsk) +{ + void __user *bd_base = MPX_INVALID_BOUNDS_DIR; + struct mm_struct *mm = tsk->mm; + int ret = 0; + + /* + * runtime in the userspace will be responsible for allocation of + * the bounds directory. Then, it will save the base of the bounds + * directory into XSAVE/XRSTOR Save Area and enable MPX through + * XRSTOR instruction. + * + * fpu_xsave() is expected to be very expensive. 
Storing the bounds + * directory here means that we do not have to do xsave in the unmap + * path; we can just use mm->bd_addr instead. + */ + bd_base = task_get_bounds_dir(tsk); + down_write(&mm->mmap_sem); + mm->bd_addr = bd_base; + if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + ret = -ENXIO; + + up_write(&mm->mmap_sem); + return ret; +} + +int mpx_disable_management(struct task_struct *tsk) +{ + struct mm_struct *mm = current->mm; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return -ENXIO; + + down_write(&mm->mmap_sem); + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + up_write(&mm->mmap_sem); + return 0; +} + +/* + * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, + * and the size of each bounds table is 4MB. + */ +static int allocate_bt(long __user *bd_entry) +{ + unsigned long expected_old_val = 0; + unsigned long actual_old_val = 0; + unsigned long bt_addr; + int ret = 0; + + /* + * Carve the virtual space out of userspace for the new + * bounds table: + */ + bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); + if (IS_ERR((void *)bt_addr)) + return PTR_ERR((void *)bt_addr); + /* + * Set the valid flag (kinda like _PAGE_PRESENT in a pte) + */ + bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + + /* + * Go poke the address of the new bounds table in to the + * bounds directory entry out in userspace memory. Note: + * we may race with another CPU instantiating the same table. + * In that case the cmpxchg will see an unexpected + * 'actual_old_val'. + * + * This can fault, but that's OK because we do not hold + * mmap_sem at this point, unlike some of the other part + * of the MPX code that have to pagefault_disable(). + */ + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, bt_addr); + if (ret) + goto out_unmap; + + /* + * The user_atomic_cmpxchg_inatomic() will only return nonzero + * for faults, *not* if the cmpxchg itself fails. Now we must + * verify that the cmpxchg itself completed successfully. + */ + /* + * We expected an empty 'expected_old_val', but instead found + * an apparently valid entry. Assume we raced with another + * thread to instantiate this table and desclare succecss. + */ + if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { + ret = 0; + goto out_unmap; + } + /* + * We found a non-empty bd_entry but it did not have the + * VALID_FLAG set. Return an error which will result in + * a SEGV since this probably means that somebody scribbled + * some invalid data in to a bounds table. + */ + if (expected_old_val != actual_old_val) { + ret = -EINVAL; + goto out_unmap; + } + return 0; +out_unmap: + vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); + return ret; +} + +/* + * When a BNDSTX instruction attempts to save bounds to a bounds + * table, it will first attempt to look up the table in the + * first-level bounds directory. If it does not find a table in + * the directory, a #BR is generated and we get here in order to + * allocate a new table. + * + * With 32-bit mode, the size of BD is 4MB, and the size of each + * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, + * and the size of each bound table is 4MB. 
+ */ +static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +{ + unsigned long bd_entry, bd_base; + struct bndcsr *bndcsr; + + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + return -EINVAL; + /* + * Mask off the preserve and enable bits + */ + bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; + /* + * The hardware provides the address of the missing or invalid + * entry via BNDSTATUS, so we don't have to go look it up. + */ + bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; + /* + * Make sure the directory entry is within where we think + * the directory is. + */ + if ((bd_entry < bd_base) || + (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) + return -EINVAL; + + return allocate_bt((long __user *)bd_entry); +} + +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + /* + * Userspace never asked us to manage the bounds tables, + * so refuse to help. + */ + if (!kernel_managing_mpx_tables(current->mm)) + return -EINVAL; + + if (do_mpx_bt_fault(xsave_buf)) { + force_sig(SIGSEGV, current); + /* + * The force_sig() is essentially "handling" this + * exception, so we do not pass up the error + * from do_mpx_bt_fault(). + */ + } + return 0; +} + +/* + * A thin wrapper around get_user_pages(). Returns 0 if the + * fault was resolved or -errno if not. + */ +static int mpx_resolve_fault(long __user *addr, int write) +{ + long gup_ret; + int nr_pages = 1; + int force = 0; + + gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, + nr_pages, write, force, NULL, NULL); + /* + * get_user_pages() returns number of pages gotten. + * 0 means we failed to fault in and get anything, + * probably because 'addr' is bad. + */ + if (!gup_ret) + return -EFAULT; + /* Other error, return it */ + if (gup_ret < 0) + return gup_ret; + /* must have gup'd a page and gup_ret>0, success */ + return 0; +} + +/* + * Get the base of bounds tables pointed by specific bounds + * directory entry. + */ +static int get_bt_addr(struct mm_struct *mm, + long __user *bd_entry, unsigned long *bt_addr) +{ + int ret; + int valid_bit; + + if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry))) + return -EFAULT; + + while (1) { + int need_write = 0; + + pagefault_disable(); + ret = get_user(*bt_addr, bd_entry); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + + valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG; + *bt_addr &= MPX_BT_ADDR_MASK; + + /* + * When the kernel is managing bounds tables, a bounds directory + * entry will either have a valid address (plus the valid bit) + * *OR* be completely empty. If we see a !valid entry *and* some + * data in the address field, we know something is wrong. This + * -EINVAL return will cause a SIGSEGV. + */ + if (!valid_bit && *bt_addr) + return -EINVAL; + /* + * Do we have an completely zeroed bt entry? That is OK. It + * just means there was no bounds table for this memory. Make + * sure to distinguish this from -EINVAL, which will cause + * a SEGV. + */ + if (!valid_bit) + return -ENOENT; + + return 0; +} + +/* + * Free the backing physical pages of bounds table 'bt_addr'. + * Assume start...end is within that bounds table. + */ +static int zap_bt_entries(struct mm_struct *mm, + unsigned long bt_addr, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + unsigned long addr, len; + + /* + * Find the first overlapping vma. 
If vma->vm_start > start, there + * will be a hole in the bounds table. This -EINVAL return will + * cause a SIGSEGV. + */ + vma = find_vma(mm, start); + if (!vma || vma->vm_start > start) + return -EINVAL; + + /* + * A NUMA policy on a VM_MPX VMA could cause this bouds table to + * be split. So we need to look across the entire 'start -> end' + * range of this bounds table, find all of the VM_MPX VMAs, and + * zap only those. + */ + addr = start; + while (vma && vma->vm_start < end) { + /* + * We followed a bounds directory entry down + * here. If we find a non-MPX VMA, that's bad, + * so stop immediately and return an error. This + * probably results in a SIGSEGV. + */ + if (!is_mpx_vma(vma)) + return -EINVAL; + + len = min(vma->vm_end, end) - addr; + zap_page_range(vma, addr, len, NULL); + + vma = vma->vm_next; + addr = vma->vm_start; + } + + return 0; +} + +static int unmap_single_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long bt_addr) +{ + unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + unsigned long actual_old_val = 0; + int ret; + + while (1) { + int need_write = 1; + + pagefault_disable(); + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, 0); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + /* + * The cmpxchg was performed, check the results. + */ + if (actual_old_val != expected_old_val) { + /* + * Someone else raced with us to unmap the table. + * There was no bounds table pointed to by the + * directory, so declare success. Somebody freed + * it. + */ + if (!actual_old_val) + return 0; + /* + * Something messed with the bounds directory + * entry. We hold mmap_sem for read or write + * here, so it could not be a _new_ bounds table + * that someone just allocated. Something is + * wrong, so pass up the error and SIGSEGV. + */ + return -EINVAL; + } + + /* + * Note, we are likely being called under do_munmap() already. To + * avoid recursion, do_munmap() will check whether it comes + * from one bounds table through VM_MPX flag. + */ + return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES); +} + +/* + * If the bounds table pointed by bounds directory 'bd_entry' is + * not shared, unmap this whole bounds table. Otherwise, only free + * those backing physical pages of bounds table entries covered + * in this virtual address region start...end. + */ +static int unmap_shared_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long start, + unsigned long end, bool prev_shared, bool next_shared) +{ + unsigned long bt_addr; + int ret; + + ret = get_bt_addr(mm, bd_entry, &bt_addr); + /* + * We could see an "error" ret for not-present bounds + * tables (not really an error), or actual errors, but + * stop unmapping either way. + */ + if (ret) + return ret; + + if (prev_shared && next_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else if (prev_shared) + ret = zap_bt_entries(mm, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), + bt_addr+MPX_BT_SIZE_BYTES); + else if (next_shared) + ret = zap_bt_entries(mm, bt_addr, bt_addr, + bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); + else + ret = unmap_single_bt(mm, bd_entry, bt_addr); + + return ret; +} + +/* + * A virtual address region being munmap()ed might share bounds table + * with adjacent VMAs. 
We only need to free the backing physical + * memory of these shared bounds tables entries covered in this virtual + * address region. + */ +static int unmap_edge_bts(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bde_start, *bde_end; + struct vm_area_struct *prev, *next; + bool prev_shared = false, next_shared = false; + + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + + /* + * Check whether bde_start and bde_end are shared with adjacent + * VMAs. + * + * We already unliked the VMAs from the mm's rbtree so 'start' + * is guaranteed to be in a hole. This gets us the first VMA + * before the hole in to 'prev' and the next VMA after the hole + * in to 'next'. + */ + next = find_vma_prev(mm, start, &prev); + if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1)) + == bde_start) + prev_shared = true; + if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start)) + == bde_end) + next_shared = true; + + /* + * This virtual address region being munmap()ed is only + * covered by one bounds table. + * + * In this case, if this table is also shared with adjacent + * VMAs, only part of the backing physical memory of the bounds + * table need be freeed. Otherwise the whole bounds table need + * be unmapped. + */ + if (bde_start == bde_end) { + return unmap_shared_bt(mm, bde_start, start, end, + prev_shared, next_shared); + } + + /* + * If more than one bounds tables are covered in this virtual + * address region being munmap()ed, we need to separately check + * whether bde_start and bde_end are shared with adjacent VMAs. + */ + ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false); + if (ret) + return ret; + ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared); + if (ret) + return ret; + + return 0; +} + +static int mpx_unmap_tables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + int ret; + long __user *bd_entry, *bde_start, *bde_end; + unsigned long bt_addr; + + /* + * "Edge" bounds tables are those which are being used by the region + * (start -> end), but that may be shared with adjacent areas. If they + * turn out to be completely unshared, they will be freed. If they are + * shared, we will free the backing store (like an MADV_DONTNEED) for + * areas used by this region. + */ + ret = unmap_edge_bts(mm, start, end); + switch (ret) { + /* non-present tables are OK */ + case 0: + case -ENOENT: + /* Success, or no tables to unmap */ + break; + case -EINVAL: + case -EFAULT: + default: + return ret; + } + + /* + * Only unmap the bounds table that are + * 1. fully covered + * 2. not at the edges of the mapping, even if full aligned + */ + bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); + bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); + for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { + ret = get_bt_addr(mm, bd_entry, &bt_addr); + switch (ret) { + case 0: + break; + case -ENOENT: + /* No table here, try the next one */ + continue; + case -EINVAL: + case -EFAULT: + default: + /* + * Note: we are being strict here. + * Any time we run in to an issue + * unmapping tables, we stop and + * SIGSEGV. + */ + return ret; + } + + ret = unmap_single_bt(mm, bd_entry, bt_addr); + if (ret) + return ret; + } + + return 0; +} + +/* + * Free unused bounds tables covered in a virtual address region being + * munmap()ed. Assume end > start. 
+ * + * This function will be called by do_munmap(), and the VMAs covering + * the virtual address region start...end have already been split if + * necessary, and the 'vma' is the first vma in this range (start -> end). + */ +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int ret; + + /* + * Refuse to do anything unless userspace has asked + * the kernel to help manage the bounds tables, + */ + if (!kernel_managing_mpx_tables(current->mm)) + return; + /* + * This will look across the entire 'start -> end' range, + * and find all of the non-VM_MPX VMAs. + * + * To avoid recursion, if a VM_MPX vma is found in the range + * (start->end), we will not continue follow-up work. This + * recursion represents having bounds tables for bounds tables, + * which should not occur normally. Being strict about it here + * helps ensure that we do not have an exploitable stack overflow. + */ + do { + if (vma->vm_flags & VM_MPX) + return; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + + ret = mpx_unmap_tables(mm, start, end); + if (ret) + force_sig(SIGSEGV, current); +} diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 36de293caf2..536ea2fb6e3 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -384,6 +384,26 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, } /* + * Lookup the PMD entry for a virtual address. Return a pointer to the entry + * or NULL if not present. + */ +pmd_t *lookup_pmd_address(unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(address); + if (pgd_none(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + return NULL; + + return pmd_offset(pud, address); +} + +/* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can @@ -485,14 +505,23 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, /* * We are safe now. Check whether the new pgprot is the same: + * Convert protection attributes to 4k-format, as cpa->mask* are set + * up accordingly. */ old_pte = *kpte; - old_prot = req_prot = pte_pgprot(old_pte); + old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* + * req_prot is in format of 4k pages. It must be converted to large + * page format: the caching mode includes the PAT bit located at + * different bit positions in the two formats. + */ + req_prot = pgprot_4k_2_large(req_prot); + + /* * Set the PSE and GLOBAL flags only if the PRESENT flag is * set otherwise pmd_present/pmd_huge will return true even on * a non present pmd. 
The canon_pgprot will clear _PAGE_GLOBAL @@ -585,13 +614,10 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* - * If we ever want to utilize the PAT bit, we need to - * update this function to make sure it's converted from - * bit 12 to bit 7 when we cross from the 2MB level to - * the 4K level: - */ - WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); + + /* promote PAT bit to correct position */ + if (level == PG_LEVEL_2M) + ref_prot = pgprot_large_2_4k(ref_prot); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { @@ -879,6 +905,7 @@ static int populate_pmd(struct cpa_data *cpa, { unsigned int cur_pages = 0; pmd_t *pmd; + pgprot_t pmd_pgprot; /* * Not on a 2M boundary? @@ -910,6 +937,8 @@ static int populate_pmd(struct cpa_data *cpa, if (num_pages == cur_pages) return cur_pages; + pmd_pgprot = pgprot_4k_2_large(pgprot); + while (end - start >= PMD_SIZE) { /* @@ -921,7 +950,8 @@ static int populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | + massage_pgprot(pmd_pgprot))); start += PMD_SIZE; cpa->pfn += PMD_SIZE; @@ -949,6 +979,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, pud_t *pud; unsigned long end; int cur_pages = 0; + pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -986,12 +1017,14 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, return cur_pages; pud = pud_offset(pgd, start); + pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | + massage_pgprot(pud_pgprot))); start += PUD_SIZE; cpa->pfn += PUD_SIZE; @@ -1304,12 +1337,6 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) return 0; } -static inline int cache_attr(pgprot_t attr) -{ - return pgprot_val(attr) & - (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); -} - static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, @@ -1390,7 +1417,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * No need to flush, when we did not set any of the caching * attributes: */ - cache = cache_attr(mask_set); + cache = !!pgprot2cachemode(mask_set); /* * On success we use CLFLUSH, when the CPU supports it to @@ -1445,7 +1472,8 @@ int _set_memory_uc(unsigned long addr, int numpages) * for now UC MINUS. see comments in ioremap_nocache() */ return change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), 0); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); } int set_memory_uc(unsigned long addr, int numpages) @@ -1456,7 +1484,7 @@ int set_memory_uc(unsigned long addr, int numpages) * for now UC MINUS. 
see comments in ioremap_nocache() */ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL); + _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; @@ -1474,7 +1502,7 @@ out_err: EXPORT_SYMBOL(set_memory_uc); static int _set_memory_array(unsigned long *addr, int addrinarray, - unsigned long new_type) + enum page_cache_mode new_type) { int i, j; int ret; @@ -1490,11 +1518,13 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, } ret = change_page_attr_set(addr, addrinarray, - __pgprot(_PAGE_CACHE_UC_MINUS), 1); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 1); - if (!ret && new_type == _PAGE_CACHE_WC) + if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(addr, addrinarray, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_ARRAY, NULL); if (ret) @@ -1511,13 +1541,13 @@ out_free: int set_memory_array_uc(unsigned long *addr, int addrinarray) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS); + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_memory_array_uc); int set_memory_array_wc(unsigned long *addr, int addrinarray) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC); + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_memory_array_wc); @@ -1527,10 +1557,12 @@ int _set_memory_wc(unsigned long addr, int numpages) unsigned long addr_copy = addr; ret = change_page_attr_set(&addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), 0); + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); if (!ret) { ret = change_page_attr_set_clr(&addr_copy, numpages, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } @@ -1545,7 +1577,7 @@ int set_memory_wc(unsigned long addr, int numpages) return set_memory_uc(addr, numpages); ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_WC, NULL); + _PAGE_CACHE_MODE_WC, NULL); if (ret) goto out_err; @@ -1564,6 +1596,7 @@ EXPORT_SYMBOL(set_memory_wc); int _set_memory_wb(unsigned long addr, int numpages) { + /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } @@ -1586,6 +1619,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray) int i; int ret; + /* WB cache mode is hard wired to all cache attribute bits being 0 */ ret = change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); if (ret) @@ -1648,7 +1682,7 @@ int set_pages_uc(struct page *page, int numpages) EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int addrinarray, - unsigned long new_type) + enum page_cache_mode new_type) { unsigned long start; unsigned long end; @@ -1666,10 +1700,11 @@ static int _set_pages_array(struct page **pages, int addrinarray, } ret = cpa_set_pages_array(pages, addrinarray, - __pgprot(_PAGE_CACHE_UC_MINUS)); - if (!ret && new_type == _PAGE_CACHE_WC) + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); + if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, addrinarray, - __pgprot(_PAGE_CACHE_WC), + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) @@ -1689,13 +1724,13 @@ err_out: int set_pages_array_uc(struct page **pages, int addrinarray) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS); + 
return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int addrinarray) { - return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC); + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); @@ -1714,6 +1749,7 @@ int set_pages_array_wb(struct page **pages, int addrinarray) unsigned long end; int i; + /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, addrinarray, __pgprot(_PAGE_CACHE_MASK)); if (retval) @@ -1801,7 +1837,7 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } -void kernel_map_pages(struct page *page, int numpages, int enable) +void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 657438858e8..edf299c8ff6 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -31,6 +31,7 @@ #include <asm/io.h> #include "pat_internal.h" +#include "mm_internal.h" #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; @@ -66,6 +67,75 @@ __setup("debugpat", pat_debug_setup); static u64 __read_mostly boot_pat_state; +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and + * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_MODE_UC here. + */ + +#define _PGMT_DEFAULT 0 +#define _PGMT_WC (1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_uncached) +#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_CLEAR_MASK (~_PGMT_MASK) + +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + unsigned long pg_flags = pg->flags & _PGMT_MASK; + + if (pg_flags == _PGMT_DEFAULT) + return -1; + else if (pg_flags == _PGMT_WC) + return _PAGE_CACHE_MODE_WC; + else if (pg_flags == _PGMT_UC_MINUS) + return _PAGE_CACHE_MODE_UC_MINUS; + else + return _PAGE_CACHE_MODE_WB; +} + +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ + unsigned long memtype_flags; + unsigned long old_flags; + unsigned long new_flags; + + switch (memtype) { + case _PAGE_CACHE_MODE_WC: + memtype_flags = _PGMT_WC; + break; + case _PAGE_CACHE_MODE_UC_MINUS: + memtype_flags = _PGMT_UC_MINUS; + break; + case _PAGE_CACHE_MODE_WB: + memtype_flags = _PGMT_WB; + break; + default: + memtype_flags = _PGMT_DEFAULT; + break; + } + + do { + old_flags = pg->flags; + new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; + } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); +} +#else +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + return -1; +} +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ +} +#endif + enum { PAT_UC = 0, /* uncached */ PAT_WC = 1, /* Write combining */ @@ -75,6 +145,52 @@ enum { PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ }; +#define CM(c) (_PAGE_CACHE_MODE_ ## c) + +static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) +{ + enum page_cache_mode cache; + char *cache_mode; + + switch (pat_val) { + case 
@@ -75,6 +145,52 @@ enum {
 	PAT_UC_MINUS = 7,	/* UC, but can be overriden by MTRR */
 };
 
+#define CM(c) (_PAGE_CACHE_MODE_ ## c)
+
+static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
+{
+	enum page_cache_mode cache;
+	char *cache_mode;
+
+	switch (pat_val) {
+	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
+	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
+	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
+	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
+	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
+	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+	default:           cache = CM(WB);       cache_mode = "WB  "; break;
+	}
+
+	memcpy(msg, cache_mode, 4);
+
+	return cache;
+}
+
+#undef CM
+
+/*
+ * Update the cache mode to pgprot translation tables according to PAT
+ * configuration.
+ * Using lower indices is preferred, so we start with highest index.
+ */
+void pat_init_cache_modes(void)
+{
+	int i;
+	enum page_cache_mode cache;
+	char pat_msg[33];
+	u64 pat;
+
+	rdmsrl(MSR_IA32_CR_PAT, pat);
+	pat_msg[32] = 0;
+	for (i = 7; i >= 0; i--) {
+		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
+					   pat_msg + 4 * i);
+		update_cache_mode_entry(i, cache);
+	}
+	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+}
+
 #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
 
 void pat_init(void)
@@ -124,8 +240,7 @@ void pat_init(void)
 	wrmsrl(MSR_IA32_CR_PAT, pat);
 
 	if (boot_cpu)
-		printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
-			smp_processor_id(), boot_pat_state, pat);
+		pat_init_cache_modes();
 }
 
 #undef PAT
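pat_init_cache_modes() above reads the PAT MSR once and decodes eight 3-bit fields, highest slot first, before updating the cachemode translation tables. The bit manipulation in isolation, as ordinary user-space C with a made-up MSR image:

#include <stdio.h>

int main(void)
{
	/* Example PAT image only; the kernel reads the real one with rdmsrl(MSR_IA32_CR_PAT, pat). */
	unsigned long long pat = 0x0007010600070106ULL;
	int i;

	for (i = 7; i >= 0; i--) {
		unsigned int entry = (pat >> (i * 8)) & 7;
		/* encodings per the PAT_* enum: 0=UC, 1=WC, 4=WT, 5=WP, 6=WB, 7=UC- */
		printf("PAT%d = %u\n", i, entry);
	}
	return 0;
}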
@@ -139,20 +254,21 @@ static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */
  * The intersection is based on "Effective Memory Type" tables in IA-32
  * SDM vol 3a
  */
-static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
+static unsigned long pat_x_mtrr_type(u64 start, u64 end,
+				     enum page_cache_mode req_type)
 {
 	/*
 	 * Look for MTRR hint to get the effective type in case where PAT
 	 * request is for WB.
 	 */
-	if (req_type == _PAGE_CACHE_WB) {
+	if (req_type == _PAGE_CACHE_MODE_WB) {
 		u8 mtrr_type;
 
 		mtrr_type = mtrr_type_lookup(start, end);
 		if (mtrr_type != MTRR_TYPE_WRBACK)
-			return _PAGE_CACHE_UC_MINUS;
+			return _PAGE_CACHE_MODE_UC_MINUS;
 
-		return _PAGE_CACHE_WB;
+		return _PAGE_CACHE_MODE_WB;
 	}
 
 	return req_type;
@@ -207,25 +323,26 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
  * - Find the memtype of all the pages in the range, look for any conflicts
  * - In case of no conflicts, set the new memtype for pages in the range
  */
-static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
-				  unsigned long *new_type)
+static int reserve_ram_pages_type(u64 start, u64 end,
+				  enum page_cache_mode req_type,
+				  enum page_cache_mode *new_type)
 {
 	struct page *page;
 	u64 pfn;
 
-	if (req_type == _PAGE_CACHE_UC) {
+	if (req_type == _PAGE_CACHE_MODE_UC) {
 		/* We do not support strong UC */
 		WARN_ON_ONCE(1);
-		req_type = _PAGE_CACHE_UC_MINUS;
+		req_type = _PAGE_CACHE_MODE_UC_MINUS;
 	}
 
 	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
-		unsigned long type;
+		enum page_cache_mode type;
 
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
 		if (type != -1) {
-			printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
+			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
 				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
@@ -258,21 +375,21 @@ static int free_ram_pages_type(u64 start, u64 end)
 
 /*
  * req_type typically has one of the:
- * - _PAGE_CACHE_WB
- * - _PAGE_CACHE_WC
- * - _PAGE_CACHE_UC_MINUS
- * - _PAGE_CACHE_UC
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_UC
 *
 * If new_type is NULL, function will return an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, function will return
 * available type in new_type in case of no error. In case of any error
 * it will return a negative return value.
 */
-int reserve_memtype(u64 start, u64 end, unsigned long req_type,
-		    unsigned long *new_type)
+int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
+		    enum page_cache_mode *new_type)
 {
 	struct memtype *new;
-	unsigned long actual_type;
+	enum page_cache_mode actual_type;
 	int is_range_ram;
 	int err = 0;
 
@@ -281,10 +398,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	if (!pat_enabled) {
 		/* This is identical to page table setting without PAT */
 		if (new_type) {
-			if (req_type == _PAGE_CACHE_WC)
-				*new_type = _PAGE_CACHE_UC_MINUS;
+			if (req_type == _PAGE_CACHE_MODE_WC)
+				*new_type = _PAGE_CACHE_MODE_UC_MINUS;
 			else
-				*new_type = req_type & _PAGE_CACHE_MASK;
+				*new_type = req_type;
 		}
 		return 0;
 	}
@@ -292,7 +409,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	/* Low ISA region is always mapped WB in page table. No need to track */
 	if (x86_platform.is_untracked_pat_range(start, end)) {
 		if (new_type)
-			*new_type = _PAGE_CACHE_WB;
+			*new_type = _PAGE_CACHE_MODE_WB;
 		return 0;
 	}
 
@@ -302,7 +419,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	 * tools and ACPI tools). Use WB request for WB memory and use
 	 * UC_MINUS otherwise.
 	 */
-	actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
+	actual_type = pat_x_mtrr_type(start, end, req_type);
 
 	if (new_type)
 		*new_type = actual_type;
@@ -394,12 +511,12 @@ int free_memtype(u64 start, u64 end)
 *
 * Only to be called when PAT is enabled
 *
- * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
- * _PAGE_CACHE_UC
+ * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
+ * or _PAGE_CACHE_MODE_UC
 */
-static unsigned long lookup_memtype(u64 paddr)
+static enum page_cache_mode lookup_memtype(u64 paddr)
 {
-	int rettype = _PAGE_CACHE_WB;
+	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
 	struct memtype *entry;
 
 	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
@@ -414,7 +531,7 @@ static unsigned long lookup_memtype(u64 paddr)
 		 * default state and not reserved, and hence of type WB
 		 */
 		if (rettype == -1)
-			rettype = _PAGE_CACHE_WB;
+			rettype = _PAGE_CACHE_MODE_WB;
 
 		return rettype;
 	}
@@ -425,7 +542,7 @@ static unsigned long lookup_memtype(u64 paddr)
 	if (entry != NULL)
 		rettype = entry->type;
 	else
-		rettype = _PAGE_CACHE_UC_MINUS;
+		rettype = _PAGE_CACHE_MODE_UC_MINUS;
 
 	spin_unlock(&memtype_lock);
 	return rettype;
@@ -442,11 +559,11 @@ static unsigned long lookup_memtype(u64 paddr)
 * On failure, returns non-zero
 */
 int io_reserve_memtype(resource_size_t start, resource_size_t end,
-			unsigned long *type)
+			enum page_cache_mode *type)
 {
 	resource_size_t size = end - start;
-	unsigned long req_type = *type;
-	unsigned long new_type;
+	enum page_cache_mode req_type = *type;
+	enum page_cache_mode new_type;
 	int ret;
 
 	WARN_ON_ONCE(iomem_map_sanity_check(start, size));
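After this conversion, reserve_memtype() takes and returns enum page_cache_mode directly; a caller may be granted a weaker mode than requested (for instance UC- instead of WC when PAT is disabled), reported through new_type. A sketch of that calling convention only; the physical range and the printed diagnostic are placeholders:

	/* Illustrative: 'phys' and 'size' stand in for a real physical range. */
	enum page_cache_mode granted;
	int ret;

	ret = reserve_memtype(phys, phys + size, _PAGE_CACHE_MODE_WC, &granted);
	if (ret)
		return ret;			/* range conflicts with an existing reservation */

	if (granted != _PAGE_CACHE_MODE_WC)
		pr_info("PAT: granted %s instead of write-combining\n",
			cattr_name(granted));

	/* ... map and use the range with the granted mode ... */

	free_memtype(phys, phys + size);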
@@ -520,13 +637,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t *vma_prot)
 {
-	unsigned long flags = _PAGE_CACHE_WB;
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
 
 	if (!range_is_allowed(pfn, size))
 		return 0;
 
 	if (file->f_flags & O_DSYNC)
-		flags = _PAGE_CACHE_UC_MINUS;
+		pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
 #ifdef CONFIG_X86_32
 	/*
@@ -543,12 +660,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	    boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
 	    boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
 	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
-		flags = _PAGE_CACHE_UC;
+		pcm = _PAGE_CACHE_MODE_UC;
 	}
 #endif
 
 	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
-			     flags);
+			     cachemode2protval(pcm));
 	return 1;
 }
 
@@ -556,7 +673,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 * Change the memory type for the physial address range in kernel identity
 * mapping space if that range is a part of identity map.
 */
-int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
+int kernel_map_sync_memtype(u64 base, unsigned long size,
+			    enum page_cache_mode pcm)
 {
 	unsigned long id_sz;
 
@@ -574,11 +692,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
 				__pa(high_memory) - base :
 				size;
 
-	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
+	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
 		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
 			"for [mem %#010Lx-%#010Lx]\n",
 			current->comm, current->pid,
-			cattr_name(flags),
+			cattr_name(pcm),
 			base, (unsigned long long)(base + size-1));
 		return -EINVAL;
 	}
@@ -595,8 +713,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 {
 	int is_ram = 0;
 	int ret;
-	unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
-	unsigned long flags = want_flags;
+	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
+	enum page_cache_mode pcm = want_pcm;
 
 	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 
@@ -609,36 +727,36 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		if (!pat_enabled)
 			return 0;
 
-		flags = lookup_memtype(paddr);
-		if (want_flags != flags) {
+		pcm = lookup_memtype(paddr);
+		if (want_pcm != pcm) {
 			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
-				cattr_name(want_flags),
+				cattr_name(want_pcm),
 				(unsigned long long)paddr,
 				(unsigned long long)(paddr + size - 1),
-				cattr_name(flags));
-			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
-					     (~_PAGE_CACHE_MASK)) |
-					     flags);
+				cattr_name(pcm));
+			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
+					      (~_PAGE_CACHE_MASK)) |
+					      cachemode2protval(pcm));
 		}
 		return 0;
 	}
 
-	ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
+	ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
 	if (ret)
 		return ret;
 
-	if (flags != want_flags) {
+	if (pcm != want_pcm) {
 		if (strict_prot ||
-		    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
+		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 			free_memtype(paddr, paddr + size);
 			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
 				" for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
-				cattr_name(want_flags),
+				cattr_name(want_pcm),
 				(unsigned long long)paddr,
 				(unsigned long long)(paddr + size - 1),
-				cattr_name(flags));
+				cattr_name(pcm));
 			return -EINVAL;
 		}
 		/*
@@ -647,10 +765,10 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		 */
 		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
-				      flags);
+				      cachemode2protval(pcm));
 	}
 
-	if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
+	if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
 		free_memtype(paddr, paddr + size);
 		return -EINVAL;
 	}
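The hunks above repeat one idiom: clear _PAGE_CACHE_MASK from the protection value, then OR in cachemode2protval() of the chosen mode. Factored into a stand-alone helper (the helper name is invented for illustration, not something the patch adds):

/* Illustrative helper: rebuild a protection value with a specific cache mode. */
static pgprot_t prot_with_cachemode(pgprot_t prot, enum page_cache_mode pcm)
{
	return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) |
			cachemode2protval(pcm));
}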
@@ -709,7 +827,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 		    unsigned long pfn, unsigned long addr, unsigned long size)
 {
 	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
-	unsigned long flags;
+	enum page_cache_mode pcm;
 
 	/* reserve the whole chunk starting from paddr */
 	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
@@ -728,18 +846,18 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 	 * For anything smaller than the vma size we set prot based on the
 	 * lookup.
 	 */
-	flags = lookup_memtype(paddr);
+	pcm = lookup_memtype(paddr);
 
 	/* Check memtype for the remaining pages */
 	while (size > PAGE_SIZE) {
 		size -= PAGE_SIZE;
 		paddr += PAGE_SIZE;
-		if (flags != lookup_memtype(paddr))
+		if (pcm != lookup_memtype(paddr))
 			return -EINVAL;
 	}
 
 	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
-			 flags);
+			 cachemode2protval(pcm));
 
 	return 0;
 }
@@ -747,15 +865,15 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 		     unsigned long pfn)
 {
-	unsigned long flags;
+	enum page_cache_mode pcm;
 
 	if (!pat_enabled)
 		return 0;
 
 	/* Set prot based on lookup */
-	flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
+	pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
 	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
-			 flags);
+			 cachemode2protval(pcm));
 
 	return 0;
 }
@@ -791,7 +909,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 pgprot_t pgprot_writecombine(pgprot_t prot)
 {
 	if (pat_enabled)
-		return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
+		return __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_WC));
 	else
 		return pgprot_noncached(prot);
 }
@@ -824,7 +943,7 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	if (*pos == 0) {
 		++*pos;
-		seq_printf(seq, "PAT memtype list:\n");
+		seq_puts(seq, "PAT memtype list:\n");
 	}
 
 	return memtype_get_idx(*pos);
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 77e5ba153fa..f6411620305 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -10,30 +10,32 @@ struct memtype {
 	u64			start;
 	u64			end;
 	u64			subtree_max_end;
-	unsigned long		type;
+	enum page_cache_mode	type;
 	struct rb_node		rb;
 };
 
-static inline char *cattr_name(unsigned long flags)
+static inline char *cattr_name(enum page_cache_mode pcm)
 {
-	switch (flags & _PAGE_CACHE_MASK) {
-	case _PAGE_CACHE_UC:		return "uncached";
-	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
-	case _PAGE_CACHE_WB:		return "write-back";
-	case _PAGE_CACHE_WC:		return "write-combining";
-	default:			return "broken";
+	switch (pcm) {
+	case _PAGE_CACHE_MODE_UC:		return "uncached";
+	case _PAGE_CACHE_MODE_UC_MINUS:		return "uncached-minus";
+	case _PAGE_CACHE_MODE_WB:		return "write-back";
+	case _PAGE_CACHE_MODE_WC:		return "write-combining";
+	case _PAGE_CACHE_MODE_WT:		return "write-through";
+	case _PAGE_CACHE_MODE_WP:		return "write-protected";
+	default:				return "broken";
 	}
 }
 
 #ifdef CONFIG_X86_PAT
 extern int rbt_memtype_check_insert(struct memtype *new,
-					unsigned long *new_type);
+					enum page_cache_mode *new_type);
 extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
 extern struct memtype *rbt_memtype_lookup(u64 addr);
 extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
 #else
 static inline int rbt_memtype_check_insert(struct memtype *new,
-					unsigned long *new_type)
+					enum page_cache_mode *new_type)
 { return 0; }
 static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
 { return NULL; }
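pgprot_writecombine() now builds its WC bits through cachemode2protval() as well; its typical consumer is a driver mmap handler that remaps device memory write-combining. An illustrative fragment only, assuming a hypothetical frame-buffer base fb_phys:

	/* Hypothetical mmap handler fragment: map device memory write-combining. */
	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
	return io_remap_pfn_range(vma, vma->vm_start, fb_phys >> PAGE_SHIFT,
				  vma->vm_end - vma->vm_start, vma->vm_page_prot);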
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 415f6c4ced3..6582adcc8bd 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -122,11 +122,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root,
 
 static int memtype_rb_check_conflict(struct rb_root *root,
 				u64 start, u64 end,
-				unsigned long reqtype, unsigned long *newtype)
+				enum page_cache_mode reqtype,
+				enum page_cache_mode *newtype)
 {
 	struct rb_node *node;
 	struct memtype *match;
-	int found_type = reqtype;
+	enum page_cache_mode found_type = reqtype;
 
 	match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
 	if (match == NULL)
@@ -187,7 +188,8 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
 	rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
 }
 
-int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
+int rbt_memtype_check_insert(struct memtype *new,
+			     enum page_cache_mode *ret_type)
 {
 	int err = 0;
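memtype_rb_check_conflict() above walks the augmented rbtree over every tracked range that overlaps the new request and rejects it when an overlapping entry carries a different cache mode. The underlying checks, stripped of the tree walk; the error code and helper names here are illustrative, not the kernel's:

/* Illustrative: does [start, end) overlap an existing entry, and do the modes agree? */
static bool ranges_overlap(u64 a_start, u64 a_end, u64 b_start, u64 b_end)
{
	return a_start < b_end && b_start < a_end;	/* half-open intervals */
}

static int check_one_entry(const struct memtype *entry, u64 start, u64 end,
			   enum page_cache_mode reqtype)
{
	if (!ranges_overlap(entry->start, entry->end, start, end))
		return 0;				/* no overlap, no constraint */
	return entry->type == reqtype ? 0 : -EBUSY;	/* overlapping modes must match */
}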