summaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c27
-rw-r--r--arch/x86/mm/fault.c64
-rw-r--r--arch/x86/mm/init.c41
-rw-r--r--arch/x86/mm/init_64.c76
-rw-r--r--arch/x86/mm/iomap_32.c12
-rw-r--r--arch/x86/mm/ioremap.c67
-rw-r--r--arch/x86/mm/mm_internal.h2
-rw-r--r--arch/x86/mm/mpx.c928
-rw-r--r--arch/x86/mm/pageattr.c106
-rw-r--r--arch/x86/mm/pat.c247
-rw-r--r--arch/x86/mm/pat_internal.h22
-rw-r--r--arch/x86/mm/pat_rbtree.c8
13 files changed, 1346 insertions, 256 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 6a19ad9f370..ecfdc46a024 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -30,3 +30,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_MEMTEST) += memtest.o
+
+obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 95a427e5788..f0cedf3395a 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -76,6 +76,9 @@ static struct addr_marker address_markers[] = {
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
# endif
+# ifdef CONFIG_EFI
+ { EFI_VA_END, "EFI Runtime Services" },
+# endif
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
@@ -126,7 +129,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
if (!pgprot_val(prot)) {
/* Not present */
- pt_dump_cont_printf(m, dmsg, " ");
+ pt_dump_cont_printf(m, dmsg, " ");
} else {
if (pr & _PAGE_USER)
pt_dump_cont_printf(m, dmsg, "USR ");
@@ -145,18 +148,16 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
else
pt_dump_cont_printf(m, dmsg, " ");
- /* Bit 9 has a different meaning on level 3 vs 4 */
- if (level <= 3) {
- if (pr & _PAGE_PSE)
- pt_dump_cont_printf(m, dmsg, "PSE ");
- else
- pt_dump_cont_printf(m, dmsg, " ");
- } else {
- if (pr & _PAGE_PAT)
- pt_dump_cont_printf(m, dmsg, "pat ");
- else
- pt_dump_cont_printf(m, dmsg, " ");
- }
+ /* Bit 7 has a different meaning on level 3 vs 4 */
+ if (level <= 3 && pr & _PAGE_PSE)
+ pt_dump_cont_printf(m, dmsg, "PSE ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if ((level == 4 && pr & _PAGE_PAT) ||
+ ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+ pt_dump_cont_printf(m, dmsg, "pat ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_GLOBAL)
pt_dump_cont_printf(m, dmsg, "GLB ");
else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d973e61e450..38dcec403b4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -844,11 +844,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
{
struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
int code = BUS_ADRERR;
- up_read(&mm->mmap_sem);
-
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
@@ -879,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
- up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address, 0, 0);
return;
}
@@ -887,14 +883,11 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
- up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
}
- up_read(&current->mm->mmap_sem);
-
/*
* We ran out of memory, call the OOM killer, and return the
* userspace (which will retry the fault, or kill us if we got
@@ -1062,7 +1055,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault;
+ int fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
@@ -1237,47 +1230,50 @@ good_area:
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
+ major |= fault & VM_FAULT_MAJOR;
/*
- * If we need to retry but a fatal signal is pending, handle the
- * signal first. We do not need to release the mmap_sem because it
- * would already be released in __lock_page_or_retry in mm/filemap.c.
+ * If we need to retry the mmap_sem has already been released,
+ * and if there is a fatal signal pending there is no guarantee
+ * that we made any progress. Handle this case first.
*/
- if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ /* Retry at most once */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ if (!fatal_signal_pending(tsk))
+ goto retry;
+ }
+
+ /* User mode? Just return to handle the fatal exception */
+ if (flags & FAULT_FLAG_USER)
+ return;
+
+ /* Not returning to user mode? Handle exceptions or die: */
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
+ }
+ up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
/*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
+ * Major/minor page fault accounting. If any of the events
+ * returned VM_FAULT_MAJOR, we account it as a major fault.
*/
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ if (major) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
check_v8086_mode(regs, address, tsk);
-
- up_read(&mm->mmap_sem);
}
NOKPROBE_SYMBOL(__do_page_fault);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 66dba36f234..a97ee080147 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -27,6 +27,35 @@
#include "mm_internal.h"
+/*
+ * Tables translating between page_cache_type_t and pte encoding.
+ * Minimal supported modes are defined statically, modified if more supported
+ * cache modes are available.
+ * Index into __cachemode2pte_tbl is the cachemode.
+ * Index into __pte2cachemode_tbl are the caching attribute bits of the pte
+ * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ */
+uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+ [_PAGE_CACHE_MODE_WB] = 0,
+ [_PAGE_CACHE_MODE_WC] = _PAGE_PWT,
+ [_PAGE_CACHE_MODE_UC_MINUS] = _PAGE_PCD,
+ [_PAGE_CACHE_MODE_UC] = _PAGE_PCD | _PAGE_PWT,
+ [_PAGE_CACHE_MODE_WT] = _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WP] = _PAGE_PCD,
+};
+EXPORT_SYMBOL_GPL(__cachemode2pte_tbl);
+uint8_t __pte2cachemode_tbl[8] = {
+ [__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC,
+ [__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC,
+ [__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+ [__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
+};
+EXPORT_SYMBOL_GPL(__pte2cachemode_tbl);
+
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
static unsigned long __initdata pgt_buf_top;
@@ -674,10 +703,10 @@ void __init zone_sizes_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+ max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+ max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
@@ -687,3 +716,11 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
+{
+ /* entry 0 MUST be WB (hardwired to speed up translations) */
+ BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);
+
+ __cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
+ __pte2cachemode_tbl[entry] = cache;
+}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4cb8763868f..30eb05ae706 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -52,7 +52,6 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
-#include <asm/uv/uv.h>
#include <asm/setup.h>
#include "mm_internal.h"
@@ -338,12 +337,15 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
* Create large page table mappings for a range of physical addresses.
*/
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
- pgprot_t prot)
+ enum page_cache_mode cache)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
+ pgprot_t prot;
+ pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
+ pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
@@ -366,12 +368,12 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
- __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+ __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
}
void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
- __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
+ __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
}
/*
@@ -1123,7 +1125,7 @@ void mark_rodata_ro(void)
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
- unsigned long all_end = PFN_ALIGN(&_end);
+ unsigned long all_end;
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -1134,7 +1136,16 @@ void mark_rodata_ro(void)
/*
* The rodata/data/bss/brk section (but not the kernel text!)
* should also be not-executable.
+ *
+ * We align all_end to PMD_SIZE because the existing mapping
+ * is a full PMD. If we would align _brk_end to PAGE_SIZE we
+ * split the PMD and the reminder between _brk_end and the end
+ * of the PMD will remain mapped executable.
+ *
+ * Any PMD which was setup after the one which covers _brk_end
+ * has been zapped already via cleanup_highmem().
*/
+ all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -1193,66 +1204,15 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(*pte));
}
-/*
- * A pseudo VMA to allow ptrace access for the vsyscall page. This only
- * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
- * not need special handling anymore:
- */
-static const char *gate_vma_name(struct vm_area_struct *vma)
-{
- return "[vsyscall]";
-}
-static struct vm_operations_struct gate_vma_ops = {
- .name = gate_vma_name,
-};
-static struct vm_area_struct gate_vma = {
- .vm_start = VSYSCALL_ADDR,
- .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
- .vm_page_prot = PAGE_READONLY_EXEC,
- .vm_flags = VM_READ | VM_EXEC,
- .vm_ops = &gate_vma_ops,
-};
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (!mm || mm->context.ia32_compat)
- return NULL;
-#endif
- return &gate_vma;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- struct vm_area_struct *vma = get_gate_vma(mm);
-
- if (!vma)
- return 0;
-
- return (addr >= vma->vm_start) && (addr < vma->vm_end);
-}
-
-/*
- * Use this when you have no reliable mm, typically from interrupt
- * context. It is less reliable than using a task's mm and may give
- * false positives.
- */
-int in_gate_area_no_mm(unsigned long addr)
-{
- return (addr & PAGE_MASK) == VSYSCALL_ADDR;
-}
-
static unsigned long probe_memory_block_size(void)
{
/* start from 2g */
unsigned long bz = 1UL<<31;
-#ifdef CONFIG_X86_UV
- if (is_uv_system()) {
- printk(KERN_INFO "UV: memory block size 2GB\n");
+ if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
+ pr_info("Using 2GB memory block size for large-memory system\n");
return 2UL * 1024 * 1024 * 1024;
}
-#endif
/* less than 64g installed */
if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 7b179b499fa..9ca35fc60cf 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -33,17 +33,17 @@ static int is_io_mapping_possible(resource_size_t base, unsigned long size)
int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
{
- unsigned long flag = _PAGE_CACHE_WC;
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC;
int ret;
if (!is_io_mapping_possible(base, size))
return -EINVAL;
- ret = io_reserve_memtype(base, base + size, &flag);
+ ret = io_reserve_memtype(base, base + size, &pcm);
if (ret)
return ret;
- *prot = __pgprot(__PAGE_KERNEL | flag);
+ *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm));
return 0;
}
EXPORT_SYMBOL_GPL(iomap_create_wc);
@@ -82,8 +82,10 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
* MTRR is UC or WC. UC_MINUS gets the real intention, of the
* user, which is "WC if the MTRR is WC, UC if you can't do that."
*/
- if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
- prot = PAGE_KERNEL_UC_MINUS;
+ if (!pat_enabled && pgprot_val(prot) ==
+ (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
+ prot = __pgprot(__PAGE_KERNEL |
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index af78e50ca6c..fdf617c00e2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -29,20 +29,20 @@
* conflicts.
*/
int ioremap_change_attr(unsigned long vaddr, unsigned long size,
- unsigned long prot_val)
+ enum page_cache_mode pcm)
{
unsigned long nrpages = size >> PAGE_SHIFT;
int err;
- switch (prot_val) {
- case _PAGE_CACHE_UC:
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC:
default:
err = _set_memory_uc(vaddr, nrpages);
break;
- case _PAGE_CACHE_WC:
+ case _PAGE_CACHE_MODE_WC:
err = _set_memory_wc(vaddr, nrpages);
break;
- case _PAGE_CACHE_WB:
+ case _PAGE_CACHE_MODE_WB:
err = _set_memory_wb(vaddr, nrpages);
break;
}
@@ -75,14 +75,14 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
- unsigned long size, unsigned long prot_val, void *caller)
+ unsigned long size, enum page_cache_mode pcm, void *caller)
{
unsigned long offset, vaddr;
resource_size_t pfn, last_pfn, last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
struct vm_struct *area;
- unsigned long new_prot_val;
+ enum page_cache_mode new_pcm;
pgprot_t prot;
int retval;
void __iomem *ret_addr;
@@ -134,38 +134,40 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
size = PAGE_ALIGN(last_addr+1) - phys_addr;
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
- prot_val, &new_prot_val);
+ pcm, &new_pcm);
if (retval) {
printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
return NULL;
}
- if (prot_val != new_prot_val) {
- if (!is_new_memtype_allowed(phys_addr, size,
- prot_val, new_prot_val)) {
+ if (pcm != new_pcm) {
+ if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) {
printk(KERN_ERR
- "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
+ "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n",
(unsigned long long)phys_addr,
(unsigned long long)(phys_addr + size),
- prot_val, new_prot_val);
+ pcm, new_pcm);
goto err_free_memtype;
}
- prot_val = new_prot_val;
+ pcm = new_pcm;
}
- switch (prot_val) {
- case _PAGE_CACHE_UC:
+ prot = PAGE_KERNEL_IO;
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC:
default:
- prot = PAGE_KERNEL_IO_NOCACHE;
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_UC));
break;
- case _PAGE_CACHE_UC_MINUS:
- prot = PAGE_KERNEL_IO_UC_MINUS;
+ case _PAGE_CACHE_MODE_UC_MINUS:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
break;
- case _PAGE_CACHE_WC:
- prot = PAGE_KERNEL_IO_WC;
+ case _PAGE_CACHE_MODE_WC:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WC));
break;
- case _PAGE_CACHE_WB:
- prot = PAGE_KERNEL_IO;
+ case _PAGE_CACHE_MODE_WB:
break;
}
@@ -178,7 +180,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (kernel_map_sync_memtype(phys_addr, size, prot_val))
+ if (kernel_map_sync_memtype(phys_addr, size, pcm))
goto err_free_area;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
@@ -227,14 +229,14 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
/*
* Ideally, this should be:
- * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+ * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
*
* Till we fix all X drivers to use ioremap_wc(), we will use
* UC MINUS.
*/
- unsigned long val = _PAGE_CACHE_UC_MINUS;
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
- return __ioremap_caller(phys_addr, size, val,
+ return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_nocache);
@@ -252,7 +254,7 @@ EXPORT_SYMBOL(ioremap_nocache);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
if (pat_enabled)
- return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0));
else
return ioremap_nocache(phys_addr, size);
@@ -261,7 +263,7 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_cache);
@@ -269,7 +271,8 @@ EXPORT_SYMBOL(ioremap_cache);
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
unsigned long prot_val)
{
- return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
+ return __ioremap_caller(phys_addr, size,
+ pgprot2cachemode(__pgprot(prot_val)),
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_prot);
@@ -327,7 +330,7 @@ EXPORT_SYMBOL(iounmap);
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
*/
-void *xlate_dev_mem_ptr(unsigned long phys)
+void *xlate_dev_mem_ptr(phys_addr_t phys)
{
void *addr;
unsigned long start = phys & PAGE_MASK;
@@ -343,7 +346,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
return addr;
}
-void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
{
if (page_is_ram(phys >> PAGE_SHIFT))
return;
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 6b563a11889..62474ba66c8 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -16,4 +16,6 @@ void zone_sizes_init(void);
extern int after_bootmem;
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
+
#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
new file mode 100644
index 00000000000..67ebf575122
--- /dev/null
+++ b/arch/x86/mm/mpx.c
@@ -0,0 +1,928 @@
+/*
+ * mpx.c - Memory Protection eXtensions
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ * Qiaowei Ren <qiaowei.ren@intel.com>
+ * Dave Hansen <dave.hansen@intel.com>
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+
+#include <asm/i387.h>
+#include <asm/insn.h>
+#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mpx.h>
+#include <asm/processor.h>
+#include <asm/fpu-internal.h>
+
+static const char *mpx_mapping_name(struct vm_area_struct *vma)
+{
+ return "[mpx]";
+}
+
+static struct vm_operations_struct mpx_vma_ops = {
+ .name = mpx_mapping_name,
+};
+
+static int is_mpx_vma(struct vm_area_struct *vma)
+{
+ return (vma->vm_ops == &mpx_vma_ops);
+}
+
+/*
+ * This is really a simplified "vm_mmap". it only handles MPX
+ * bounds tables (the bounds directory is user-allocated).
+ *
+ * Later on, we use the vma->vm_ops to uniquely identify these
+ * VMAs.
+ */
+static unsigned long mpx_mmap(unsigned long len)
+{
+ unsigned long ret;
+ unsigned long addr, pgoff;
+ struct mm_struct *mm = current->mm;
+ vm_flags_t vm_flags;
+ struct vm_area_struct *vma;
+
+ /* Only bounds table and bounds directory can be allocated here */
+ if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES)
+ return -EINVAL;
+
+ down_write(&mm->mmap_sem);
+
+ /* Too many mappings? */
+ if (mm->map_count > sysctl_max_map_count) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Obtain the address to map to. we verify (or select) it and ensure
+ * that it represents a valid section of the address space.
+ */
+ addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
+ if (addr & ~PAGE_MASK) {
+ ret = addr;
+ goto out;
+ }
+
+ vm_flags = VM_READ | VM_WRITE | VM_MPX |
+ mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ /* Set pgoff according to addr for anon_vma */
+ pgoff = addr >> PAGE_SHIFT;
+
+ ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
+ if (IS_ERR_VALUE(ret))
+ goto out;
+
+ vma = find_vma(mm, ret);
+ if (!vma) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ vma->vm_ops = &mpx_vma_ops;
+
+ if (vm_flags & VM_LOCKED) {
+ up_write(&mm->mmap_sem);
+ mm_populate(ret, len);
+ return ret;
+ }
+
+out:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+enum reg_type {
+ REG_TYPE_RM = 0,
+ REG_TYPE_INDEX,
+ REG_TYPE_BASE,
+};
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = 0;
+
+ static const int regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+ };
+ int nr_registers = ARRAY_SIZE(regoff);
+ /*
+ * Don't possibly decode a 32-bit instructions as
+ * reading a 64-bit-only register.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
+ nr_registers -= 8;
+
+ switch (type) {
+ case REG_TYPE_RM:
+ regno = X86_MODRM_RM(insn->modrm.value);
+ if (X86_REX_B(insn->rex_prefix.value) == 1)
+ regno += 8;
+ break;
+
+ case REG_TYPE_INDEX:
+ regno = X86_SIB_INDEX(insn->sib.value);
+ if (X86_REX_X(insn->rex_prefix.value) == 1)
+ regno += 8;
+ break;
+
+ case REG_TYPE_BASE:
+ regno = X86_SIB_BASE(insn->sib.value);
+ if (X86_REX_B(insn->rex_prefix.value) == 1)
+ regno += 8;
+ break;
+
+ default:
+ pr_err("invalid register type");
+ BUG();
+ break;
+ }
+
+ if (regno > nr_registers) {
+ WARN_ONCE(1, "decoded an instruction with an invalid register");
+ return -EINVAL;
+ }
+ return regoff[regno];
+}
+
+/*
+ * return the address being referenced be instruction
+ * for rm=3 returning the content of the rm reg
+ * for rm!=3 calculates the address using SIB and Disp
+ */
+static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long addr, base, indx;
+ int addr_offset, base_offset, indx_offset;
+ insn_byte_t sib;
+
+ insn_get_modrm(insn);
+ insn_get_sib(insn);
+ sib = insn->sib.value;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
+ if (addr_offset < 0)
+ goto out_err;
+ addr = regs_get_register(regs, addr_offset);
+ } else {
+ if (insn->sib.nbytes) {
+ base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
+ if (base_offset < 0)
+ goto out_err;
+
+ indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
+ if (indx_offset < 0)
+ goto out_err;
+
+ base = regs_get_register(regs, base_offset);
+ indx = regs_get_register(regs, indx_offset);
+ addr = base + indx * (1 << X86_SIB_SCALE(sib));
+ } else {
+ addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
+ if (addr_offset < 0)
+ goto out_err;
+ addr = regs_get_register(regs, addr_offset);
+ }
+ addr += insn->displacement.value;
+ }
+ return (void __user *)addr;
+out_err:
+ return (void __user *)-1;
+}
+
+static int mpx_insn_decode(struct insn *insn,
+ struct pt_regs *regs)
+{
+ unsigned char buf[MAX_INSN_SIZE];
+ int x86_64 = !test_thread_flag(TIF_IA32);
+ int not_copied;
+ int nr_copied;
+
+ not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+ /*
+ * The decoder _should_ fail nicely if we pass it a short buffer.
+ * But, let's not depend on that implementation detail. If we
+ * did not get anything, just error out now.
+ */
+ if (!nr_copied)
+ return -EFAULT;
+ insn_init(insn, buf, nr_copied, x86_64);
+ insn_get_length(insn);
+ /*
+ * copy_from_user() tries to get as many bytes as we could see in
+ * the largest possible instruction. If the instruction we are
+ * after is shorter than that _and_ we attempt to copy from
+ * something unreadable, we might get a short read. This is OK
+ * as long as the read did not stop in the middle of the
+ * instruction. Check to see if we got a partial instruction.
+ */
+ if (nr_copied < insn->length)
+ return -EFAULT;
+
+ insn_get_opcode(insn);
+ /*
+ * We only _really_ need to decode bndcl/bndcn/bndcu
+ * Error out on anything else.
+ */
+ if (insn->opcode.bytes[0] != 0x0f)
+ goto bad_opcode;
+ if ((insn->opcode.bytes[1] != 0x1a) &&
+ (insn->opcode.bytes[1] != 0x1b))
+ goto bad_opcode;
+
+ return 0;
+bad_opcode:
+ return -EINVAL;
+}
+
+/*
+ * If a bounds overflow occurs then a #BR is generated. This
+ * function decodes MPX instructions to get violation address
+ * and set this address into extended struct siginfo.
+ *
+ * Note that this is not a super precise way of doing this.
+ * Userspace could have, by the time we get here, written
+ * anything it wants in to the instructions. We can not
+ * trust anything about it. They might not be valid
+ * instructions or might encode invalid registers, etc...
+ *
+ * The caller is expected to kfree() the returned siginfo_t.
+ */
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
+ struct xsave_struct *xsave_buf)
+{
+ struct bndreg *bndregs, *bndreg;
+ siginfo_t *info = NULL;
+ struct insn insn;
+ uint8_t bndregno;
+ int err;
+
+ err = mpx_insn_decode(&insn, regs);
+ if (err)
+ goto err_out;
+
+ /*
+ * We know at this point that we are only dealing with
+ * MPX instructions.
+ */
+ insn_get_modrm(&insn);
+ bndregno = X86_MODRM_REG(insn.modrm.value);
+ if (bndregno > 3) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ /* get the bndregs _area_ of the xsave structure */
+ bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS);
+ if (!bndregs) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ /* now go select the individual register in the set of 4 */
+ bndreg = &bndregs[bndregno];
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /*
+ * The registers are always 64-bit, but the upper 32
+ * bits are ignored in 32-bit mode. Also, note that the
+ * upper bounds are architecturally represented in 1's
+ * complement form.
+ *
+ * The 'unsigned long' cast is because the compiler
+ * complains when casting from integers to different-size
+ * pointers.
+ */
+ info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
+ info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
+ info->si_addr_lsb = 0;
+ info->si_signo = SIGSEGV;
+ info->si_errno = 0;
+ info->si_code = SEGV_BNDERR;
+ info->si_addr = mpx_get_addr_ref(&insn, regs);
+ /*
+ * We were not able to extract an address from the instruction,
+ * probably because there was something invalid in it.
+ */
+ if (info->si_addr == (void *)-1) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ return info;
+err_out:
+ /* info might be NULL, but kfree() handles that */
+ kfree(info);
+ return ERR_PTR(err);
+}
+
+static __user void *task_get_bounds_dir(struct task_struct *tsk)
+{
+ struct bndcsr *bndcsr;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX))
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * The bounds directory pointer is stored in a register
+ * only accessible if we first do an xsave.
+ */
+ fpu_save_init(&tsk->thread.fpu);
+ bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR);
+ if (!bndcsr)
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * Make sure the register looks valid by checking the
+ * enable bit.
+ */
+ if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * Lastly, mask off the low bits used for configuration
+ * flags, and return the address of the bounds table.
+ */
+ return (void __user *)(unsigned long)
+ (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
+}
+
+int mpx_enable_management(struct task_struct *tsk)
+{
+ void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
+ struct mm_struct *mm = tsk->mm;
+ int ret = 0;
+
+ /*
+ * runtime in the userspace will be responsible for allocation of
+ * the bounds directory. Then, it will save the base of the bounds
+ * directory into XSAVE/XRSTOR Save Area and enable MPX through
+ * XRSTOR instruction.
+ *
+ * fpu_xsave() is expected to be very expensive. Storing the bounds
+ * directory here means that we do not have to do xsave in the unmap
+ * path; we can just use mm->bd_addr instead.
+ */
+ bd_base = task_get_bounds_dir(tsk);
+ down_write(&mm->mmap_sem);
+ mm->bd_addr = bd_base;
+ if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR)
+ ret = -ENXIO;
+
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+int mpx_disable_management(struct task_struct *tsk)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX))
+ return -ENXIO;
+
+ down_write(&mm->mmap_sem);
+ mm->bd_addr = MPX_INVALID_BOUNDS_DIR;
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+
+/*
+ * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each
+ * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB,
+ * and the size of each bounds table is 4MB.
+ */
+static int allocate_bt(long __user *bd_entry)
+{
+ unsigned long expected_old_val = 0;
+ unsigned long actual_old_val = 0;
+ unsigned long bt_addr;
+ int ret = 0;
+
+ /*
+ * Carve the virtual space out of userspace for the new
+ * bounds table:
+ */
+ bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES);
+ if (IS_ERR((void *)bt_addr))
+ return PTR_ERR((void *)bt_addr);
+ /*
+ * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
+ */
+ bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+
+ /*
+ * Go poke the address of the new bounds table in to the
+ * bounds directory entry out in userspace memory. Note:
+ * we may race with another CPU instantiating the same table.
+ * In that case the cmpxchg will see an unexpected
+ * 'actual_old_val'.
+ *
+ * This can fault, but that's OK because we do not hold
+ * mmap_sem at this point, unlike some of the other part
+ * of the MPX code that have to pagefault_disable().
+ */
+ ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
+ expected_old_val, bt_addr);
+ if (ret)
+ goto out_unmap;
+
+ /*
+ * The user_atomic_cmpxchg_inatomic() will only return nonzero
+ * for faults, *not* if the cmpxchg itself fails. Now we must
+ * verify that the cmpxchg itself completed successfully.
+ */
+ /*
+ * We expected an empty 'expected_old_val', but instead found
+ * an apparently valid entry. Assume we raced with another
+ * thread to instantiate this table and desclare succecss.
+ */
+ if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
+ ret = 0;
+ goto out_unmap;
+ }
+ /*
+ * We found a non-empty bd_entry but it did not have the
+ * VALID_FLAG set. Return an error which will result in
+ * a SEGV since this probably means that somebody scribbled
+ * some invalid data in to a bounds table.
+ */
+ if (expected_old_val != actual_old_val) {
+ ret = -EINVAL;
+ goto out_unmap;
+ }
+ return 0;
+out_unmap:
+ vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES);
+ return ret;
+}
+
+/*
+ * When a BNDSTX instruction attempts to save bounds to a bounds
+ * table, it will first attempt to look up the table in the
+ * first-level bounds directory. If it does not find a table in
+ * the directory, a #BR is generated and we get here in order to
+ * allocate a new table.
+ *
+ * With 32-bit mode, the size of BD is 4MB, and the size of each
+ * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
+ * and the size of each bound table is 4MB.
+ */
+static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
+{
+ unsigned long bd_entry, bd_base;
+ struct bndcsr *bndcsr;
+
+ bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+ if (!bndcsr)
+ return -EINVAL;
+ /*
+ * Mask off the preserve and enable bits
+ */
+ bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
+ /*
+ * The hardware provides the address of the missing or invalid
+ * entry via BNDSTATUS, so we don't have to go look it up.
+ */
+ bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
+ /*
+ * Make sure the directory entry is within where we think
+ * the directory is.
+ */
+ if ((bd_entry < bd_base) ||
+ (bd_entry >= bd_base + MPX_BD_SIZE_BYTES))
+ return -EINVAL;
+
+ return allocate_bt((long __user *)bd_entry);
+}
+
+int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+{
+ /*
+ * Userspace never asked us to manage the bounds tables,
+ * so refuse to help.
+ */
+ if (!kernel_managing_mpx_tables(current->mm))
+ return -EINVAL;
+
+ if (do_mpx_bt_fault(xsave_buf)) {
+ force_sig(SIGSEGV, current);
+ /*
+ * The force_sig() is essentially "handling" this
+ * exception, so we do not pass up the error
+ * from do_mpx_bt_fault().
+ */
+ }
+ return 0;
+}
+
+/*
+ * A thin wrapper around get_user_pages(). Returns 0 if the
+ * fault was resolved or -errno if not.
+ */
+static int mpx_resolve_fault(long __user *addr, int write)
+{
+ long gup_ret;
+ int nr_pages = 1;
+ int force = 0;
+
+ gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
+ nr_pages, write, force, NULL, NULL);
+ /*
+ * get_user_pages() returns number of pages gotten.
+ * 0 means we failed to fault in and get anything,
+ * probably because 'addr' is bad.
+ */
+ if (!gup_ret)
+ return -EFAULT;
+ /* Other error, return it */
+ if (gup_ret < 0)
+ return gup_ret;
+ /* must have gup'd a page and gup_ret>0, success */
+ return 0;
+}
+
+/*
+ * Get the base of bounds tables pointed by specific bounds
+ * directory entry.
+ */
+static int get_bt_addr(struct mm_struct *mm,
+ long __user *bd_entry, unsigned long *bt_addr)
+{
+ int ret;
+ int valid_bit;
+
+ if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry)))
+ return -EFAULT;
+
+ while (1) {
+ int need_write = 0;
+
+ pagefault_disable();
+ ret = get_user(*bt_addr, bd_entry);
+ pagefault_enable();
+ if (!ret)
+ break;
+ if (ret == -EFAULT)
+ ret = mpx_resolve_fault(bd_entry, need_write);
+ /*
+ * If we could not resolve the fault, consider it
+ * userspace's fault and error out.
+ */
+ if (ret)
+ return ret;
+ }
+
+ valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG;
+ *bt_addr &= MPX_BT_ADDR_MASK;
+
+ /*
+ * When the kernel is managing bounds tables, a bounds directory
+ * entry will either have a valid address (plus the valid bit)
+ * *OR* be completely empty. If we see a !valid entry *and* some
+ * data in the address field, we know something is wrong. This
+ * -EINVAL return will cause a SIGSEGV.
+ */
+ if (!valid_bit && *bt_addr)
+ return -EINVAL;
+ /*
+ * Do we have an completely zeroed bt entry? That is OK. It
+ * just means there was no bounds table for this memory. Make
+ * sure to distinguish this from -EINVAL, which will cause
+ * a SEGV.
+ */
+ if (!valid_bit)
+ return -ENOENT;
+
+ return 0;
+}
+
+/*
+ * Free the backing physical pages of bounds table 'bt_addr'.
+ * Assume start...end is within that bounds table.
+ */
+static int zap_bt_entries(struct mm_struct *mm,
+ unsigned long bt_addr,
+ unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr, len;
+
+ /*
+ * Find the first overlapping vma. If vma->vm_start > start, there
+ * will be a hole in the bounds table. This -EINVAL return will
+ * cause a SIGSEGV.
+ */
+ vma = find_vma(mm, start);
+ if (!vma || vma->vm_start > start)
+ return -EINVAL;
+
+ /*
+ * A NUMA policy on a VM_MPX VMA could cause this bouds table to
+ * be split. So we need to look across the entire 'start -> end'
+ * range of this bounds table, find all of the VM_MPX VMAs, and
+ * zap only those.
+ */
+ addr = start;
+ while (vma && vma->vm_start < end) {
+ /*
+ * We followed a bounds directory entry down
+ * here. If we find a non-MPX VMA, that's bad,
+ * so stop immediately and return an error. This
+ * probably results in a SIGSEGV.
+ */
+ if (!is_mpx_vma(vma))
+ return -EINVAL;
+
+ len = min(vma->vm_end, end) - addr;
+ zap_page_range(vma, addr, len, NULL);
+
+ vma = vma->vm_next;
+ addr = vma->vm_start;
+ }
+
+ return 0;
+}
+
+static int unmap_single_bt(struct mm_struct *mm,
+ long __user *bd_entry, unsigned long bt_addr)
+{
+ unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+ unsigned long actual_old_val = 0;
+ int ret;
+
+ while (1) {
+ int need_write = 1;
+
+ pagefault_disable();
+ ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
+ expected_old_val, 0);
+ pagefault_enable();
+ if (!ret)
+ break;
+ if (ret == -EFAULT)
+ ret = mpx_resolve_fault(bd_entry, need_write);
+ /*
+ * If we could not resolve the fault, consider it
+ * userspace's fault and error out.
+ */
+ if (ret)
+ return ret;
+ }
+ /*
+ * The cmpxchg was performed, check the results.
+ */
+ if (actual_old_val != expected_old_val) {
+ /*
+ * Someone else raced with us to unmap the table.
+ * There was no bounds table pointed to by the
+ * directory, so declare success. Somebody freed
+ * it.
+ */
+ if (!actual_old_val)
+ return 0;
+ /*
+ * Something messed with the bounds directory
+ * entry. We hold mmap_sem for read or write
+ * here, so it could not be a _new_ bounds table
+ * that someone just allocated. Something is
+ * wrong, so pass up the error and SIGSEGV.
+ */
+ return -EINVAL;
+ }
+
+ /*
+ * Note, we are likely being called under do_munmap() already. To
+ * avoid recursion, do_munmap() will check whether it comes
+ * from one bounds table through VM_MPX flag.
+ */
+ return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES);
+}
+
+/*
+ * If the bounds table pointed by bounds directory 'bd_entry' is
+ * not shared, unmap this whole bounds table. Otherwise, only free
+ * those backing physical pages of bounds table entries covered
+ * in this virtual address region start...end.
+ */
+static int unmap_shared_bt(struct mm_struct *mm,
+ long __user *bd_entry, unsigned long start,
+ unsigned long end, bool prev_shared, bool next_shared)
+{
+ unsigned long bt_addr;
+ int ret;
+
+ ret = get_bt_addr(mm, bd_entry, &bt_addr);
+ /*
+ * We could see an "error" ret for not-present bounds
+ * tables (not really an error), or actual errors, but
+ * stop unmapping either way.
+ */
+ if (ret)
+ return ret;
+
+ if (prev_shared && next_shared)
+ ret = zap_bt_entries(mm, bt_addr,
+ bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
+ bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
+ else if (prev_shared)
+ ret = zap_bt_entries(mm, bt_addr,
+ bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
+ bt_addr+MPX_BT_SIZE_BYTES);
+ else if (next_shared)
+ ret = zap_bt_entries(mm, bt_addr, bt_addr,
+ bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
+ else
+ ret = unmap_single_bt(mm, bd_entry, bt_addr);
+
+ return ret;
+}
+
+/*
+ * A virtual address region being munmap()ed might share bounds table
+ * with adjacent VMAs. We only need to free the backing physical
+ * memory of these shared bounds tables entries covered in this virtual
+ * address region.
+ */
+static int unmap_edge_bts(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ int ret;
+ long __user *bde_start, *bde_end;
+ struct vm_area_struct *prev, *next;
+ bool prev_shared = false, next_shared = false;
+
+ bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
+ bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
+
+ /*
+ * Check whether bde_start and bde_end are shared with adjacent
+ * VMAs.
+ *
+ * We already unliked the VMAs from the mm's rbtree so 'start'
+ * is guaranteed to be in a hole. This gets us the first VMA
+ * before the hole in to 'prev' and the next VMA after the hole
+ * in to 'next'.
+ */
+ next = find_vma_prev(mm, start, &prev);
+ if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1))
+ == bde_start)
+ prev_shared = true;
+ if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start))
+ == bde_end)
+ next_shared = true;
+
+ /*
+ * This virtual address region being munmap()ed is only
+ * covered by one bounds table.
+ *
+ * In this case, if this table is also shared with adjacent
+ * VMAs, only part of the backing physical memory of the bounds
+ * table need be freeed. Otherwise the whole bounds table need
+ * be unmapped.
+ */
+ if (bde_start == bde_end) {
+ return unmap_shared_bt(mm, bde_start, start, end,
+ prev_shared, next_shared);
+ }
+
+ /*
+ * If more than one bounds tables are covered in this virtual
+ * address region being munmap()ed, we need to separately check
+ * whether bde_start and bde_end are shared with adjacent VMAs.
+ */
+ ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false);
+ if (ret)
+ return ret;
+ ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int mpx_unmap_tables(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ int ret;
+ long __user *bd_entry, *bde_start, *bde_end;
+ unsigned long bt_addr;
+
+ /*
+ * "Edge" bounds tables are those which are being used by the region
+ * (start -> end), but that may be shared with adjacent areas. If they
+ * turn out to be completely unshared, they will be freed. If they are
+ * shared, we will free the backing store (like an MADV_DONTNEED) for
+ * areas used by this region.
+ */
+ ret = unmap_edge_bts(mm, start, end);
+ switch (ret) {
+ /* non-present tables are OK */
+ case 0:
+ case -ENOENT:
+ /* Success, or no tables to unmap */
+ break;
+ case -EINVAL:
+ case -EFAULT:
+ default:
+ return ret;
+ }
+
+ /*
+ * Only unmap the bounds table that are
+ * 1. fully covered
+ * 2. not at the edges of the mapping, even if full aligned
+ */
+ bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
+ bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
+ for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) {
+ ret = get_bt_addr(mm, bd_entry, &bt_addr);
+ switch (ret) {
+ case 0:
+ break;
+ case -ENOENT:
+ /* No table here, try the next one */
+ continue;
+ case -EINVAL:
+ case -EFAULT:
+ default:
+ /*
+ * Note: we are being strict here.
+ * Any time we run in to an issue
+ * unmapping tables, we stop and
+ * SIGSEGV.
+ */
+ return ret;
+ }
+
+ ret = unmap_single_bt(mm, bd_entry, bt_addr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Free unused bounds tables covered in a virtual address region being
+ * munmap()ed. Assume end > start.
+ *
+ * This function will be called by do_munmap(), and the VMAs covering
+ * the virtual address region start...end have already been split if
+ * necessary, and the 'vma' is the first vma in this range (start -> end).
+ */
+void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /*
+ * Refuse to do anything unless userspace has asked
+ * the kernel to help manage the bounds tables,
+ */
+ if (!kernel_managing_mpx_tables(current->mm))
+ return;
+ /*
+ * This will look across the entire 'start -> end' range,
+ * and find all of the non-VM_MPX VMAs.
+ *
+ * To avoid recursion, if a VM_MPX vma is found in the range
+ * (start->end), we will not continue follow-up work. This
+ * recursion represents having bounds tables for bounds tables,
+ * which should not occur normally. Being strict about it here
+ * helps ensure that we do not have an exploitable stack overflow.
+ */
+ do {
+ if (vma->vm_flags & VM_MPX)
+ return;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+
+ ret = mpx_unmap_tables(mm, start, end);
+ if (ret)
+ force_sig(SIGSEGV, current);
+}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 36de293caf2..536ea2fb6e3 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -384,6 +384,26 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
}
/*
+ * Lookup the PMD entry for a virtual address. Return a pointer to the entry
+ * or NULL if not present.
+ */
+pmd_t *lookup_pmd_address(unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ return NULL;
+
+ return pmd_offset(pud, address);
+}
+
+/*
* This is necessary because __pa() does not work on some
* kinds of memory, like vmalloc() or the alloc_remap()
* areas on 32-bit NUMA systems. The percpu areas can
@@ -485,14 +505,23 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
/*
* We are safe now. Check whether the new pgprot is the same:
+ * Convert protection attributes to 4k-format, as cpa->mask* are set
+ * up accordingly.
*/
old_pte = *kpte;
- old_prot = req_prot = pte_pgprot(old_pte);
+ old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte));
pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
/*
+ * req_prot is in format of 4k pages. It must be converted to large
+ * page format: the caching mode includes the PAT bit located at
+ * different bit positions in the two formats.
+ */
+ req_prot = pgprot_4k_2_large(req_prot);
+
+ /*
* Set the PSE and GLOBAL flags only if the PRESENT flag is
* set otherwise pmd_present/pmd_huge will return true even on
* a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
@@ -585,13 +614,10 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
- /*
- * If we ever want to utilize the PAT bit, we need to
- * update this function to make sure it's converted from
- * bit 12 to bit 7 when we cross from the 2MB level to
- * the 4K level:
- */
- WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
+
+ /* promote PAT bit to correct position */
+ if (level == PG_LEVEL_2M)
+ ref_prot = pgprot_large_2_4k(ref_prot);
#ifdef CONFIG_X86_64
if (level == PG_LEVEL_1G) {
@@ -879,6 +905,7 @@ static int populate_pmd(struct cpa_data *cpa,
{
unsigned int cur_pages = 0;
pmd_t *pmd;
+ pgprot_t pmd_pgprot;
/*
* Not on a 2M boundary?
@@ -910,6 +937,8 @@ static int populate_pmd(struct cpa_data *cpa,
if (num_pages == cur_pages)
return cur_pages;
+ pmd_pgprot = pgprot_4k_2_large(pgprot);
+
while (end - start >= PMD_SIZE) {
/*
@@ -921,7 +950,8 @@ static int populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+ set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
+ massage_pgprot(pmd_pgprot)));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE;
@@ -949,6 +979,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
pud_t *pud;
unsigned long end;
int cur_pages = 0;
+ pgprot_t pud_pgprot;
end = start + (cpa->numpages << PAGE_SHIFT);
@@ -986,12 +1017,14 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
return cur_pages;
pud = pud_offset(pgd, start);
+ pud_pgprot = pgprot_4k_2_large(pgprot);
/*
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+ set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
+ massage_pgprot(pud_pgprot)));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE;
@@ -1304,12 +1337,6 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
return 0;
}
-static inline int cache_attr(pgprot_t attr)
-{
- return pgprot_val(attr) &
- (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
-}
-
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,
@@ -1390,7 +1417,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
* No need to flush, when we did not set any of the caching
* attributes:
*/
- cache = cache_attr(mask_set);
+ cache = !!pgprot2cachemode(mask_set);
/*
* On success we use CLFLUSH, when the CPU supports it to
@@ -1445,7 +1472,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
* for now UC MINUS. see comments in ioremap_nocache()
*/
return change_page_attr_set(&addr, numpages,
- __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+ 0);
}
int set_memory_uc(unsigned long addr, int numpages)
@@ -1456,7 +1484,7 @@ int set_memory_uc(unsigned long addr, int numpages)
* for now UC MINUS. see comments in ioremap_nocache()
*/
ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_UC_MINUS, NULL);
+ _PAGE_CACHE_MODE_UC_MINUS, NULL);
if (ret)
goto out_err;
@@ -1474,7 +1502,7 @@ out_err:
EXPORT_SYMBOL(set_memory_uc);
static int _set_memory_array(unsigned long *addr, int addrinarray,
- unsigned long new_type)
+ enum page_cache_mode new_type)
{
int i, j;
int ret;
@@ -1490,11 +1518,13 @@ static int _set_memory_array(unsigned long *addr, int addrinarray,
}
ret = change_page_attr_set(addr, addrinarray,
- __pgprot(_PAGE_CACHE_UC_MINUS), 1);
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+ 1);
- if (!ret && new_type == _PAGE_CACHE_WC)
+ if (!ret && new_type == _PAGE_CACHE_MODE_WC)
ret = change_page_attr_set_clr(addr, addrinarray,
- __pgprot(_PAGE_CACHE_WC),
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
__pgprot(_PAGE_CACHE_MASK),
0, CPA_ARRAY, NULL);
if (ret)
@@ -1511,13 +1541,13 @@ out_free:
int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
- return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS);
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_memory_array_uc);
int set_memory_array_wc(unsigned long *addr, int addrinarray)
{
- return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC);
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_memory_array_wc);
@@ -1527,10 +1557,12 @@ int _set_memory_wc(unsigned long addr, int numpages)
unsigned long addr_copy = addr;
ret = change_page_attr_set(&addr, numpages,
- __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+ 0);
if (!ret) {
ret = change_page_attr_set_clr(&addr_copy, numpages,
- __pgprot(_PAGE_CACHE_WC),
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
__pgprot(_PAGE_CACHE_MASK),
0, 0, NULL);
}
@@ -1545,7 +1577,7 @@ int set_memory_wc(unsigned long addr, int numpages)
return set_memory_uc(addr, numpages);
ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
- _PAGE_CACHE_WC, NULL);
+ _PAGE_CACHE_MODE_WC, NULL);
if (ret)
goto out_err;
@@ -1564,6 +1596,7 @@ EXPORT_SYMBOL(set_memory_wc);
int _set_memory_wb(unsigned long addr, int numpages)
{
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
return change_page_attr_clear(&addr, numpages,
__pgprot(_PAGE_CACHE_MASK), 0);
}
@@ -1586,6 +1619,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray)
int i;
int ret;
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
ret = change_page_attr_clear(addr, addrinarray,
__pgprot(_PAGE_CACHE_MASK), 1);
if (ret)
@@ -1648,7 +1682,7 @@ int set_pages_uc(struct page *page, int numpages)
EXPORT_SYMBOL(set_pages_uc);
static int _set_pages_array(struct page **pages, int addrinarray,
- unsigned long new_type)
+ enum page_cache_mode new_type)
{
unsigned long start;
unsigned long end;
@@ -1666,10 +1700,11 @@ static int _set_pages_array(struct page **pages, int addrinarray,
}
ret = cpa_set_pages_array(pages, addrinarray,
- __pgprot(_PAGE_CACHE_UC_MINUS));
- if (!ret && new_type == _PAGE_CACHE_WC)
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
+ if (!ret && new_type == _PAGE_CACHE_MODE_WC)
ret = change_page_attr_set_clr(NULL, addrinarray,
- __pgprot(_PAGE_CACHE_WC),
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
__pgprot(_PAGE_CACHE_MASK),
0, CPA_PAGES_ARRAY, pages);
if (ret)
@@ -1689,13 +1724,13 @@ err_out:
int set_pages_array_uc(struct page **pages, int addrinarray)
{
- return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS);
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);
int set_pages_array_wc(struct page **pages, int addrinarray)
{
- return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC);
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);
@@ -1714,6 +1749,7 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
unsigned long end;
int i;
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
retval = cpa_clear_pages_array(pages, addrinarray,
__pgprot(_PAGE_CACHE_MASK));
if (retval)
@@ -1801,7 +1837,7 @@ static int __set_pages_np(struct page *page, int numpages)
return __change_page_attr_set_clr(&cpa, 0);
}
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
return;
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 657438858e8..edf299c8ff6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -31,6 +31,7 @@
#include <asm/io.h>
#include "pat_internal.h"
+#include "mm_internal.h"
#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;
@@ -66,6 +67,75 @@ __setup("debugpat", pat_debug_setup);
static u64 __read_mostly boot_pat_state;
+#ifdef CONFIG_X86_PAT
+/*
+ * X86 PAT uses page flags WC and Uncached together to keep track of
+ * memory type of pages that have backing page struct. X86 PAT supports 3
+ * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and
+ * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not
+ * been changed from its default (value of -1 used to denote this).
+ * Note we do not support _PAGE_CACHE_MODE_UC here.
+ */
+
+#define _PGMT_DEFAULT 0
+#define _PGMT_WC (1UL << PG_arch_1)
+#define _PGMT_UC_MINUS (1UL << PG_uncached)
+#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
+
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+ unsigned long pg_flags = pg->flags & _PGMT_MASK;
+
+ if (pg_flags == _PGMT_DEFAULT)
+ return -1;
+ else if (pg_flags == _PGMT_WC)
+ return _PAGE_CACHE_MODE_WC;
+ else if (pg_flags == _PGMT_UC_MINUS)
+ return _PAGE_CACHE_MODE_UC_MINUS;
+ else
+ return _PAGE_CACHE_MODE_WB;
+}
+
+static inline void set_page_memtype(struct page *pg,
+ enum page_cache_mode memtype)
+{
+ unsigned long memtype_flags;
+ unsigned long old_flags;
+ unsigned long new_flags;
+
+ switch (memtype) {
+ case _PAGE_CACHE_MODE_WC:
+ memtype_flags = _PGMT_WC;
+ break;
+ case _PAGE_CACHE_MODE_UC_MINUS:
+ memtype_flags = _PGMT_UC_MINUS;
+ break;
+ case _PAGE_CACHE_MODE_WB:
+ memtype_flags = _PGMT_WB;
+ break;
+ default:
+ memtype_flags = _PGMT_DEFAULT;
+ break;
+ }
+
+ do {
+ old_flags = pg->flags;
+ new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
+ } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
+}
+#else
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+ return -1;
+}
+static inline void set_page_memtype(struct page *pg,
+ enum page_cache_mode memtype)
+{
+}
+#endif
+
enum {
PAT_UC = 0, /* uncached */
PAT_WC = 1, /* Write combining */
@@ -75,6 +145,52 @@ enum {
PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
};
+#define CM(c) (_PAGE_CACHE_MODE_ ## c)
+
+static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
+{
+ enum page_cache_mode cache;
+ char *cache_mode;
+
+ switch (pat_val) {
+ case PAT_UC: cache = CM(UC); cache_mode = "UC "; break;
+ case PAT_WC: cache = CM(WC); cache_mode = "WC "; break;
+ case PAT_WT: cache = CM(WT); cache_mode = "WT "; break;
+ case PAT_WP: cache = CM(WP); cache_mode = "WP "; break;
+ case PAT_WB: cache = CM(WB); cache_mode = "WB "; break;
+ case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+ default: cache = CM(WB); cache_mode = "WB "; break;
+ }
+
+ memcpy(msg, cache_mode, 4);
+
+ return cache;
+}
+
+#undef CM
+
+/*
+ * Update the cache mode to pgprot translation tables according to PAT
+ * configuration.
+ * Using lower indices is preferred, so we start with highest index.
+ */
+void pat_init_cache_modes(void)
+{
+ int i;
+ enum page_cache_mode cache;
+ char pat_msg[33];
+ u64 pat;
+
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ pat_msg[32] = 0;
+ for (i = 7; i >= 0; i--) {
+ cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
+ pat_msg + 4 * i);
+ update_cache_mode_entry(i, cache);
+ }
+ pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+}
+
#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
void pat_init(void)
@@ -124,8 +240,7 @@ void pat_init(void)
wrmsrl(MSR_IA32_CR_PAT, pat);
if (boot_cpu)
- printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
- smp_processor_id(), boot_pat_state, pat);
+ pat_init_cache_modes();
}
#undef PAT
@@ -139,20 +254,21 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
* The intersection is based on "Effective Memory Type" tables in IA-32
* SDM vol 3a
*/
-static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
+static unsigned long pat_x_mtrr_type(u64 start, u64 end,
+ enum page_cache_mode req_type)
{
/*
* Look for MTRR hint to get the effective type in case where PAT
* request is for WB.
*/
- if (req_type == _PAGE_CACHE_WB) {
+ if (req_type == _PAGE_CACHE_MODE_WB) {
u8 mtrr_type;
mtrr_type = mtrr_type_lookup(start, end);
if (mtrr_type != MTRR_TYPE_WRBACK)
- return _PAGE_CACHE_UC_MINUS;
+ return _PAGE_CACHE_MODE_UC_MINUS;
- return _PAGE_CACHE_WB;
+ return _PAGE_CACHE_MODE_WB;
}
return req_type;
@@ -207,25 +323,26 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
* - Find the memtype of all the pages in the range, look for any conflicts
* - In case of no conflicts, set the new memtype for pages in the range
*/
-static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
- unsigned long *new_type)
+static int reserve_ram_pages_type(u64 start, u64 end,
+ enum page_cache_mode req_type,
+ enum page_cache_mode *new_type)
{
struct page *page;
u64 pfn;
- if (req_type == _PAGE_CACHE_UC) {
+ if (req_type == _PAGE_CACHE_MODE_UC) {
/* We do not support strong UC */
WARN_ON_ONCE(1);
- req_type = _PAGE_CACHE_UC_MINUS;
+ req_type = _PAGE_CACHE_MODE_UC_MINUS;
}
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
- unsigned long type;
+ enum page_cache_mode type;
page = pfn_to_page(pfn);
type = get_page_memtype(page);
if (type != -1) {
- printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
+ pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
start, end - 1, type, req_type);
if (new_type)
*new_type = type;
@@ -258,21 +375,21 @@ static int free_ram_pages_type(u64 start, u64 end)
/*
* req_type typically has one of the:
- * - _PAGE_CACHE_WB
- * - _PAGE_CACHE_WC
- * - _PAGE_CACHE_UC_MINUS
- * - _PAGE_CACHE_UC
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_UC
*
* If new_type is NULL, function will return an error if it cannot reserve the
* region with req_type. If new_type is non-NULL, function will return
* available type in new_type in case of no error. In case of any error
* it will return a negative return value.
*/
-int reserve_memtype(u64 start, u64 end, unsigned long req_type,
- unsigned long *new_type)
+int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
+ enum page_cache_mode *new_type)
{
struct memtype *new;
- unsigned long actual_type;
+ enum page_cache_mode actual_type;
int is_range_ram;
int err = 0;
@@ -281,10 +398,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (!pat_enabled) {
/* This is identical to page table setting without PAT */
if (new_type) {
- if (req_type == _PAGE_CACHE_WC)
- *new_type = _PAGE_CACHE_UC_MINUS;
+ if (req_type == _PAGE_CACHE_MODE_WC)
+ *new_type = _PAGE_CACHE_MODE_UC_MINUS;
else
- *new_type = req_type & _PAGE_CACHE_MASK;
+ *new_type = req_type;
}
return 0;
}
@@ -292,7 +409,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
/* Low ISA region is always mapped WB in page table. No need to track */
if (x86_platform.is_untracked_pat_range(start, end)) {
if (new_type)
- *new_type = _PAGE_CACHE_WB;
+ *new_type = _PAGE_CACHE_MODE_WB;
return 0;
}
@@ -302,7 +419,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
* tools and ACPI tools). Use WB request for WB memory and use
* UC_MINUS otherwise.
*/
- actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
+ actual_type = pat_x_mtrr_type(start, end, req_type);
if (new_type)
*new_type = actual_type;
@@ -394,12 +511,12 @@ int free_memtype(u64 start, u64 end)
*
* Only to be called when PAT is enabled
*
- * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
- * _PAGE_CACHE_UC
+ * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
+ * or _PAGE_CACHE_MODE_UC
*/
-static unsigned long lookup_memtype(u64 paddr)
+static enum page_cache_mode lookup_memtype(u64 paddr)
{
- int rettype = _PAGE_CACHE_WB;
+ enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
struct memtype *entry;
if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
@@ -414,7 +531,7 @@ static unsigned long lookup_memtype(u64 paddr)
* default state and not reserved, and hence of type WB
*/
if (rettype == -1)
- rettype = _PAGE_CACHE_WB;
+ rettype = _PAGE_CACHE_MODE_WB;
return rettype;
}
@@ -425,7 +542,7 @@ static unsigned long lookup_memtype(u64 paddr)
if (entry != NULL)
rettype = entry->type;
else
- rettype = _PAGE_CACHE_UC_MINUS;
+ rettype = _PAGE_CACHE_MODE_UC_MINUS;
spin_unlock(&memtype_lock);
return rettype;
@@ -442,11 +559,11 @@ static unsigned long lookup_memtype(u64 paddr)
* On failure, returns non-zero
*/
int io_reserve_memtype(resource_size_t start, resource_size_t end,
- unsigned long *type)
+ enum page_cache_mode *type)
{
resource_size_t size = end - start;
- unsigned long req_type = *type;
- unsigned long new_type;
+ enum page_cache_mode req_type = *type;
+ enum page_cache_mode new_type;
int ret;
WARN_ON_ONCE(iomem_map_sanity_check(start, size));
@@ -520,13 +637,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot)
{
- unsigned long flags = _PAGE_CACHE_WB;
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
if (!range_is_allowed(pfn, size))
return 0;
if (file->f_flags & O_DSYNC)
- flags = _PAGE_CACHE_UC_MINUS;
+ pcm = _PAGE_CACHE_MODE_UC_MINUS;
#ifdef CONFIG_X86_32
/*
@@ -543,12 +660,12 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
(pfn << PAGE_SHIFT) >= __pa(high_memory)) {
- flags = _PAGE_CACHE_UC;
+ pcm = _PAGE_CACHE_MODE_UC;
}
#endif
*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
- flags);
+ cachemode2protval(pcm));
return 1;
}
@@ -556,7 +673,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
* Change the memory type for the physial address range in kernel identity
* mapping space if that range is a part of identity map.
*/
-int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
+int kernel_map_sync_memtype(u64 base, unsigned long size,
+ enum page_cache_mode pcm)
{
unsigned long id_sz;
@@ -574,11 +692,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
__pa(high_memory) - base :
size;
- if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
+ if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
"for [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid,
- cattr_name(flags),
+ cattr_name(pcm),
base, (unsigned long long)(base + size-1));
return -EINVAL;
}
@@ -595,8 +713,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
{
int is_ram = 0;
int ret;
- unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
- unsigned long flags = want_flags;
+ enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
+ enum page_cache_mode pcm = want_pcm;
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
@@ -609,36 +727,36 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
if (!pat_enabled)
return 0;
- flags = lookup_memtype(paddr);
- if (want_flags != flags) {
+ pcm = lookup_memtype(paddr);
+ if (want_pcm != pcm) {
printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
- cattr_name(want_flags),
+ cattr_name(want_pcm),
(unsigned long long)paddr,
(unsigned long long)(paddr + size - 1),
- cattr_name(flags));
+ cattr_name(pcm));
*vma_prot = __pgprot((pgprot_val(*vma_prot) &
- (~_PAGE_CACHE_MASK)) |
- flags);
+ (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
}
return 0;
}
- ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
+ ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
if (ret)
return ret;
- if (flags != want_flags) {
+ if (pcm != want_pcm) {
if (strict_prot ||
- !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
+ !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
free_memtype(paddr, paddr + size);
printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
" for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
- cattr_name(want_flags),
+ cattr_name(want_pcm),
(unsigned long long)paddr,
(unsigned long long)(paddr + size - 1),
- cattr_name(flags));
+ cattr_name(pcm));
return -EINVAL;
}
/*
@@ -647,10 +765,10 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
*/
*vma_prot = __pgprot((pgprot_val(*vma_prot) &
(~_PAGE_CACHE_MASK)) |
- flags);
+ cachemode2protval(pcm));
}
- if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
+ if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
free_memtype(paddr, paddr + size);
return -EINVAL;
}
@@ -709,7 +827,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long addr, unsigned long size)
{
resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
- unsigned long flags;
+ enum page_cache_mode pcm;
/* reserve the whole chunk starting from paddr */
if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
@@ -728,18 +846,18 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
* For anything smaller than the vma size we set prot based on the
* lookup.
*/
- flags = lookup_memtype(paddr);
+ pcm = lookup_memtype(paddr);
/* Check memtype for the remaining pages */
while (size > PAGE_SIZE) {
size -= PAGE_SIZE;
paddr += PAGE_SIZE;
- if (flags != lookup_memtype(paddr))
+ if (pcm != lookup_memtype(paddr))
return -EINVAL;
}
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
- flags);
+ cachemode2protval(pcm));
return 0;
}
@@ -747,15 +865,15 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn)
{
- unsigned long flags;
+ enum page_cache_mode pcm;
if (!pat_enabled)
return 0;
/* Set prot based on lookup */
- flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
+ pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
- flags);
+ cachemode2protval(pcm));
return 0;
}
@@ -791,7 +909,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
pgprot_t pgprot_writecombine(pgprot_t prot)
{
if (pat_enabled)
- return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
+ return __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WC));
else
return pgprot_noncached(prot);
}
@@ -824,7 +943,7 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
if (*pos == 0) {
++*pos;
- seq_printf(seq, "PAT memtype list:\n");
+ seq_puts(seq, "PAT memtype list:\n");
}
return memtype_get_idx(*pos);
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 77e5ba153fa..f6411620305 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -10,30 +10,32 @@ struct memtype {
u64 start;
u64 end;
u64 subtree_max_end;
- unsigned long type;
+ enum page_cache_mode type;
struct rb_node rb;
};
-static inline char *cattr_name(unsigned long flags)
+static inline char *cattr_name(enum page_cache_mode pcm)
{
- switch (flags & _PAGE_CACHE_MASK) {
- case _PAGE_CACHE_UC: return "uncached";
- case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
- case _PAGE_CACHE_WB: return "write-back";
- case _PAGE_CACHE_WC: return "write-combining";
- default: return "broken";
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC: return "uncached";
+ case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_MODE_WB: return "write-back";
+ case _PAGE_CACHE_MODE_WC: return "write-combining";
+ case _PAGE_CACHE_MODE_WT: return "write-through";
+ case _PAGE_CACHE_MODE_WP: return "write-protected";
+ default: return "broken";
}
}
#ifdef CONFIG_X86_PAT
extern int rbt_memtype_check_insert(struct memtype *new,
- unsigned long *new_type);
+ enum page_cache_mode *new_type);
extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
extern struct memtype *rbt_memtype_lookup(u64 addr);
extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
#else
static inline int rbt_memtype_check_insert(struct memtype *new,
- unsigned long *new_type)
+ enum page_cache_mode *new_type)
{ return 0; }
static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
{ return NULL; }
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 415f6c4ced3..6582adcc8bd 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -122,11 +122,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root,
static int memtype_rb_check_conflict(struct rb_root *root,
u64 start, u64 end,
- unsigned long reqtype, unsigned long *newtype)
+ enum page_cache_mode reqtype,
+ enum page_cache_mode *newtype)
{
struct rb_node *node;
struct memtype *match;
- int found_type = reqtype;
+ enum page_cache_mode found_type = reqtype;
match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
if (match == NULL)
@@ -187,7 +188,8 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
}
-int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
+int rbt_memtype_check_insert(struct memtype *new,
+ enum page_cache_mode *ret_type)
{
int err = 0;