diff options
Diffstat (limited to 'arch/s390/mm')
-rw-r--r-- | arch/s390/mm/Makefile | 1 | ||||
-rw-r--r-- | arch/s390/mm/dump_pagetables.c | 226 | ||||
-rw-r--r-- | arch/s390/mm/fault.c | 1 | ||||
-rw-r--r-- | arch/s390/mm/gup.c | 11 | ||||
-rw-r--r-- | arch/s390/mm/pageattr.c | 40 | ||||
-rw-r--r-- | arch/s390/mm/pgtable.c | 108 | ||||
-rw-r--r-- | arch/s390/mm/vmem.c | 45 |
7 files changed, 398 insertions, 34 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 0f5536b0c1a..1bea6d1f55a 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -7,3 +7,4 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \ obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_DEBUG_SET_MODULE_RONX) += pageattr.o +obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c new file mode 100644 index 00000000000..cbc6668acb8 --- /dev/null +++ b/arch/s390/mm/dump_pagetables.c @@ -0,0 +1,226 @@ +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <asm/sections.h> +#include <asm/pgtable.h> + +static unsigned long max_addr; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +enum address_markers_idx { + IDENTITY_NR = 0, + KERNEL_START_NR, + KERNEL_END_NR, + VMEMMAP_NR, + VMALLOC_NR, +#ifdef CONFIG_64BIT + MODULES_NR, +#endif +}; + +static struct addr_marker address_markers[] = { + [IDENTITY_NR] = {0, "Identity Mapping"}, + [KERNEL_START_NR] = {(unsigned long)&_stext, "Kernel Image Start"}, + [KERNEL_END_NR] = {(unsigned long)&_end, "Kernel Image End"}, + [VMEMMAP_NR] = {0, "vmemmap Area"}, + [VMALLOC_NR] = {0, "vmalloc Area"}, +#ifdef CONFIG_64BIT + [MODULES_NR] = {0, "Modules Area"}, +#endif + { -1, NULL } +}; + +struct pg_state { + int level; + unsigned int current_prot; + unsigned long start_address; + unsigned long current_address; + const struct addr_marker *marker; +}; + +static void print_prot(struct seq_file *m, unsigned int pr, int level) +{ + static const char * const level_name[] = + { "ASCE", "PGD", "PUD", "PMD", "PTE" }; + + seq_printf(m, "%s ", level_name[level]); + if (pr & _PAGE_INVALID) + seq_printf(m, "I\n"); + else + seq_printf(m, "%s\n", pr & _PAGE_RO ? "RO" : "RW"); +} + +static void note_page(struct seq_file *m, struct pg_state *st, + unsigned int new_prot, int level) +{ + static const char units[] = "KMGTPE"; + int width = sizeof(unsigned long) * 2; + const char *unit = units; + unsigned int prot, cur; + unsigned long delta; + + /* + * If we have a "break" in the series, we need to flush the state + * that we have now. "break" is either changing perms, levels or + * address space marker. + */ + prot = new_prot; + cur = st->current_prot; + + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->level = level; + st->marker = address_markers; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } else if (prot != cur || level != st->level || + st->current_address >= st->marker[1].start_address) { + /* Print the actual finished series */ + seq_printf(m, "0x%0*lx-0x%0*lx", + width, st->start_address, + width, st->current_address); + delta = (st->current_address - st->start_address) >> 10; + while (!(delta & 0x3ff) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + print_prot(m, st->current_prot, st->level); + if (st->current_address >= st->marker[1].start_address) { + st->marker++; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + st->start_address = st->current_address; + st->current_prot = new_prot; + st->level = level; + } +} + +/* + * The actual page table walker functions. In order to keep the implementation + * of print_prot() short, we only check and pass _PAGE_INVALID and _PAGE_RO + * flags to note_page() if a region, segment or page table entry is invalid or + * read-only. + * After all it's just a hint that the current level being walked contains an + * invalid or read-only entry. + */ +static void walk_pte_level(struct seq_file *m, struct pg_state *st, + pmd_t *pmd, unsigned long addr) +{ + unsigned int prot; + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { + st->current_address = addr; + pte = pte_offset_kernel(pmd, addr); + prot = pte_val(*pte) & (_PAGE_RO | _PAGE_INVALID); + note_page(m, st, prot, 4); + addr += PAGE_SIZE; + } +} + +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, + pud_t *pud, unsigned long addr) +{ + unsigned int prot; + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) { + st->current_address = addr; + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + if (pmd_large(*pmd)) { + prot = pmd_val(*pmd) & _SEGMENT_ENTRY_RO; + note_page(m, st, prot, 3); + } else + walk_pte_level(m, st, pmd, addr); + } else + note_page(m, st, _PAGE_INVALID, 3); + addr += PMD_SIZE; + } +} + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, + pgd_t *pgd, unsigned long addr) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) { + st->current_address = addr; + pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) + walk_pmd_level(m, st, pud, addr); + else + note_page(m, st, _PAGE_INVALID, 2); + addr += PUD_SIZE; + } +} + +static void walk_pgd_level(struct seq_file *m) +{ + unsigned long addr = 0; + struct pg_state st; + pgd_t *pgd; + int i; + + memset(&st, 0, sizeof(st)); + for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { + st.current_address = addr; + pgd = pgd_offset_k(addr); + if (!pgd_none(*pgd)) + walk_pud_level(m, &st, pgd, addr); + else + note_page(m, &st, _PAGE_INVALID, 1); + addr += PGDIR_SIZE; + } + /* Flush out the last page */ + st.current_address = max_addr; + note_page(m, &st, 0, 0); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + walk_pgd_level(m); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int pt_dump_init(void) +{ + /* + * Figure out the maximum virtual address being accessible with the + * kernel ASCE. We need this to keep the page table walker functions + * from accessing non-existent entries. + */ +#ifdef CONFIG_32BIT + max_addr = 1UL << 31; +#else + max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = 1UL << (max_addr * 11 + 31); + address_markers[MODULES_NR].start_address = MODULES_VADDR; +#endif + address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; + address_markers[VMALLOC_NR].start_address = VMALLOC_START; + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + return 0; +} +device_initcall(pt_dump_init); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index ac9122ca115..04ad4001a28 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -367,6 +367,7 @@ retry: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; } diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index eeaf8023851..60acb93a468 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -115,7 +115,16 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, pmd = *pmdp; barrier(); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush() has to serialize with + * smp_call_function() against our disabled IRQs, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_huge(pmd))) { if (!gup_huge_pmd(pmdp, pmd, addr, next, diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index b36537a5f43..00be01c4b4f 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -8,25 +8,38 @@ #include <asm/cacheflush.h> #include <asm/pgtable.h> +static pte_t *walk_page_table(unsigned long addr) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(addr); + if (pgd_none(*pgdp)) + return NULL; + pudp = pud_offset(pgdp, addr); + if (pud_none(*pudp)) + return NULL; + pmdp = pmd_offset(pudp, addr); + if (pmd_none(*pmdp) || pmd_large(*pmdp)) + return NULL; + ptep = pte_offset_kernel(pmdp, addr); + if (pte_none(*ptep)) + return NULL; + return ptep; +} + static void change_page_attr(unsigned long addr, int numpages, pte_t (*set) (pte_t)) { pte_t *ptep, pte; - pmd_t *pmdp; - pud_t *pudp; - pgd_t *pgdp; int i; for (i = 0; i < numpages; i++) { - pgdp = pgd_offset(&init_mm, addr); - pudp = pud_offset(pgdp, addr); - pmdp = pmd_offset(pudp, addr); - if (pmd_huge(*pmdp)) { - WARN_ON_ONCE(1); - continue; - } - ptep = pte_offset_kernel(pmdp, addr); - + ptep = walk_page_table(addr); + if (WARN_ON_ONCE(!ptep)) + break; pte = *ptep; pte = set(pte); __ptep_ipte(addr, ptep); @@ -40,21 +53,18 @@ int set_memory_ro(unsigned long addr, int numpages) change_page_attr(addr, numpages, pte_wrprotect); return 0; } -EXPORT_SYMBOL_GPL(set_memory_ro); int set_memory_rw(unsigned long addr, int numpages) { change_page_attr(addr, numpages, pte_mkwrite); return 0; } -EXPORT_SYMBOL_GPL(set_memory_rw); /* not possible */ int set_memory_nx(unsigned long addr, int numpages) { return 0; } -EXPORT_SYMBOL_GPL(set_memory_nx); int set_memory_x(unsigned long addr, int numpages) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b402991e43d..c8188a18af0 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -787,6 +787,30 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) tlb_table_flush(tlb); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void thp_split_vma(struct vm_area_struct *vma) +{ + unsigned long addr; + struct page *page; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + page = follow_page(vma, addr, FOLL_SPLIT); + } +} + +void thp_split_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + while (vma != NULL) { + thp_split_vma(vma); + vma->vm_flags &= ~VM_HUGEPAGE; + vma->vm_flags |= VM_NOHUGEPAGE; + vma = vma->vm_next; + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * switch on pgstes for its userspace process (for kvm) */ @@ -824,6 +848,12 @@ int s390_enable_sie(void) if (!mm) return -ENOMEM; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + mm->def_flags |= VM_NOHUGEPAGE; +#endif + /* Now lets check again if something happened */ task_lock(tsk); if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || @@ -866,3 +896,81 @@ bool kernel_page_present(struct page *page) return cc == 0; } #endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + /* No need to flush TLB + * On s390 reference bits are in storage key and never in TLB */ + return pmdp_test_and_clear_young(vma, address, pmdp); +} + +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (pmd_same(*pmdp, entry)) + return 0; + pmdp_invalidate(vma, address, pmdp); + set_pmd_at(vma->vm_mm, address, pmdp, entry); + return 1; +} + +static void pmdp_splitting_flush_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, + (unsigned long *) pmdp)) { + /* need to serialize against gup-fast (IRQ disabled) */ + smp_call_function(pmdp_splitting_flush_sync, NULL, 1); + } +} + +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) mm->pmd_huge_pte); + mm->pmd_huge_pte = pgtable; +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + struct list_head *lh; + pgtable_t pgtable; + pte_t *ptep; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + pte_val(*ptep) = _PAGE_TYPE_EMPTY; + ptep++; + pte_val(*ptep) = _PAGE_TYPE_EMPTY; + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index c22abf900c9..387c7c60b5b 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -79,7 +79,8 @@ static pte_t __ref *vmem_pte_alloc(unsigned long address) */ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) { - unsigned long address; + unsigned long end = start + size; + unsigned long address = start; pgd_t *pg_dir; pud_t *pu_dir; pmd_t *pm_dir; @@ -87,7 +88,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) pte_t pte; int ret = -ENOMEM; - for (address = start; address < start + size; address += PAGE_SIZE) { + while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { pu_dir = vmem_pud_alloc(); @@ -108,12 +109,11 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) pm_dir = pmd_offset(pu_dir, address); #if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC) - if (MACHINE_HAS_HPAGE && !(address & ~HPAGE_MASK) && - (address + HPAGE_SIZE <= start + size) && - (address >= HPAGE_SIZE)) { + if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && + !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) { pte_val(pte) |= _SEGMENT_ENTRY_LARGE; pmd_val(*pm_dir) = pte_val(pte); - address += HPAGE_SIZE - PAGE_SIZE; + address += PMD_SIZE; continue; } #endif @@ -126,10 +126,11 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) pt_dir = pte_offset_kernel(pm_dir, address); *pt_dir = pte; + address += PAGE_SIZE; } ret = 0; out: - flush_tlb_kernel_range(start, start + size); + flush_tlb_kernel_range(start, end); return ret; } @@ -139,7 +140,8 @@ out: */ static void vmem_remove_range(unsigned long start, unsigned long size) { - unsigned long address; + unsigned long end = start + size; + unsigned long address = start; pgd_t *pg_dir; pud_t *pu_dir; pmd_t *pm_dir; @@ -147,25 +149,32 @@ static void vmem_remove_range(unsigned long start, unsigned long size) pte_t pte; pte_val(pte) = _PAGE_TYPE_EMPTY; - for (address = start; address < start + size; address += PAGE_SIZE) { + while (address < end) { pg_dir = pgd_offset_k(address); + if (pgd_none(*pg_dir)) { + address += PGDIR_SIZE; + continue; + } pu_dir = pud_offset(pg_dir, address); - if (pud_none(*pu_dir)) + if (pud_none(*pu_dir)) { + address += PUD_SIZE; continue; + } pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) + if (pmd_none(*pm_dir)) { + address += PMD_SIZE; continue; - - if (pmd_huge(*pm_dir)) { + } + if (pmd_large(*pm_dir)) { pmd_clear(pm_dir); - address += HPAGE_SIZE - PAGE_SIZE; + address += PMD_SIZE; continue; } - pt_dir = pte_offset_kernel(pm_dir, address); *pt_dir = pte; + address += PAGE_SIZE; } - flush_tlb_kernel_range(start, start + size); + flush_tlb_kernel_range(start, end); } /* @@ -330,8 +339,8 @@ void __init vmem_map_init(void) unsigned long start, end; int i; - ro_start = ((unsigned long)&_stext) & PAGE_MASK; - ro_end = PFN_ALIGN((unsigned long)&_eshared); + ro_start = PFN_ALIGN((unsigned long)&_stext); + ro_end = (unsigned long)&_eshared & PAGE_MASK; for (i = 0; i < MEMORY_CHUNKS && memory_chunk[i].size > 0; i++) { if (memory_chunk[i].type == CHUNK_CRASHK || memory_chunk[i].type == CHUNK_OLDMEM) |