diff options
Diffstat (limited to 'arch/s390/mm')
-rw-r--r-- | arch/s390/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/s390/mm/cmm.c | 7 | ||||
-rw-r--r-- | arch/s390/mm/fault.c | 77 | ||||
-rw-r--r-- | arch/s390/mm/gup.c | 225 | ||||
-rw-r--r-- | arch/s390/mm/hugetlbpage.c | 2 | ||||
-rw-r--r-- | arch/s390/mm/init.c | 55 | ||||
-rw-r--r-- | arch/s390/mm/maccess.c | 4 | ||||
-rw-r--r-- | arch/s390/mm/pgtable.c | 173 |
8 files changed, 484 insertions, 61 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index eec05448441..6fbc6f3fbdf 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -3,6 +3,6 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \ - page-states.o + page-states.o gup.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index a9550dca3e4..c66ffd8dbbb 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -23,7 +23,10 @@ #include <asm/pgalloc.h> #include <asm/diag.h> -static char *sender = "VMRMSVM"; +#ifdef CONFIG_CMM_IUCV +static char *cmm_default_sender = "VMRMSVM"; +#endif +static char *sender; module_param(sender, charp, 0400); MODULE_PARM_DESC(sender, "Guest name that may send SMSG messages (default VMRMSVM)"); @@ -440,6 +443,8 @@ static int __init cmm_init(void) int len = strlen(sender); while (len--) sender[len] = toupper(sender[len]); + } else { + sender = cmm_default_sender; } rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2505b2ea0ef..fe5701e9efb 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -52,6 +52,14 @@ #define VM_FAULT_BADMAP 0x020000 #define VM_FAULT_BADACCESS 0x040000 +static unsigned long store_indication; + +void fault_init(void) +{ + if (test_facility(2) && test_facility(75)) + store_indication = 0xc00; +} + static inline int notify_page_fault(struct pt_regs *regs) { int ret = 0; @@ -199,14 +207,21 @@ static noinline void do_sigbus(struct pt_regs *regs, long int_code, unsigned long trans_exc_code) { struct task_struct *tsk = current; + unsigned long address; + struct siginfo si; /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ - tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK; + address = trans_exc_code & __FAIL_ADDR_MASK; + tsk->thread.prot_addr = address; tsk->thread.trap_no = int_code; - force_sig(SIGBUS, tsk); + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_ADRERR; + si.si_addr = (void __user *) address; + force_sig_info(SIGBUS, &si, tsk); } #ifdef CONFIG_S390_EXEC_PROTECT @@ -266,10 +281,11 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code, if (fault & VM_FAULT_OOM) pagefault_out_of_memory(); else if (fault & VM_FAULT_SIGBUS) { - do_sigbus(regs, int_code, trans_exc_code); /* Kernel mode? Handle exceptions or die */ if (!(regs->psw.mask & PSW_MASK_PSTATE)) do_no_context(regs, int_code, trans_exc_code); + else + do_sigbus(regs, int_code, trans_exc_code); } else BUG(); break; @@ -294,7 +310,7 @@ static inline int do_exception(struct pt_regs *regs, int access, struct mm_struct *mm; struct vm_area_struct *vma; unsigned long address; - int fault; + int fault, write; if (notify_page_fault(regs)) return 0; @@ -312,12 +328,6 @@ static inline int do_exception(struct pt_regs *regs, int access, goto out; address = trans_exc_code & __FAIL_ADDR_MASK; - /* - * When we get here, the fault happened in the current - * task's user address space, so we can switch on the - * interrupts again and then search the VMAs - */ - local_irq_enable(); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); down_read(&mm->mmap_sem); @@ -348,8 +358,10 @@ static inline int do_exception(struct pt_regs *regs, int access, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, - (access == VM_WRITE) ? FAULT_FLAG_WRITE : 0); + write = (access == VM_WRITE || + (trans_exc_code & store_indication) == 0x400) ? + FAULT_FLAG_WRITE : 0; + fault = handle_mm_fault(mm, vma, address, write); if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; @@ -374,20 +386,20 @@ out: return fault; } -void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) +void __kprobes do_protection_exception(struct pt_regs *regs, long pgm_int_code, + unsigned long trans_exc_code) { - unsigned long trans_exc_code = S390_lowcore.trans_exc_code; int fault; /* Protection exception is supressing, decrement psw address. */ - regs->psw.addr -= (int_code >> 16); + regs->psw.addr -= (pgm_int_code >> 16); /* * Check for low-address protection. This needs to be treated * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs, int_code, trans_exc_code); + do_low_address(regs, pgm_int_code, trans_exc_code); return; } fault = do_exception(regs, VM_WRITE, trans_exc_code); @@ -395,9 +407,9 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) do_fault_error(regs, 4, trans_exc_code, fault); } -void __kprobes do_dat_exception(struct pt_regs *regs, long int_code) +void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code, + unsigned long trans_exc_code) { - unsigned long trans_exc_code = S390_lowcore.trans_exc_code; int access, fault; access = VM_READ | VM_EXEC | VM_WRITE; @@ -408,21 +420,19 @@ void __kprobes do_dat_exception(struct pt_regs *regs, long int_code) #endif fault = do_exception(regs, access, trans_exc_code); if (unlikely(fault)) - do_fault_error(regs, int_code & 255, trans_exc_code, fault); + do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault); } #ifdef CONFIG_64BIT -void __kprobes do_asce_exception(struct pt_regs *regs, long int_code) +void __kprobes do_asce_exception(struct pt_regs *regs, long pgm_int_code, + unsigned long trans_exc_code) { - unsigned long trans_exc_code = S390_lowcore.trans_exc_code; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto no_context; - local_irq_enable(); - down_read(&mm->mmap_sem); vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK); up_read(&mm->mmap_sem); @@ -434,16 +444,16 @@ void __kprobes do_asce_exception(struct pt_regs *regs, long int_code) /* User mode accesses just cause a SIGSEGV */ if (regs->psw.mask & PSW_MASK_PSTATE) { - do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); + do_sigsegv(regs, pgm_int_code, SEGV_MAPERR, trans_exc_code); return; } no_context: - do_no_context(regs, int_code, trans_exc_code); + do_no_context(regs, pgm_int_code, trans_exc_code); } #endif -int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user) +int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write) { struct pt_regs regs; int access, fault; @@ -454,14 +464,14 @@ int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user) regs.psw.addr = (unsigned long) __builtin_return_address(0); regs.psw.addr |= PSW_ADDR_AMODE; uaddr &= PAGE_MASK; - access = write_user ? VM_WRITE : VM_READ; + access = write ? VM_WRITE : VM_READ; fault = do_exception(®s, access, uaddr | 2); if (unlikely(fault)) { if (fault & VM_FAULT_OOM) { pagefault_out_of_memory(); fault = 0; } else if (fault & VM_FAULT_SIGBUS) - do_sigbus(®s, int_code, uaddr); + do_sigbus(®s, pgm_int_code, uaddr); } return fault ? -EFAULT : 0; } @@ -527,7 +537,8 @@ void pfault_fini(void) : : "a" (&refbk), "m" (refbk) : "cc"); } -static void pfault_interrupt(__u16 int_code) +static void pfault_interrupt(unsigned int ext_int_code, + unsigned int param32, unsigned long param64) { struct task_struct *tsk; __u16 subcode; @@ -538,14 +549,18 @@ static void pfault_interrupt(__u16 int_code) * in the 'cpu address' field associated with the * external interrupt. */ - subcode = S390_lowcore.cpu_addr; + subcode = ext_int_code >> 16; if ((subcode & 0xff00) != __SUBCODE_MASK) return; /* * Get the token (= address of the task structure of the affected task). */ - tsk = *(struct task_struct **) __LC_PFAULT_INTPARM; +#ifdef CONFIG_64BIT + tsk = *(struct task_struct **) param64; +#else + tsk = *(struct task_struct **) param32; +#endif if (subcode & 0x0080) { /* signal bit is set -> a page has been swapped in by VM */ diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c new file mode 100644 index 00000000000..38e641cdd97 --- /dev/null +++ b/arch/s390/mm/gup.c @@ -0,0 +1,225 @@ +/* + * Lockless get_user_pages_fast for s390 + * + * Copyright IBM Corp. 2010 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/vmstat.h> +#include <linux/pagemap.h> +#include <linux/rwsem.h> +#include <asm/pgtable.h> + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask, result; + pte_t *ptep, pte; + struct page *page; + + result = write ? 0 : _PAGE_RO; + mask = result | _PAGE_INVALID | _PAGE_SPECIAL; + + ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr); + do { + pte = *ptep; + barrier(); + if ((pte_val(pte) & mask) != result) + return 0; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + if (!page_cache_get_speculative(page)) + return 0; + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + put_page(page); + return 0; + } + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + + return 1; +} + +static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask, result; + struct page *head, *page; + int refs; + + result = write ? 0 : _SEGMENT_ENTRY_RO; + mask = result | _SEGMENT_ENTRY_INV; + if ((pmd_val(pmd) & mask) != result) + return 0; + VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); + + refs = 0; + head = pmd_page(pmd); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { + *nr -= refs; + while (refs--) + put_page(head); + } + + return 1; +} + + +static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp, pmd; + + pmdp = (pmd_t *) pudp; +#ifdef CONFIG_64BIT + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pmdp = (pmd_t *) pud_deref(pud); + pmdp += pmd_index(addr); +#endif + do { + pmd = *pmdp; + barrier(); + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (unlikely(pmd_huge(pmd))) { + if (!gup_huge_pmd(pmdp, pmd, addr, next, + write, pages, nr)) + return 0; + } else if (!gup_pte_range(pmdp, pmd, addr, next, + write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp, pud; + + pudp = (pud_t *) pgdp; +#ifdef CONFIG_64BIT + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) + pudp = (pud_t *) pgd_deref(pgd); + pudp += pud_index(addr); +#endif + do { + pud = *pudp; + barrier(); + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp, pgd; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + goto slow_irqon; + + /* + * local_irq_disable() doesn't prevent pagetable teardown, but does + * prevent the pagetables from being freed on s390. + * + * So long as we atomically load page table pointers versus teardown, + * we can follow the address down to the the page and take a ref on it. + */ + local_irq_disable(); + pgdp = pgd_offset(mm, addr); + do { + pgd = *pgdp; + barrier(); + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; +slow: + local_irq_enable(); +slow_irqon: + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index f28c43d2f61..639cd21f221 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -68,7 +68,7 @@ void arch_release_hugepage(struct page *page) ptep = (pte_t *) page[1].index; if (!ptep) return; - pte_free(&init_mm, ptep); + page_table_free(&init_mm, (unsigned long *) ptep); page[1].index = 0; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 30eb6d02ddb..bb409332a48 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -38,19 +38,59 @@ #include <asm/tlbflush.h> #include <asm/sections.h> -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE))); -char empty_zero_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); +unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); +static unsigned long setup_zero_pages(void) +{ + struct cpuid cpu_id; + unsigned int order; + unsigned long size; + struct page *page; + int i; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x9672: /* g5 */ + case 0x2064: /* z900 */ + case 0x2066: /* z900 */ + case 0x2084: /* z990 */ + case 0x2086: /* z990 */ + case 0x2094: /* z9-109 */ + case 0x2096: /* z9-109 */ + order = 0; + break; + case 0x2097: /* z10 */ + case 0x2098: /* z10 */ + default: + order = 2; + break; + } + + empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!empty_zero_page) + panic("Out of memory in setup_zero_pages"); + + page = virt_to_page((void *) empty_zero_page); + split_page(page, order); + for (i = 1 << order; i > 0; i--) { + SetPageReserved(page); + page++; + } + + size = PAGE_SIZE << order; + zero_page_mask = (size - 1) & PAGE_MASK; + + return 1UL << order; +} + /* * paging_init() sets up the page tables */ void __init paging_init(void) { - static const int ssm_mask = 0x04000000L; unsigned long max_zone_pfns[MAX_NR_ZONES]; unsigned long pgd_type; @@ -72,7 +112,7 @@ void __init paging_init(void) __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); __ctl_load(S390_lowcore.kernel_asce, 13, 13); - __raw_local_irq_ssm(ssm_mask); + arch_local_irq_restore(4UL << (BITS_PER_LONG - 8)); atomic_set(&init_mm.context.attach_count, 1); @@ -84,6 +124,7 @@ void __init paging_init(void) #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; free_area_init_nodes(max_zone_pfns); + fault_init(); } void __init mem_init(void) @@ -93,14 +134,12 @@ void __init mem_init(void) max_mapnr = num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - /* Setup guest page hinting */ cmma_init(); /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); + totalram_pages -= setup_zero_pages(); /* Setup zeroed pages. */ reservedpages = 0; diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index a8c2af8c650..71a4b0d34be 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -71,7 +71,7 @@ int memcpy_real(void *dest, void *src, size_t count) if (!count) return 0; - flags = __raw_local_irq_stnsm(0xf8UL); + flags = __arch_local_irq_stnsm(0xf8UL); asm volatile ( "0: mvcle %1,%2,0x0\n" "1: jo 0b\n" @@ -82,6 +82,6 @@ int memcpy_real(void *dest, void *src, size_t count) "+d" (_len2), "=m" (*((long *) dest)) : "m" (*((long *) src)) : "cc", "memory"); - __raw_local_irq_ssm(flags); + arch_local_irq_restore(flags); return rc; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 8d999249d35..0c719c61972 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -15,6 +15,7 @@ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/quicklist.h> +#include <linux/rcupdate.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -23,6 +24,67 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> +struct rcu_table_freelist { + struct rcu_head rcu; + struct mm_struct *mm; + unsigned int pgt_index; + unsigned int crst_index; + unsigned long *table[0]; +}; + +#define RCU_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \ + / sizeof(unsigned long)) + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist); + +static void __page_table_free(struct mm_struct *mm, unsigned long *table); +static void __crst_table_free(struct mm_struct *mm, unsigned long *table); + +static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm) +{ + struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist); + struct rcu_table_freelist *batch = *batchp; + + if (batch) + return batch; + batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC); + if (batch) { + batch->mm = mm; + batch->pgt_index = 0; + batch->crst_index = RCU_FREELIST_SIZE; + *batchp = batch; + } + return batch; +} + +static void rcu_table_freelist_callback(struct rcu_head *head) +{ + struct rcu_table_freelist *batch = + container_of(head, struct rcu_table_freelist, rcu); + + while (batch->pgt_index > 0) + __page_table_free(batch->mm, batch->table[--batch->pgt_index]); + while (batch->crst_index < RCU_FREELIST_SIZE) + __crst_table_free(batch->mm, batch->table[batch->crst_index++]); + free_page((unsigned long) batch); +} + +void rcu_table_freelist_finish(void) +{ + struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist); + + if (!batch) + return; + call_rcu(&batch->rcu, rcu_table_freelist_callback); + __get_cpu_var(rcu_table_freelist) = NULL; +} + +static void smp_sync(void *arg) +{ +} + #ifndef CONFIG_64BIT #define ALLOC_ORDER 1 #define TABLES_PER_PAGE 4 @@ -78,25 +140,55 @@ unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) } page->index = page_to_phys(shadow); } - spin_lock(&mm->context.list_lock); + spin_lock_bh(&mm->context.list_lock); list_add(&page->lru, &mm->context.crst_list); - spin_unlock(&mm->context.list_lock); + spin_unlock_bh(&mm->context.list_lock); return (unsigned long *) page_to_phys(page); } -void crst_table_free(struct mm_struct *mm, unsigned long *table) +static void __crst_table_free(struct mm_struct *mm, unsigned long *table) { unsigned long *shadow = get_shadow_table(table); - struct page *page = virt_to_page(table); - spin_lock(&mm->context.list_lock); - list_del(&page->lru); - spin_unlock(&mm->context.list_lock); if (shadow) free_pages((unsigned long) shadow, ALLOC_ORDER); free_pages((unsigned long) table, ALLOC_ORDER); } +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct page *page = virt_to_page(table); + + spin_lock_bh(&mm->context.list_lock); + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + __crst_table_free(mm, table); +} + +void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table) +{ + struct rcu_table_freelist *batch; + struct page *page = virt_to_page(table); + + spin_lock_bh(&mm->context.list_lock); + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + if (atomic_read(&mm->mm_users) < 2 && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { + __crst_table_free(mm, table); + return; + } + batch = rcu_table_freelist_get(mm); + if (!batch) { + smp_call_function(smp_sync, NULL, 1); + __crst_table_free(mm, table); + return; + } + batch->table[--batch->crst_index] = table; + if (batch->pgt_index >= batch->crst_index) + rcu_table_freelist_finish(); +} + #ifdef CONFIG_64BIT int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) { @@ -108,7 +200,7 @@ repeat: table = crst_table_alloc(mm, mm->context.noexec); if (!table) return -ENOMEM; - spin_lock(&mm->page_table_lock); + spin_lock_bh(&mm->page_table_lock); if (mm->context.asce_limit < limit) { pgd = (unsigned long *) mm->pgd; if (mm->context.asce_limit <= (1UL << 31)) { @@ -130,7 +222,7 @@ repeat: mm->task_size = mm->context.asce_limit; table = NULL; } - spin_unlock(&mm->page_table_lock); + spin_unlock_bh(&mm->page_table_lock); if (table) crst_table_free(mm, table); if (mm->context.asce_limit < limit) @@ -182,7 +274,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) unsigned long bits; bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL; - spin_lock(&mm->context.list_lock); + spin_lock_bh(&mm->context.list_lock); page = NULL; if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, @@ -191,7 +283,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = NULL; } if (!page) { - spin_unlock(&mm->context.list_lock); + spin_unlock_bh(&mm->context.list_lock); page = alloc_page(GFP_KERNEL|__GFP_REPEAT); if (!page) return NULL; @@ -202,7 +294,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) clear_table_pgstes(table); else clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); - spin_lock(&mm->context.list_lock); + spin_lock_bh(&mm->context.list_lock); list_add(&page->lru, &mm->context.pgtable_list); } table = (unsigned long *) page_to_phys(page); @@ -213,10 +305,25 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page->flags |= bits; if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1)) list_move_tail(&page->lru, &mm->context.pgtable_list); - spin_unlock(&mm->context.list_lock); + spin_unlock_bh(&mm->context.list_lock); return table; } +static void __page_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct page *page; + unsigned long bits; + + bits = ((unsigned long) table) & 15; + table = (unsigned long *)(((unsigned long) table) ^ bits); + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + page->flags ^= bits; + if (!(page->flags & FRAG_MASK)) { + pgtable_page_dtor(page); + __free_page(page); + } +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { struct page *page; @@ -225,7 +332,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL; bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - spin_lock(&mm->context.list_lock); + spin_lock_bh(&mm->context.list_lock); page->flags ^= bits; if (page->flags & FRAG_MASK) { /* Page now has some free pgtable fragments. */ @@ -234,18 +341,48 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } else /* All fragments of the 4K page have been freed. */ list_del(&page->lru); - spin_unlock(&mm->context.list_lock); + spin_unlock_bh(&mm->context.list_lock); if (page) { pgtable_page_dtor(page); __free_page(page); } } +void page_table_free_rcu(struct mm_struct *mm, unsigned long *table) +{ + struct rcu_table_freelist *batch; + struct page *page; + unsigned long bits; + + if (atomic_read(&mm->mm_users) < 2 && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { + page_table_free(mm, table); + return; + } + batch = rcu_table_freelist_get(mm); + if (!batch) { + smp_call_function(smp_sync, NULL, 1); + page_table_free(mm, table); + return; + } + bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL; + bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + spin_lock_bh(&mm->context.list_lock); + /* Delayed freeing with rcu prevents reuse of pgtable fragments */ + list_del_init(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + table = (unsigned long *)(((unsigned long) table) | bits); + batch->table[batch->pgt_index++] = table; + if (batch->pgt_index >= batch->crst_index) + rcu_table_freelist_finish(); +} + void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) { struct page *page; - spin_lock(&mm->context.list_lock); + spin_lock_bh(&mm->context.list_lock); /* Free shadow region and segment tables. */ list_for_each_entry(page, &mm->context.crst_list, lru) if (page->index) { @@ -255,7 +392,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) /* "Free" second halves of page tables. */ list_for_each_entry(page, &mm->context.pgtable_list, lru) page->flags &= ~SECOND_HALVES; - spin_unlock(&mm->context.list_lock); + spin_unlock_bh(&mm->context.list_lock); mm->context.noexec = 0; update_mm(mm, tsk); } @@ -312,6 +449,8 @@ int s390_enable_sie(void) tsk->mm = tsk->active_mm = mm; preempt_disable(); update_mm(mm, tsk); + atomic_inc(&mm->context.attach_count); + atomic_dec(&old_mm->context.attach_count); cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); preempt_enable(); task_unlock(tsk); |