Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile       |   2
-rw-r--r--  arch/s390/mm/cmm.c          |   7
-rw-r--r--  arch/s390/mm/fault.c        |  77
-rw-r--r--  arch/s390/mm/gup.c          | 225
-rw-r--r--  arch/s390/mm/hugetlbpage.c  |   2
-rw-r--r--  arch/s390/mm/init.c         |  55
-rw-r--r--  arch/s390/mm/maccess.c      |   4
-rw-r--r--  arch/s390/mm/pgtable.c      | 173
8 files changed, 484 insertions, 61 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index eec05448441..6fbc6f3fbdf 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -3,6 +3,6 @@
#
obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o \
- page-states.o
+ page-states.o gup.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index a9550dca3e4..c66ffd8dbbb 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -23,7 +23,10 @@
#include <asm/pgalloc.h>
#include <asm/diag.h>
-static char *sender = "VMRMSVM";
+#ifdef CONFIG_CMM_IUCV
+static char *cmm_default_sender = "VMRMSVM";
+#endif
+static char *sender;
module_param(sender, charp, 0400);
MODULE_PARM_DESC(sender,
"Guest name that may send SMSG messages (default VMRMSVM)");
@@ -440,6 +443,8 @@ static int __init cmm_init(void)
int len = strlen(sender);
while (len--)
sender[len] = toupper(sender[len]);
+ } else {
+ sender = cmm_default_sender;
}
rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
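The cmm change makes the SMSG sender default conditional: the module parameter now starts out NULL, and cmm_init() either upper-cases a user-supplied guest name in place or falls back to the compile-time default. A minimal user-space sketch of that normalization, using only the names visible in the diff (normalize_sender itself is a hypothetical helper, not part of the patch):

    #include <ctype.h>
    #include <string.h>

    static char *cmm_default_sender = "VMRMSVM";

    /* Upper-case a user-supplied sender in place; fall back to the
     * compile-time default when no value was given. Mirrors the
     * cmm_init() logic shown above. */
    static char *normalize_sender(char *sender)
    {
        if (sender) {
            int len = strlen(sender);

            while (len--)
                sender[len] = toupper(sender[len]);
        } else {
            sender = cmm_default_sender;
        }
        return sender;
    }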
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 2505b2ea0ef..fe5701e9efb 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -52,6 +52,14 @@
#define VM_FAULT_BADMAP 0x020000
#define VM_FAULT_BADACCESS 0x040000
+static unsigned long store_indication;
+
+void fault_init(void)
+{
+ if (test_facility(2) && test_facility(75))
+ store_indication = 0xc00;
+}
+
static inline int notify_page_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -199,14 +207,21 @@ static noinline void do_sigbus(struct pt_regs *regs, long int_code,
unsigned long trans_exc_code)
{
struct task_struct *tsk = current;
+ unsigned long address;
+ struct siginfo si;
/*
* Send a sigbus, regardless of whether we were in kernel
* or user mode.
*/
- tsk->thread.prot_addr = trans_exc_code & __FAIL_ADDR_MASK;
+ address = trans_exc_code & __FAIL_ADDR_MASK;
+ tsk->thread.prot_addr = address;
tsk->thread.trap_no = int_code;
- force_sig(SIGBUS, tsk);
+ si.si_signo = SIGBUS;
+ si.si_errno = 0;
+ si.si_code = BUS_ADRERR;
+ si.si_addr = (void __user *) address;
+ force_sig_info(SIGBUS, &si, tsk);
}
#ifdef CONFIG_S390_EXEC_PROTECT
@@ -266,10 +281,11 @@ static noinline void do_fault_error(struct pt_regs *regs, long int_code,
if (fault & VM_FAULT_OOM)
pagefault_out_of_memory();
else if (fault & VM_FAULT_SIGBUS) {
- do_sigbus(regs, int_code, trans_exc_code);
/* Kernel mode? Handle exceptions or die */
if (!(regs->psw.mask & PSW_MASK_PSTATE))
do_no_context(regs, int_code, trans_exc_code);
+ else
+ do_sigbus(regs, int_code, trans_exc_code);
} else
BUG();
break;
@@ -294,7 +310,7 @@ static inline int do_exception(struct pt_regs *regs, int access,
struct mm_struct *mm;
struct vm_area_struct *vma;
unsigned long address;
- int fault;
+ int fault, write;
if (notify_page_fault(regs))
return 0;
@@ -312,12 +328,6 @@ static inline int do_exception(struct pt_regs *regs, int access,
goto out;
address = trans_exc_code & __FAIL_ADDR_MASK;
- /*
- * When we get here, the fault happened in the current
- * task's user address space, so we can switch on the
- * interrupts again and then search the VMAs
- */
- local_irq_enable();
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
down_read(&mm->mmap_sem);
@@ -348,8 +358,10 @@ static inline int do_exception(struct pt_regs *regs, int access,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address,
- (access == VM_WRITE) ? FAULT_FLAG_WRITE : 0);
+ write = (access == VM_WRITE ||
+ (trans_exc_code & store_indication) == 0x400) ?
+ FAULT_FLAG_WRITE : 0;
+ fault = handle_mm_fault(mm, vma, address, write);
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
@@ -374,20 +386,20 @@ out:
return fault;
}
-void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_protection_exception(struct pt_regs *regs, long pgm_int_code,
+ unsigned long trans_exc_code)
{
- unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
int fault;
/* Protection exception is suppressing, decrement psw address. */
- regs->psw.addr -= (int_code >> 16);
+ regs->psw.addr -= (pgm_int_code >> 16);
/*
* Check for low-address protection. This needs to be treated
* as a special case because the translation exception code
* field is not guaranteed to contain valid data in this case.
*/
if (unlikely(!(trans_exc_code & 4))) {
- do_low_address(regs, int_code, trans_exc_code);
+ do_low_address(regs, pgm_int_code, trans_exc_code);
return;
}
fault = do_exception(regs, VM_WRITE, trans_exc_code);
@@ -395,9 +407,9 @@ void __kprobes do_protection_exception(struct pt_regs *regs, long int_code)
do_fault_error(regs, 4, trans_exc_code, fault);
}
-void __kprobes do_dat_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_dat_exception(struct pt_regs *regs, long pgm_int_code,
+ unsigned long trans_exc_code)
{
- unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
int access, fault;
access = VM_READ | VM_EXEC | VM_WRITE;
@@ -408,21 +420,19 @@ void __kprobes do_dat_exception(struct pt_regs *regs, long int_code)
#endif
fault = do_exception(regs, access, trans_exc_code);
if (unlikely(fault))
- do_fault_error(regs, int_code & 255, trans_exc_code, fault);
+ do_fault_error(regs, pgm_int_code & 255, trans_exc_code, fault);
}
#ifdef CONFIG_64BIT
-void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
+void __kprobes do_asce_exception(struct pt_regs *regs, long pgm_int_code,
+ unsigned long trans_exc_code)
{
- unsigned long trans_exc_code = S390_lowcore.trans_exc_code;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm))
goto no_context;
- local_irq_enable();
-
down_read(&mm->mmap_sem);
vma = find_vma(mm, trans_exc_code & __FAIL_ADDR_MASK);
up_read(&mm->mmap_sem);
@@ -434,16 +444,16 @@ void __kprobes do_asce_exception(struct pt_regs *regs, long int_code)
/* User mode accesses just cause a SIGSEGV */
if (regs->psw.mask & PSW_MASK_PSTATE) {
- do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code);
+ do_sigsegv(regs, pgm_int_code, SEGV_MAPERR, trans_exc_code);
return;
}
no_context:
- do_no_context(regs, int_code, trans_exc_code);
+ do_no_context(regs, pgm_int_code, trans_exc_code);
}
#endif
-int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
+int __handle_fault(unsigned long uaddr, unsigned long pgm_int_code, int write)
{
struct pt_regs regs;
int access, fault;
@@ -454,14 +464,14 @@ int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user)
regs.psw.addr = (unsigned long) __builtin_return_address(0);
regs.psw.addr |= PSW_ADDR_AMODE;
uaddr &= PAGE_MASK;
- access = write_user ? VM_WRITE : VM_READ;
+ access = write ? VM_WRITE : VM_READ;
fault = do_exception(&regs, access, uaddr | 2);
if (unlikely(fault)) {
if (fault & VM_FAULT_OOM) {
pagefault_out_of_memory();
fault = 0;
} else if (fault & VM_FAULT_SIGBUS)
- do_sigbus(&regs, int_code, uaddr);
+ do_sigbus(&regs, pgm_int_code, uaddr);
}
return fault ? -EFAULT : 0;
}
@@ -527,7 +537,8 @@ void pfault_fini(void)
: : "a" (&refbk), "m" (refbk) : "cc");
}
-static void pfault_interrupt(__u16 int_code)
+static void pfault_interrupt(unsigned int ext_int_code,
+ unsigned int param32, unsigned long param64)
{
struct task_struct *tsk;
__u16 subcode;
@@ -538,14 +549,18 @@ static void pfault_interrupt(__u16 int_code)
* in the 'cpu address' field associated with the
* external interrupt.
*/
- subcode = S390_lowcore.cpu_addr;
+ subcode = ext_int_code >> 16;
if ((subcode & 0xff00) != __SUBCODE_MASK)
return;
/*
* Get the token (= address of the task structure of the affected task).
*/
- tsk = *(struct task_struct **) __LC_PFAULT_INTPARM;
+#ifdef CONFIG_64BIT
+ tsk = *(struct task_struct **) param64;
+#else
+ tsk = *(struct task_struct **) param32;
+#endif
if (subcode & 0x0080) {
/* signal bit is set -> a page has been swapped in by VM */
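Two fault.c changes are worth spelling out. First, do_sigbus() now fills in a proper siginfo (BUS_ADRERR plus the faulting address) and is only raised for user-mode faults. Second, fault_init() enables store_indication when facilities 2 and 75 are installed; the translation-exception code then reports whether the faulting access was a store, so do_exception() can request FAULT_FLAG_WRITE up front instead of waiting for a second protection fault. A sketch of that write decision, assuming the 0xc00 mask and 0x400 store pattern shown in the diff (the helper name is illustrative):

    #include <linux/mm.h>

    /* Derive the fault flag the way do_exception() now does:
     * store_indication is 0xc00 when the hardware reports fetch/store
     * indication, 0 otherwise; the 0x400 pattern marks a store. */
    static inline unsigned int fault_write_flag(int access,
                                                unsigned long trans_exc_code,
                                                unsigned long store_indication)
    {
        if (access == VM_WRITE ||
            (trans_exc_code & store_indication) == 0x400)
            return FAULT_FLAG_WRITE;
        return 0;
    }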
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
new file mode 100644
index 00000000000..38e641cdd97
--- /dev/null
+++ b/arch/s390/mm/gup.c
@@ -0,0 +1,225 @@
+/*
+ * Lockless get_user_pages_fast for s390
+ *
+ * Copyright IBM Corp. 2010
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask, result;
+ pte_t *ptep, pte;
+ struct page *page;
+
+ result = write ? 0 : _PAGE_RO;
+ mask = result | _PAGE_INVALID | _PAGE_SPECIAL;
+
+ ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr);
+ do {
+ pte = *ptep;
+ barrier();
+ if ((pte_val(pte) & mask) != result)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ if (!page_cache_get_speculative(page))
+ return 0;
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_page(page);
+ return 0;
+ }
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ return 1;
+}
+
+static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask, result;
+ struct page *head, *page;
+ int refs;
+
+ result = write ? 0 : _SEGMENT_ENTRY_RO;
+ mask = result | _SEGMENT_ENTRY_INV;
+ if ((pmd_val(pmd) & mask) != result)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT));
+
+ refs = 0;
+ head = pmd_page(pmd);
+ page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp, pmd;
+
+ pmdp = (pmd_t *) pudp;
+#ifdef CONFIG_64BIT
+ if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ pmdp = (pmd_t *) pud_deref(pud);
+ pmdp += pmd_index(addr);
+#endif
+ do {
+ pmd = *pmdp;
+ barrier();
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (unlikely(pmd_huge(pmd))) {
+ if (!gup_huge_pmd(pmdp, pmd, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (!gup_pte_range(pmdp, pmd, addr, next,
+ write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+
+ pudp = (pud_t *) pgdp;
+#ifdef CONFIG_64BIT
+ if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
+ pudp = (pud_t *) pgd_deref(pgd);
+ pudp += pud_index(addr);
+#endif
+ do {
+ pud = *pudp;
+ barrier();
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (end < start)
+ goto slow_irqon;
+
+ /*
+ * local_irq_disable() doesn't prevent pagetable teardown, but does
+ * prevent the pagetables from being freed on s390.
+ *
+ * So long as we atomically load page table pointers versus teardown,
+ * we can follow the address down to the page and take a ref on it.
+ */
+ local_irq_disable();
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd = *pgdp;
+ barrier();
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_enable();
+
+ VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+ return nr;
+
+ {
+ int ret;
+slow:
+ local_irq_enable();
+slow_irqon:
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+
+ return ret;
+ }
+}
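The docstring above describes the contract; a hypothetical caller looks like the following, pinning a user buffer for writing, using it, and dropping every reference afterwards (pin_user_buffer and its error handling are illustrative, not part of the patch):

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /* Hypothetical caller: pin nr_pages of a user buffer for writing,
     * access them, then release each reference with put_page(). */
    static int pin_user_buffer(unsigned long uaddr, int nr_pages,
                               struct page **pages)
    {
        int i, pinned;

        pinned = get_user_pages_fast(uaddr, nr_pages, 1, pages);
        if (pinned < 0)
            return pinned;          /* nothing was pinned */

        /* ... kmap/access the pinned pages here ... */

        for (i = 0; i < pinned; i++)
            put_page(pages[i]);

        return pinned == nr_pages ? 0 : -EFAULT;
    }

Note the fallback structure: get_user_pages_fast() walks the tables with interrupts disabled, and any miss sends the remainder of the range through the slow, mmap_sem-protected get_user_pages() path.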
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index f28c43d2f61..639cd21f221 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -68,7 +68,7 @@ void arch_release_hugepage(struct page *page)
ptep = (pte_t *) page[1].index;
if (!ptep)
return;
- pte_free(&init_mm, ptep);
+ page_table_free(&init_mm, (unsigned long *) ptep);
page[1].index = 0;
}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 30eb6d02ddb..bb409332a48 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -38,19 +38,59 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
-char empty_zero_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
+unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
+static unsigned long setup_zero_pages(void)
+{
+ struct cpuid cpu_id;
+ unsigned int order;
+ unsigned long size;
+ struct page *page;
+ int i;
+
+ get_cpu_id(&cpu_id);
+ switch (cpu_id.machine) {
+ case 0x9672: /* g5 */
+ case 0x2064: /* z900 */
+ case 0x2066: /* z900 */
+ case 0x2084: /* z990 */
+ case 0x2086: /* z990 */
+ case 0x2094: /* z9-109 */
+ case 0x2096: /* z9-109 */
+ order = 0;
+ break;
+ case 0x2097: /* z10 */
+ case 0x2098: /* z10 */
+ default:
+ order = 2;
+ break;
+ }
+
+ empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+ if (!empty_zero_page)
+ panic("Out of memory in setup_zero_pages");
+
+ page = virt_to_page((void *) empty_zero_page);
+ split_page(page, order);
+ for (i = 1 << order; i > 0; i--) {
+ SetPageReserved(page);
+ page++;
+ }
+
+ size = PAGE_SIZE << order;
+ zero_page_mask = (size - 1) & PAGE_MASK;
+
+ return 1UL << order;
+}
+
/*
* paging_init() sets up the page tables
*/
void __init paging_init(void)
{
- static const int ssm_mask = 0x04000000L;
unsigned long max_zone_pfns[MAX_NR_ZONES];
unsigned long pgd_type;
@@ -72,7 +112,7 @@ void __init paging_init(void)
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
__ctl_load(S390_lowcore.kernel_asce, 7, 7);
__ctl_load(S390_lowcore.kernel_asce, 13, 13);
- __raw_local_irq_ssm(ssm_mask);
+ arch_local_irq_restore(4UL << (BITS_PER_LONG - 8));
atomic_set(&init_mm.context.attach_count, 1);
@@ -84,6 +124,7 @@ void __init paging_init(void)
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
free_area_init_nodes(max_zone_pfns);
+ fault_init();
}
void __init mem_init(void)
@@ -93,14 +134,12 @@ void __init mem_init(void)
max_mapnr = num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
- /* clear the zero-page */
- memset(empty_zero_page, 0, PAGE_SIZE);
-
/* Setup guest page hinting */
cmma_init();
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
+ totalram_pages -= setup_zero_pages(); /* Setup zeroed pages. */
reservedpages = 0;
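setup_zero_pages() replaces the single static zero page with 1 << order zeroed pages (order 2 on z10-class machines, order 0 on older models), and zero_page_mask lets the faulting virtual address pick one of them, spreading read-only zero mappings across cache colors. The selection amounts to the following sketch, which mirrors the s390 ZERO_PAGE() definition in asm/pgtable.h (shown here for illustration, not as a new interface):

    /* Zero-page selection: the low bits of the virtual address,
     * masked to a page boundary, index into the block of zero pages
     * allocated by setup_zero_pages(). */
    #define ZERO_PAGE(vaddr) \
        (virt_to_page((void *)(empty_zero_page + \
         (((unsigned long)(vaddr)) & zero_page_mask))))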
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index a8c2af8c650..71a4b0d34be 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -71,7 +71,7 @@ int memcpy_real(void *dest, void *src, size_t count)
if (!count)
return 0;
- flags = __raw_local_irq_stnsm(0xf8UL);
+ flags = __arch_local_irq_stnsm(0xf8UL);
asm volatile (
"0: mvcle %1,%2,0x0\n"
"1: jo 0b\n"
@@ -82,6 +82,6 @@ int memcpy_real(void *dest, void *src, size_t count)
"+d" (_len2), "=m" (*((long *) dest))
: "m" (*((long *) src))
: "cc", "memory");
- __raw_local_irq_ssm(flags);
+ arch_local_irq_restore(flags);
return rc;
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 8d999249d35..0c719c61972 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -15,6 +15,7 @@
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
+#include <linux/rcupdate.h>
#include <asm/system.h>
#include <asm/pgtable.h>
@@ -23,6 +24,67 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+struct rcu_table_freelist {
+ struct rcu_head rcu;
+ struct mm_struct *mm;
+ unsigned int pgt_index;
+ unsigned int crst_index;
+ unsigned long *table[0];
+};
+
+#define RCU_FREELIST_SIZE \
+ ((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
+ / sizeof(unsigned long))
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);
+
+static void __page_table_free(struct mm_struct *mm, unsigned long *table);
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table);
+
+static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
+{
+ struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
+ struct rcu_table_freelist *batch = *batchp;
+
+ if (batch)
+ return batch;
+ batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
+ if (batch) {
+ batch->mm = mm;
+ batch->pgt_index = 0;
+ batch->crst_index = RCU_FREELIST_SIZE;
+ *batchp = batch;
+ }
+ return batch;
+}
+
+static void rcu_table_freelist_callback(struct rcu_head *head)
+{
+ struct rcu_table_freelist *batch =
+ container_of(head, struct rcu_table_freelist, rcu);
+
+ while (batch->pgt_index > 0)
+ __page_table_free(batch->mm, batch->table[--batch->pgt_index]);
+ while (batch->crst_index < RCU_FREELIST_SIZE)
+ __crst_table_free(batch->mm, batch->table[batch->crst_index++]);
+ free_page((unsigned long) batch);
+}
+
+void rcu_table_freelist_finish(void)
+{
+ struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);
+
+ if (!batch)
+ return;
+ call_rcu(&batch->rcu, rcu_table_freelist_callback);
+ __get_cpu_var(rcu_table_freelist) = NULL;
+}
+
+static void smp_sync(void *arg)
+{
+}
+
#ifndef CONFIG_64BIT
#define ALLOC_ORDER 1
#define TABLES_PER_PAGE 4
@@ -78,25 +140,55 @@ unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
}
page->index = page_to_phys(shadow);
}
- spin_lock(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.list_lock);
list_add(&page->lru, &mm->context.crst_list);
- spin_unlock(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.list_lock);
return (unsigned long *) page_to_phys(page);
}
-void crst_table_free(struct mm_struct *mm, unsigned long *table)
+static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
{
unsigned long *shadow = get_shadow_table(table);
- struct page *page = virt_to_page(table);
- spin_lock(&mm->context.list_lock);
- list_del(&page->lru);
- spin_unlock(&mm->context.list_lock);
if (shadow)
free_pages((unsigned long) shadow, ALLOC_ORDER);
free_pages((unsigned long) table, ALLOC_ORDER);
}
+void crst_table_free(struct mm_struct *mm, unsigned long *table)
+{
+ struct page *page = virt_to_page(table);
+
+ spin_lock_bh(&mm->context.list_lock);
+ list_del(&page->lru);
+ spin_unlock_bh(&mm->context.list_lock);
+ __crst_table_free(mm, table);
+}
+
+void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+ struct rcu_table_freelist *batch;
+ struct page *page = virt_to_page(table);
+
+ spin_lock_bh(&mm->context.list_lock);
+ list_del(&page->lru);
+ spin_unlock_bh(&mm->context.list_lock);
+ if (atomic_read(&mm->mm_users) < 2 &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+ __crst_table_free(mm, table);
+ return;
+ }
+ batch = rcu_table_freelist_get(mm);
+ if (!batch) {
+ smp_call_function(smp_sync, NULL, 1);
+ __crst_table_free(mm, table);
+ return;
+ }
+ batch->table[--batch->crst_index] = table;
+ if (batch->pgt_index >= batch->crst_index)
+ rcu_table_freelist_finish();
+}
+
#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
@@ -108,7 +200,7 @@ repeat:
table = crst_table_alloc(mm, mm->context.noexec);
if (!table)
return -ENOMEM;
- spin_lock(&mm->page_table_lock);
+ spin_lock_bh(&mm->page_table_lock);
if (mm->context.asce_limit < limit) {
pgd = (unsigned long *) mm->pgd;
if (mm->context.asce_limit <= (1UL << 31)) {
@@ -130,7 +222,7 @@ repeat:
mm->task_size = mm->context.asce_limit;
table = NULL;
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock_bh(&mm->page_table_lock);
if (table)
crst_table_free(mm, table);
if (mm->context.asce_limit < limit)
@@ -182,7 +274,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
unsigned long bits;
bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
- spin_lock(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.list_lock);
page = NULL;
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
@@ -191,7 +283,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
page = NULL;
}
if (!page) {
- spin_unlock(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.list_lock);
page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
if (!page)
return NULL;
@@ -202,7 +294,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
clear_table_pgstes(table);
else
clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
- spin_lock(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.list_lock);
list_add(&page->lru, &mm->context.pgtable_list);
}
table = (unsigned long *) page_to_phys(page);
@@ -213,10 +305,25 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
page->flags |= bits;
if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
list_move_tail(&page->lru, &mm->context.pgtable_list);
- spin_unlock(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.list_lock);
return table;
}
+static void __page_table_free(struct mm_struct *mm, unsigned long *table)
+{
+ struct page *page;
+ unsigned long bits;
+
+ bits = ((unsigned long) table) & 15;
+ table = (unsigned long *)(((unsigned long) table) ^ bits);
+ page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ page->flags ^= bits;
+ if (!(page->flags & FRAG_MASK)) {
+ pgtable_page_dtor(page);
+ __free_page(page);
+ }
+}
+
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
struct page *page;
@@ -225,7 +332,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
- spin_lock(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.list_lock);
page->flags ^= bits;
if (page->flags & FRAG_MASK) {
/* Page now has some free pgtable fragments. */
@@ -234,18 +341,48 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
} else
/* All fragments of the 4K page have been freed. */
list_del(&page->lru);
- spin_unlock(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.list_lock);
if (page) {
pgtable_page_dtor(page);
__free_page(page);
}
}
+void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+{
+ struct rcu_table_freelist *batch;
+ struct page *page;
+ unsigned long bits;
+
+ if (atomic_read(&mm->mm_users) < 2 &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
+ page_table_free(mm, table);
+ return;
+ }
+ batch = rcu_table_freelist_get(mm);
+ if (!batch) {
+ smp_call_function(smp_sync, NULL, 1);
+ page_table_free(mm, table);
+ return;
+ }
+ bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
+ bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
+ page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ spin_lock_bh(&mm->context.list_lock);
+ /* Delayed freeing with rcu prevents reuse of pgtable fragments */
+ list_del_init(&page->lru);
+ spin_unlock_bh(&mm->context.list_lock);
+ table = (unsigned long *)(((unsigned long) table) | bits);
+ batch->table[batch->pgt_index++] = table;
+ if (batch->pgt_index >= batch->crst_index)
+ rcu_table_freelist_finish();
+}
+
void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
struct page *page;
- spin_lock(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.list_lock);
/* Free shadow region and segment tables. */
list_for_each_entry(page, &mm->context.crst_list, lru)
if (page->index) {
@@ -255,7 +392,7 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
/* "Free" second halves of page tables. */
list_for_each_entry(page, &mm->context.pgtable_list, lru)
page->flags &= ~SECOND_HALVES;
- spin_unlock(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.list_lock);
mm->context.noexec = 0;
update_mm(mm, tsk);
}
@@ -312,6 +449,8 @@ int s390_enable_sie(void)
tsk->mm = tsk->active_mm = mm;
preempt_disable();
update_mm(mm, tsk);
+ atomic_inc(&mm->context.attach_count);
+ atomic_dec(&old_mm->context.attach_count);
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
preempt_enable();
task_unlock(tsk);
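The RCU freelist machinery added to pgtable.c exists so that the lockless walkers in gup.c never step into a page table that has just been freed: page_table_free_rcu() and crst_table_free_rcu() batch tables per cpu and hand the batch to call_rcu(), falling back to an smp_call_function() synchronization when no batch page can be allocated. A hypothetical mm-teardown caller, assuming only the interfaces added in this diff:

    /* Hypothetical teardown path: queue tables for RCU-delayed
     * freeing while unmapping, then push out the per-cpu batch so
     * the RCU callback can run. */
    static void example_free_tables(struct mm_struct *mm,
                                    unsigned long *page_table,
                                    unsigned long *crst_table)
    {
        page_table_free_rcu(mm, page_table);    /* pte fragment */
        crst_table_free_rcu(mm, crst_table);    /* region/segment table */
        rcu_table_freelist_finish();            /* schedule call_rcu() */
    }

The single-user fast path in both free_rcu variants (mm_users < 2 and the mm attached only to the current cpu) skips the grace period entirely, since no other cpu can be walking those tables.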