diff options
author | Martin Schwidefsky <schwidefsky@de.ibm.com> | 2012-11-07 13:17:37 +0100 |
---|---|---|
committer | Martin Schwidefsky <schwidefsky@de.ibm.com> | 2013-02-14 15:55:23 +0100 |
commit | abf09bed3cceadd809f0356065c2ada6cee90d4a (patch) | |
tree | b81cac34a4111f498cdef104a2b9c4c444faf0bd /arch/s390/include | |
parent | 486c0a0bc80d370471b21662bf03f04fbb37cdc6 (diff) |
s390/mm: implement software dirty bits
The s390 architecture is unique in respect to dirty page detection,
it uses the change bit in the per-page storage key to track page
modifications. All other architectures track dirty bits by means
of page table entries. This property of s390 has caused numerous
problems in the past, e.g. see git commit ef5d437f71afdf4a
"mm: fix XFS oops due to dirty pages without buffers on s390".
To avoid future issues in regard to per-page dirty bits convert
s390 to a fault based software dirty bit detection mechanism. All
user page table entries which are marked as clean will be hardware
read-only, even if the pte is supposed to be writable. A write by
the user process will trigger a protection fault which will cause
the user pte to be marked as dirty and the hardware read-only bit
is removed.
With this change the dirty bit in the storage key is irrelevant
for Linux as a host, but the storage key is still required for
KVM guests. The effect is that page_test_and_clear_dirty and the
related code can be removed. The referenced bit in the storage
key is still used by the page_test_and_clear_young primitive to
provide page age information.
For page cache pages of mappings with mapping_cap_account_dirty
there will not be any change in behavior as the dirty bit tracking
already uses read-only ptes to control the amount of dirty pages.
Only for swap cache pages and pages of mappings without
mapping_cap_account_dirty there can be additional protection faults.
To avoid an excessive number of additional faults the mk_pte
primitive checks for PageDirty if the pgprot value allows for writes
and pre-dirties the pte. That avoids all additional faults for
tmpfs and shmem pages until these pages are added to the swap cache.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Diffstat (limited to 'arch/s390/include')
-rw-r--r-- | arch/s390/include/asm/page.h | 22 | ||||
-rw-r--r-- | arch/s390/include/asm/pgtable.h | 131 | ||||
-rw-r--r-- | arch/s390/include/asm/sclp.h | 1 | ||||
-rw-r--r-- | arch/s390/include/asm/setup.h | 16 |
4 files changed, 97 insertions, 73 deletions
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index a86ad408407..75ce9b065f9 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -155,28 +155,6 @@ static inline int page_reset_referenced(unsigned long addr) #define _PAGE_ACC_BITS 0xf0 /* HW access control bits */ /* - * Test and clear dirty bit in storage key. - * We can't clear the changed bit atomically. This is a potential - * race against modification of the referenced bit. This function - * should therefore only be called if it is not mapped in any - * address space. - * - * Note that the bit gets set whenever page content is changed. That means - * also when the page is modified by DMA or from inside the kernel. - */ -#define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY -static inline int page_test_and_clear_dirty(unsigned long pfn, int mapped) -{ - unsigned char skey; - - skey = page_get_storage_key(pfn << PAGE_SHIFT); - if (!(skey & _PAGE_CHANGED)) - return 0; - page_set_storage_key(pfn << PAGE_SHIFT, skey & ~_PAGE_CHANGED, mapped); - return 1; -} - -/* * Test and clear referenced bit in storage key. */ #define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a009d4dd70c..97de1200c84 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -29,6 +29,7 @@ #ifndef __ASSEMBLY__ #include <linux/sched.h> #include <linux/mm_types.h> +#include <linux/page-flags.h> #include <asm/bug.h> #include <asm/page.h> @@ -221,13 +222,15 @@ extern unsigned long MODULES_END; /* Software bits in the page table entry */ #define _PAGE_SWT 0x001 /* SW pte type bit t */ #define _PAGE_SWX 0x002 /* SW pte type bit x */ -#define _PAGE_SWC 0x004 /* SW pte changed bit (for KVM) */ -#define _PAGE_SWR 0x008 /* SW pte referenced bit (for KVM) */ -#define _PAGE_SPECIAL 0x010 /* SW associated with special page */ +#define _PAGE_SWC 0x004 /* SW pte changed bit */ +#define _PAGE_SWR 0x008 /* SW pte referenced bit */ +#define _PAGE_SWW 0x010 /* SW pte write bit */ +#define _PAGE_SPECIAL 0x020 /* SW associated with special page */ #define __HAVE_ARCH_PTE_SPECIAL /* Set of bits not changed in pte_modify */ -#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_SWC | _PAGE_SWR) +#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_CO | \ + _PAGE_SWC | _PAGE_SWR) /* Six different types of pages. */ #define _PAGE_TYPE_EMPTY 0x400 @@ -321,6 +324,7 @@ extern unsigned long MODULES_END; /* Bits in the region table entry */ #define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */ +#define _REGION_ENTRY_RO 0x200 /* region protection bit */ #define _REGION_ENTRY_INV 0x20 /* invalid region table entry */ #define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ @@ -382,9 +386,10 @@ extern unsigned long MODULES_END; */ #define PAGE_NONE __pgprot(_PAGE_TYPE_NONE) #define PAGE_RO __pgprot(_PAGE_TYPE_RO) -#define PAGE_RW __pgprot(_PAGE_TYPE_RW) +#define PAGE_RW __pgprot(_PAGE_TYPE_RO | _PAGE_SWW) +#define PAGE_RWC __pgprot(_PAGE_TYPE_RW | _PAGE_SWW | _PAGE_SWC) -#define PAGE_KERNEL PAGE_RW +#define PAGE_KERNEL PAGE_RWC #define PAGE_SHARED PAGE_KERNEL #define PAGE_COPY PAGE_RO @@ -632,23 +637,23 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste) bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); /* Clear page changed & referenced bit in the storage key */ if (bits & _PAGE_CHANGED) - page_set_storage_key(address, skey ^ bits, 1); + page_set_storage_key(address, skey ^ bits, 0); else if (bits) page_reset_referenced(address); /* Transfer page changed & referenced bit to guest bits in pgste */ pgste_val(pgste) |= bits << 48; /* RCP_GR_BIT & RCP_GC_BIT */ /* Get host changed & referenced bits from pgste */ bits |= (pgste_val(pgste) & (RCP_HR_BIT | RCP_HC_BIT)) >> 52; - /* Clear host bits in pgste. */ + /* Transfer page changed & referenced bit to kvm user bits */ + pgste_val(pgste) |= bits << 45; /* KVM_UR_BIT & KVM_UC_BIT */ + /* Clear relevant host bits in pgste. */ pgste_val(pgste) &= ~(RCP_HR_BIT | RCP_HC_BIT); pgste_val(pgste) &= ~(RCP_ACC_BITS | RCP_FP_BIT); /* Copy page access key and fetch protection bit to pgste */ pgste_val(pgste) |= (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; - /* Transfer changed and referenced to kvm user bits */ - pgste_val(pgste) |= bits << 45; /* KVM_UR_BIT & KVM_UC_BIT */ - /* Transfer changed & referenced to pte sofware bits */ - pte_val(*ptep) |= bits << 1; /* _PAGE_SWR & _PAGE_SWC */ + /* Transfer referenced bit to pte */ + pte_val(*ptep) |= (bits & _PAGE_REFERENCED) << 1; #endif return pgste; @@ -661,20 +666,25 @@ static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste) if (!pte_present(*ptep)) return pgste; + /* Get referenced bit from storage key */ young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); - /* Transfer page referenced bit to pte software bit (host view) */ - if (young || (pgste_val(pgste) & RCP_HR_BIT)) + if (young) + pgste_val(pgste) |= RCP_GR_BIT; + /* Get host referenced bit from pgste */ + if (pgste_val(pgste) & RCP_HR_BIT) { + pgste_val(pgste) &= ~RCP_HR_BIT; + young = 1; + } + /* Transfer referenced bit to kvm user bits and pte */ + if (young) { + pgste_val(pgste) |= KVM_UR_BIT; pte_val(*ptep) |= _PAGE_SWR; - /* Clear host referenced bit in pgste. */ - pgste_val(pgste) &= ~RCP_HR_BIT; - /* Transfer page referenced bit to guest bit in pgste */ - pgste_val(pgste) |= (unsigned long) young << 50; /* set RCP_GR_BIT */ + } #endif return pgste; - } -static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) +static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry) { #ifdef CONFIG_PGSTE unsigned long address; @@ -688,10 +698,23 @@ static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) /* Set page access key and fetch protection bit from pgste */ nkey |= (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56; if (okey != nkey) - page_set_storage_key(address, nkey, 1); + page_set_storage_key(address, nkey, 0); #endif } +static inline void pgste_set_pte(pte_t *ptep, pte_t entry) +{ + if (!MACHINE_HAS_ESOP && (pte_val(entry) & _PAGE_SWW)) { + /* + * Without enhanced suppression-on-protection force + * the dirty bit on for all writable ptes. + */ + pte_val(entry) |= _PAGE_SWC; + pte_val(entry) &= ~_PAGE_RO; + } + *ptep = entry; +} + /** * struct gmap_struct - guest address space * @mm: pointer to the parent mm_struct @@ -750,11 +773,14 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste_set_pte(ptep, pgste, entry); - *ptep = entry; + pgste_set_key(ptep, pgste, entry); + pgste_set_pte(ptep, entry); pgste_set_unlock(ptep, pgste); - } else + } else { + if (!(pte_val(entry) & _PAGE_INVALID) && MACHINE_HAS_EDAT1) + pte_val(entry) |= _PAGE_CO; *ptep = entry; + } } /* @@ -763,16 +789,12 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, */ static inline int pte_write(pte_t pte) { - return (pte_val(pte) & _PAGE_RO) == 0; + return (pte_val(pte) & _PAGE_SWW) != 0; } static inline int pte_dirty(pte_t pte) { -#ifdef CONFIG_PGSTE - if (pte_val(pte) & _PAGE_SWC) - return 1; -#endif - return 0; + return (pte_val(pte) & _PAGE_SWC) != 0; } static inline int pte_young(pte_t pte) @@ -822,11 +844,14 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pte_val(pte) &= _PAGE_CHG_MASK; pte_val(pte) |= pgprot_val(newprot); + if ((pte_val(pte) & _PAGE_SWC) && (pte_val(pte) & _PAGE_SWW)) + pte_val(pte) &= ~_PAGE_RO; return pte; } static inline pte_t pte_wrprotect(pte_t pte) { + pte_val(pte) &= ~_PAGE_SWW; /* Do not clobber _PAGE_TYPE_NONE pages! */ if (!(pte_val(pte) & _PAGE_INVALID)) pte_val(pte) |= _PAGE_RO; @@ -835,20 +860,26 @@ static inline pte_t pte_wrprotect(pte_t pte) static inline pte_t pte_mkwrite(pte_t pte) { - pte_val(pte) &= ~_PAGE_RO; + pte_val(pte) |= _PAGE_SWW; + if (pte_val(pte) & _PAGE_SWC) + pte_val(pte) &= ~_PAGE_RO; return pte; } static inline pte_t pte_mkclean(pte_t pte) { -#ifdef CONFIG_PGSTE pte_val(pte) &= ~_PAGE_SWC; -#endif + /* Do not clobber _PAGE_TYPE_NONE pages! */ + if (!(pte_val(pte) & _PAGE_INVALID)) + pte_val(pte) |= _PAGE_RO; return pte; } static inline pte_t pte_mkdirty(pte_t pte) { + pte_val(pte) |= _PAGE_SWC; + if (pte_val(pte) & _PAGE_SWW) + pte_val(pte) &= ~_PAGE_RO; return pte; } @@ -886,10 +917,10 @@ static inline pte_t pte_mkhuge(pte_t pte) pte_val(pte) |= _SEGMENT_ENTRY_INV; } /* - * Clear SW pte bits SWT and SWX, there are no SW bits in a segment - * table entry. + * Clear SW pte bits, there are no SW bits in a segment table entry. */ - pte_val(pte) &= ~(_PAGE_SWT | _PAGE_SWX); + pte_val(pte) &= ~(_PAGE_SWT | _PAGE_SWX | _PAGE_SWC | + _PAGE_SWR | _PAGE_SWW); /* * Also set the change-override bit because we don't need dirty bit * tracking for hugetlbfs pages. @@ -1041,9 +1072,11 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long address, pte_t *ptep, pte_t pte) { - *ptep = pte; - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm)) { + pgste_set_pte(ptep, pte); pgste_set_unlock(ptep, *(pgste_t *)(ptep + PTRS_PER_PTE)); + } else + *ptep = pte; } #define __HAVE_ARCH_PTEP_CLEAR_FLUSH @@ -1111,10 +1144,13 @@ static inline pte_t ptep_set_wrprotect(struct mm_struct *mm, if (!mm_exclusive(mm)) __ptep_ipte(address, ptep); - *ptep = pte_wrprotect(pte); + pte = pte_wrprotect(pte); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm)) { + pgste_set_pte(ptep, pte); pgste_set_unlock(ptep, pgste); + } else + *ptep = pte; } return pte; } @@ -1132,10 +1168,12 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, pgste = pgste_get_lock(ptep); __ptep_ipte(address, ptep); - *ptep = entry; - if (mm_has_pgste(vma->vm_mm)) + if (mm_has_pgste(vma->vm_mm)) { + pgste_set_pte(ptep, entry); pgste_set_unlock(ptep, pgste); + } else + *ptep = entry; return 1; } @@ -1153,8 +1191,13 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) { unsigned long physpage = page_to_phys(page); + pte_t __pte = mk_pte_phys(physpage, pgprot); - return mk_pte_phys(physpage, pgprot); + if ((pte_val(__pte) & _PAGE_SWW) && PageDirty(page)) { + pte_val(__pte) |= _PAGE_SWC; + pte_val(__pte) &= ~_PAGE_RO; + } + return __pte; } #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) @@ -1246,6 +1289,8 @@ static inline int pmd_trans_splitting(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t entry) { + if (!(pmd_val(entry) & _SEGMENT_ENTRY_INV) && MACHINE_HAS_EDAT1) + pmd_val(entry) |= _SEGMENT_ENTRY_CO; *pmdp = entry; } diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 833788693f0..06a13613604 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -46,7 +46,6 @@ int sclp_cpu_deconfigure(u8 cpu); void sclp_facilities_detect(void); unsigned long long sclp_get_rnmax(void); unsigned long long sclp_get_rzm(void); -u8 sclp_get_fac85(void); int sclp_sdias_blk_count(void); int sclp_sdias_copy(void *dest, int blk_num, int nr_blks); int sclp_chp_configure(struct chp_id chpid); diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index f69f76b3447..f6857516e52 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -64,13 +64,14 @@ extern unsigned int s390_user_mode; #define MACHINE_FLAG_VM (1UL << 0) #define MACHINE_FLAG_IEEE (1UL << 1) -#define MACHINE_FLAG_CSP (1UL << 3) -#define MACHINE_FLAG_MVPG (1UL << 4) -#define MACHINE_FLAG_DIAG44 (1UL << 5) -#define MACHINE_FLAG_IDTE (1UL << 6) -#define MACHINE_FLAG_DIAG9C (1UL << 7) -#define MACHINE_FLAG_MVCOS (1UL << 8) -#define MACHINE_FLAG_KVM (1UL << 9) +#define MACHINE_FLAG_CSP (1UL << 2) +#define MACHINE_FLAG_MVPG (1UL << 3) +#define MACHINE_FLAG_DIAG44 (1UL << 4) +#define MACHINE_FLAG_IDTE (1UL << 5) +#define MACHINE_FLAG_DIAG9C (1UL << 6) +#define MACHINE_FLAG_MVCOS (1UL << 7) +#define MACHINE_FLAG_KVM (1UL << 8) +#define MACHINE_FLAG_ESOP (1UL << 9) #define MACHINE_FLAG_EDAT1 (1UL << 10) #define MACHINE_FLAG_EDAT2 (1UL << 11) #define MACHINE_FLAG_LPAR (1UL << 12) @@ -84,6 +85,7 @@ extern unsigned int s390_user_mode; #define MACHINE_IS_LPAR (S390_lowcore.machine_flags & MACHINE_FLAG_LPAR) #define MACHINE_HAS_DIAG9C (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C) +#define MACHINE_HAS_ESOP (S390_lowcore.machine_flags & MACHINE_FLAG_ESOP) #define MACHINE_HAS_PFMF MACHINE_HAS_EDAT1 #define MACHINE_HAS_HPAGE MACHINE_HAS_EDAT1 |