s390/mm: implement software dirty bits

The s390 architecture is unique in respect to dirty page detection, it uses the change bit in the per-page storage key to track page modifications. All other architectures track dirty bits by means of page table entries. This property of s390 has caused numerous problems in the past, e.g. see git commit ef5d437f71afdf4a "mm: fix XFS oops due to dirty pages without buffers on s390". To avoid future issues in regard to per-page dirty bits convert s390 to a fault based software dirty bit detection mechanism. All user page table entries which are marked as clean will be hardware read-only, even if the pte is supposed to be writable. A write by the user process will trigger a protection fault which will cause the user pte to be marked as dirty and the hardware read-only bit is removed. With this change the dirty bit in the storage key is irrelevant for Linux as a host, but the storage key is still required for KVM guests. The effect is that page_test_and_clear_dirty and the related code can be removed. The referenced bit in the storage key is still used by the page_test_and_clear_young primitive to provide page age information. For page cache pages of mappings with mapping_cap_account_dirty there will not be any change in behavior as the dirty bit tracking already uses read-only ptes to control the amount of dirty pages. Only for swap cache pages and pages of mappings without mapping_cap_account_dirty there can be additional protection faults. To avoid an excessive number of additional faults the mk_pte primitive checks for PageDirty if the pgprot value allows for writes and pre-dirties the pte. That avoids all additional faults for tmpfs and shmem pages until these pages are added to the swap cache. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
author: Martin Schwidefsky <schwidefsky@de.ibm.com> 2012-11-07 13:17:37 +0100
committer: Martin Schwidefsky <schwidefsky@de.ibm.com> 2013-02-14 15:55:23 +0100
commit: abf09bed3cceadd809f0356065c2ada6cee90d4a (patch)
tree: b81cac34a4111f498cdef104a2b9c4c444faf0bd /arch/s390/mm/vmem.c
parent: 486c0a0bc80d370471b21662bf03f04fbb37cdc6 (diff)
1 files changed, 10 insertions, 14 deletions
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 6ed1426d27c..79699f46a44 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -85,11 +85,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 	pud_t *pu_dir;
 	pmd_t *pm_dir;
 	pte_t *pt_dir;
-	pte_t  pte;
 	int ret = -ENOMEM;
 
 	while (address < end) {
-		pte = mk_pte_phys(address, __pgprot(ro ? _PAGE_RO : 0));
 		pg_dir = pgd_offset_k(address);
 		if (pgd_none(*pg_dir)) {
 			pu_dir = vmem_pud_alloc();
@@ -101,9 +99,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 #if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC)
 		if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
 		    !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) {
-			pte_val(pte) |= _REGION3_ENTRY_LARGE;
-			pte_val(pte) |= _REGION_ENTRY_TYPE_R3;
-			pud_val(*pu_dir) = pte_val(pte);
+			pud_val(*pu_dir) = __pa(address) |
+				_REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE |
+				(ro ? _REGION_ENTRY_RO : 0);
 			address += PUD_SIZE;
 			continue;
 		}
@@ -118,8 +116,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 #if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC)
 		if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
 		    !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) {
-			pte_val(pte) |= _SEGMENT_ENTRY_LARGE;
-			pmd_val(*pm_dir) = pte_val(pte);
+			pmd_val(*pm_dir) = __pa(address) |
+				_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
+				(ro ? _SEGMENT_ENTRY_RO : 0);
 			address += PMD_SIZE;
 			continue;
 		}
@@ -132,7 +131,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 		}
 
 		pt_dir = pte_offset_kernel(pm_dir, address);
-		*pt_dir = pte;
+		pte_val(*pt_dir) = __pa(address) | (ro ? _PAGE_RO : 0);
 		address += PAGE_SIZE;
 	}
 	ret = 0;
@@ -199,7 +198,6 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
 	pud_t *pu_dir;
 	pmd_t *pm_dir;
 	pte_t *pt_dir;
-	pte_t  pte;
 	int ret = -ENOMEM;
 
 	start_addr = (unsigned long) start;
@@ -237,9 +235,8 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
 				new_page = vmemmap_alloc_block(PMD_SIZE, node);
 				if (!new_page)
 					goto out;
-				pte = mk_pte_phys(__pa(new_page), PAGE_RW);
-				pte_val(pte) |= _SEGMENT_ENTRY_LARGE;
-				pmd_val(*pm_dir) = pte_val(pte);
+				pmd_val(*pm_dir) = __pa(new_page) |
+					_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE;
 				address = (address + PMD_SIZE) & PMD_MASK;
 				continue;
 			}
@@ -260,8 +257,7 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
 			new_page =__pa(vmem_alloc_pages(0));
 			if (!new_page)
 				goto out;
-			pte = pfn_pte(new_page >> PAGE_SHIFT, PAGE_KERNEL);
-			*pt_dir = pte;
+			pte_val(*pt_dir) = __pa(new_page);
 		}
 		address += PAGE_SIZE;
 	}
author	Martin Schwidefsky <schwidefsky@de.ibm.com>	2012-11-07 13:17:37 +0100
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2013-02-14 15:55:23 +0100
commit	abf09bed3cceadd809f0356065c2ada6cee90d4a (patch)
tree	b81cac34a4111f498cdef104a2b9c4c444faf0bd /arch/s390/mm/vmem.c
parent	486c0a0bc80d370471b21662bf03f04fbb37cdc6 (diff)