From a0d22f485af1553060b4094ee0154537a8f6a8a6 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Thu, 9 Apr 2009 16:45:29 -0700 Subject: x86: Document get_user_pages_fast() While better than get_user_pages(), the usage of gupf(), especially the return values and the fact that it can potentially only partially pin the range, warranted some documentation. Signed-off-by: Andy Grover Cc: npiggin@suse.de Cc: akpm@linux-foundation.org LKML-Reference: <1239320729-3262-1-git-send-email-andy.grover@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index be54176e9eb..6340cef6798 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -219,6 +219,22 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { -- cgit v1.2.3-70-g09d2 From 43a432b1559798d33970261f710030f787770231 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 9 Apr 2009 14:26:47 -0700 Subject: x86, CPA: Change idmap attribute before ioremap attribute setup Change the identity mapping with the requested attribute first, before we setup the virtual memory mapping with the new requested attribute. This makes sure that there is no window when identity map'ed attribute may disagree with ioremap range on the attribute type. This also avoids doing cpa on the ioremap'ed address twice (first in ioremap_page_range and then in ioremap_change_attr using vaddr), and should improve ioremap performance a bit. 
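In rough outline, the reordered __ioremap_caller() flow becomes the following (a simplified sketch of the diff below, not the literal applied code):

	/*
	 * Sketch: bring the identity map's attribute in line first, then build
	 * the vmalloc-space mapping with the same attribute, so the two never
	 * disagree and the ioremapped range is only cpa'd once.
	 */
	static void __iomem *ioremap_order_sketch(resource_size_t phys_addr,
						  unsigned long size,
						  unsigned long prot_val,
						  pgprot_t prot,
						  struct vm_struct *area)
	{
		unsigned long vaddr = (unsigned long)area->addr;

		/* 1) update the identity mapping's memory type */
		if (kernel_map_sync_memtype(phys_addr, size, prot_val))
			goto err_free;

		/* 2) only then create the ioremap mapping */
		if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
			goto err_free;

		return (void __iomem *)vaddr;

	err_free:
		free_memtype(phys_addr, phys_addr + size);
		free_vm_area(area);
		return NULL;
	}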
Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi LKML-Reference: <20090409212708.373330000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0dfa09d69e8..329387eca12 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -280,15 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { + + if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { free_memtype(phys_addr, phys_addr + size); free_vm_area(area); return NULL; } - if (ioremap_change_attr(vaddr, size, prot_val) < 0) { + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { free_memtype(phys_addr, phys_addr + size); - vunmap(area->addr); + free_vm_area(area); return NULL; } -- cgit v1.2.3-70-g09d2 From a5593e0b329a14dea41ea173380dbf1533de2bd2 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 9 Apr 2009 14:26:48 -0700 Subject: x86, PAT: Change order of cpa and free in set_memory_wb To be free of aliasing due to races, set_memory_* interfaces should follow ordering of reserving, changing memtype to UC/WC, changing memtype back to WB followed by free. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha LKML-Reference: <20090409212708.512280000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index d71e1b636ce..d487eaa17bf 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1021,15 +1021,19 @@ int _set_memory_wb(unsigned long addr, int numpages) int set_memory_wb(unsigned long addr, int numpages) { + int ret = _set_memory_wb(addr, numpages); free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); - - return _set_memory_wb(addr, numpages); + return ret; } EXPORT_SYMBOL(set_memory_wb); int set_memory_array_wb(unsigned long *addr, int addrinarray) { int i; + int ret; + + ret = change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); for (i = 0; i < addrinarray; i++) { unsigned long start = __pa(addr[i]); @@ -1042,8 +1046,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray) } free_memtype(start, end); } - return change_page_attr_clear(addr, addrinarray, - __pgprot(_PAGE_CACHE_MASK), 1); + return ret; } EXPORT_SYMBOL(set_memory_array_wb); -- cgit v1.2.3-70-g09d2 From 9fa3ab390abfc8b49fc0dd7c845b0ad224ec429f Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 9 Apr 2009 14:26:49 -0700 Subject: x86, PAT: Handle faults cleanly in set_memory_ APIs Handle faults and do proper cleanups in set_memory_*() functions. In some cases, these functions were not doing proper free on failure paths. With the changes to tracking memtype of RAM pages in struct page instead of pat list, we do not need the changes in commits c5e147. This patch reverts that change. 
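A hypothetical driver-side caller (illustrative only; the helper names are made up) shows what the cleaned-up error paths buy: a failed set_memory_uc() now rolls back its own memtype reservation, so the caller only undoes its allocation, and set_memory_wb() frees the reservation only once the attribute change has actually succeeded:

	static void *alloc_uc_buffer(int numpages)
	{
		unsigned long addr = __get_free_pages(GFP_KERNEL,
						      get_order(numpages * PAGE_SIZE));

		if (!addr)
			return NULL;

		if (set_memory_uc(addr, numpages)) {
			/* memtype reservation already released on failure */
			free_pages(addr, get_order(numpages * PAGE_SIZE));
			return NULL;
		}
		return (void *)addr;
	}

	static void free_uc_buffer(void *buf, int numpages)
	{
		/* switch back to WB first; the memtype is freed afterwards */
		set_memory_wb((unsigned long)buf, numpages);
		free_pages((unsigned long)buf, get_order(numpages * PAGE_SIZE));
	}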
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha LKML-Reference: <20090409212708.653222000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 113 ++++++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 48 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index d487eaa17bf..985eef80c55 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -945,52 +945,56 @@ int _set_memory_uc(unsigned long addr, int numpages) int set_memory_uc(unsigned long addr, int numpages) { + int ret; + /* * for now UC MINUS. see comments in ioremap_nocache() */ - if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL)) - return -EINVAL; + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL); + if (ret) + goto out_err; + + ret = _set_memory_uc(addr, numpages); + if (ret) + goto out_free; + + return 0; - return _set_memory_uc(addr, numpages); +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; } EXPORT_SYMBOL(set_memory_uc); int set_memory_array_uc(unsigned long *addr, int addrinarray) { - unsigned long start; - unsigned long end; - int i; + int i, j; + int ret; + /* * for now UC MINUS. see comments in ioremap_nocache() */ for (i = 0; i < addrinarray; i++) { - start = __pa(addr[i]); - for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { - if (end != __pa(addr[i + 1])) - break; - i++; - } - if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) - goto out; + ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL); + if (ret) + goto out_free; } - return change_page_attr_set(addr, addrinarray, + ret = change_page_attr_set(addr, addrinarray, __pgprot(_PAGE_CACHE_UC_MINUS), 1); -out: - for (i = 0; i < addrinarray; i++) { - unsigned long tmp = __pa(addr[i]); - - if (tmp == start) - break; - for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { - if (end != __pa(addr[i + 1])) - break; - i++; - } - free_memtype(tmp, end); - } - return -EINVAL; + if (ret) + goto out_free; + + return 0; + +out_free: + for (j = 0; j < i; j++) + free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); + + return ret; } EXPORT_SYMBOL(set_memory_array_uc); @@ -1002,14 +1006,26 @@ int _set_memory_wc(unsigned long addr, int numpages) int set_memory_wc(unsigned long addr, int numpages) { + int ret; + if (!pat_enabled) return set_memory_uc(addr, numpages); - if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_WC, NULL)) - return -EINVAL; + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL); + if (ret) + goto out_err; - return _set_memory_wc(addr, numpages); + ret = _set_memory_wc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; } EXPORT_SYMBOL(set_memory_wc); @@ -1021,9 +1037,14 @@ int _set_memory_wb(unsigned long addr, int numpages) int set_memory_wb(unsigned long addr, int numpages) { - int ret = _set_memory_wb(addr, numpages); + int ret; + + ret = _set_memory_wb(addr, numpages); + if (ret) + return ret; + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); - return ret; + return 0; } EXPORT_SYMBOL(set_memory_wb); @@ -1034,19 +1055,13 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray) ret = change_page_attr_clear(addr, 
addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); + if (ret) + return ret; - for (i = 0; i < addrinarray; i++) { - unsigned long start = __pa(addr[i]); - unsigned long end; + for (i = 0; i < addrinarray; i++) + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); - for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { - if (end != __pa(addr[i + 1])) - break; - i++; - } - free_memtype(start, end); - } - return ret; + return 0; } EXPORT_SYMBOL(set_memory_array_wb); @@ -1139,6 +1154,8 @@ int set_pages_array_wb(struct page **pages, int addrinarray) retval = cpa_clear_pages_array(pages, addrinarray, __pgprot(_PAGE_CACHE_MASK)); + if (retval) + return retval; for (i = 0; i < addrinarray; i++) { start = (unsigned long)page_address(pages[i]); @@ -1146,7 +1163,7 @@ int set_pages_array_wb(struct page **pages, int addrinarray) free_memtype(start, end); } - return retval; + return 0; } EXPORT_SYMBOL(set_pages_array_wb); -- cgit v1.2.3-70-g09d2 From 3869c4aa18835c8c61b44bd0f3ace36e9d3b5bd0 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 9 Apr 2009 14:26:50 -0700 Subject: x86, PAT: Changing memtype to WC ensuring no WB alias As per SDM, there should not be any aliasing of a WC with any cacheable type across CPUs. That is if one CPU is changing the identity map memtype to _WC, no other CPU at the time of this change should not have a TLB for this page that carries a WB attribute. SDM suggests to make the page not present. But for that we will have to handle any page faults that can potentially happen due to these pages being not present. Other way to deal with this without having any WB mapping is to change the page first to UC and then to WC. This ensures that we meet the SDM requirement of no cacheable alais to WC page. This also has same or lower overhead than marking the page not present and making it present later. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha LKML-Reference: <20090409212708.797481000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 985eef80c55..797f9f107cb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1000,8 +1000,15 @@ EXPORT_SYMBOL(set_memory_array_uc); int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(&addr, numpages, + int ret; + ret = change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); + + if (!ret) { + ret = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_CACHE_WC), 0); + } + return ret; } int set_memory_wc(unsigned long addr, int numpages) -- cgit v1.2.3-70-g09d2 From b6ff32d9aaeeeecf98f9a852d715569183585312 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 9 Apr 2009 14:26:51 -0700 Subject: x86, PAT: Consolidate code in pat_x_mtrr_type() and reserve_memtype() Fix pat_x_mtrr_type() to use UC_MINUS when the mtrr type return UC. This is to be consistent with ioremap() and ioremap_nocache() which uses UC_MINUS. Consolidate the code such that reserve_memtype() also uses pat_x_mtrr_type() when the caller doesn't specify any special attribute (non WB attribute). 
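Condensed, the reconciliation logic reads as below (a restatement of the new pat_x_mtrr_type() in the diff that follows, kept only as a sketch): an explicit UC_MINUS or WC request passes through untouched, while a plain WB request is honoured only where the MTRRs actually allow write-back:

	static unsigned long effective_type_sketch(u64 start, u64 end,
						   unsigned long req_type)
	{
		if (req_type == _PAGE_CACHE_WB &&
		    mtrr_type_lookup(start, end) != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_UC_MINUS;	/* MTRR forbids WB here */

		return req_type;			/* honour the request */
	}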
Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi LKML-Reference: <20090409212708.939936000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 3 ++- arch/x86/mm/pat.c | 35 +++++++++++++---------------------- 2 files changed, 15 insertions(+), 23 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 329387eca12..d4c4b2c4dbb 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -375,7 +375,8 @@ static void __iomem *ioremap_default(resource_size_t phys_addr, * - UC_MINUS for non-WB-able memory with no other conflicting mappings * - Inherit from confliting mappings otherwise */ - err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags); + err = reserve_memtype(phys_addr, phys_addr + size, + _PAGE_CACHE_WB, &flags); if (err < 0) return NULL; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 640339ee4fb..8d3de958050 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -182,10 +182,10 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) u8 mtrr_type; mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == MTRR_TYPE_UNCACHABLE) - return _PAGE_CACHE_UC; - if (mtrr_type == MTRR_TYPE_WRCOMB) - return _PAGE_CACHE_WC; + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_UC_MINUS; + + return _PAGE_CACHE_WB; } return req_type; @@ -352,23 +352,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return 0; } - if (req_type == -1) { - /* - * Call mtrr_lookup to get the type hint. This is an - * optimization for /dev/mem mmap'ers into WB memory (BIOS - * tools and ACPI tools). Use WB request for WB memory and use - * UC_MINUS otherwise. - */ - u8 mtrr_type = mtrr_type_lookup(start, end); - - if (mtrr_type == MTRR_TYPE_WRBACK) - actual_type = _PAGE_CACHE_WB; - else - actual_type = _PAGE_CACHE_UC_MINUS; - } else { - actual_type = pat_x_mtrr_type(start, end, - req_type & _PAGE_CACHE_MASK); - } + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. + */ + actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); if (new_type) *new_type = actual_type; @@ -587,7 +577,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (flags != -1) { retval = reserve_memtype(offset, offset + size, flags, NULL); } else { - retval = reserve_memtype(offset, offset + size, -1, &flags); + retval = reserve_memtype(offset, offset + size, + _PAGE_CACHE_WB, &flags); } if (retval < 0) -- cgit v1.2.3-70-g09d2 From 0c3c8a18361a636069f5a5d9d0d0f9c2124e6b94 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 9 Apr 2009 14:26:52 -0700 Subject: x86, PAT: Remove duplicate memtype reserve in devmem mmap /dev/mem mmap code was doing memtype reserve/free for a while now. Recently we added memtype tracking in remap_pfn_range, and /dev/mem mmap uses it indirectly. So, we don't need seperate tracking in /dev/mem code any more. That means another ~100 lines of code removed :-). 
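The net effect on a /dev/mem style mmap handler, as a simplified sketch (not the literal drivers/char/mem.c code): remap_pfn_range() reserves the memtype on setup and frees it on vma teardown via the track_pfn_vma_new()/untrack_pfn_vma() hooks, so .open/.close callbacks that existed purely for tracking can be dropped:

	static int phys_mmap_sketch(struct file *file, struct vm_area_struct *vma)
	{
		size_t size = vma->vm_end - vma->vm_start;

		/* memtype tracking happens inside remap_pfn_range(); there is
		 * nothing to undo here on failure and nothing to free on close */
		if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
				    size, vma->vm_page_prot))
			return -EAGAIN;

		return 0;
	}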
Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi LKML-Reference: <20090409212709.085210000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pat.h | 4 ---- arch/x86/mm/pat.c | 60 ++-------------------------------------------- drivers/char/mem.c | 27 --------------------- 3 files changed, 2 insertions(+), 89 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 2cd07b9422f..7af14e512f9 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -18,9 +18,5 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); -extern void map_devmem(unsigned long pfn, unsigned long size, - struct pgprot vma_prot); -extern void unmap_devmem(unsigned long pfn, unsigned long size, - struct pgprot vma_prot); #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 8d3de958050..cc5e0e24e44 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -536,9 +536,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { - u64 offset = ((u64) pfn) << PAGE_SHIFT; - unsigned long flags = -1; - int retval; + unsigned long flags = _PAGE_CACHE_WB; if (!range_is_allowed(pfn, size)) return 0; @@ -566,65 +564,11 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, } #endif - /* - * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. - * - * Without O_SYNC, we want to get - * - WB for WB-able memory and no other conflicting mappings - * - UC_MINUS for non-WB-able memory with no other conflicting mappings - * - Inherit from confliting mappings otherwise - */ - if (flags != -1) { - retval = reserve_memtype(offset, offset + size, flags, NULL); - } else { - retval = reserve_memtype(offset, offset + size, - _PAGE_CACHE_WB, &flags); - } - - if (retval < 0) - return 0; - - if (((pfn < max_low_pfn_mapped) || - (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && - ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { - free_memtype(offset, offset + size); - printk(KERN_INFO - "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", - current->comm, current->pid, - cattr_name(flags), - offset, (unsigned long long)(offset + size)); - return 0; - } - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | flags); return 1; } -void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) -{ - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); - u64 addr = (u64)pfn << PAGE_SHIFT; - unsigned long flags; - - reserve_memtype(addr, addr + size, want_flags, &flags); - if (flags != want_flags) { - printk(KERN_INFO - "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", - current->comm, current->pid, - cattr_name(want_flags), - addr, (unsigned long long)(addr + size), - cattr_name(flags)); - } -} - -void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) -{ - u64 addr = (u64)pfn << PAGE_SHIFT; - - free_memtype(addr, addr + size); -} - /* * Change the memory type for the physial address range in kernel identity * mapping space if that range is a part of identity map. 
@@ -662,8 +606,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, { int is_ram = 0; int ret; - unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + unsigned long flags = want_flags; is_ram = pat_pagerange_is_ram(paddr, paddr + size); diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 3586b3b3df3..8f05c38c2f0 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -301,33 +301,7 @@ static inline int private_mapping_ok(struct vm_area_struct *vma) } #endif -void __attribute__((weak)) -map_devmem(unsigned long pfn, unsigned long len, pgprot_t prot) -{ - /* nothing. architectures can override. */ -} - -void __attribute__((weak)) -unmap_devmem(unsigned long pfn, unsigned long len, pgprot_t prot) -{ - /* nothing. architectures can override. */ -} - -static void mmap_mem_open(struct vm_area_struct *vma) -{ - map_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - -static void mmap_mem_close(struct vm_area_struct *vma) -{ - unmap_devmem(vma->vm_pgoff, vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - static struct vm_operations_struct mmap_mem_ops = { - .open = mmap_mem_open, - .close = mmap_mem_close, #ifdef CONFIG_HAVE_IOREMAP_PROT .access = generic_access_phys #endif @@ -362,7 +336,6 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) vma->vm_pgoff, size, vma->vm_page_prot)) { - unmap_devmem(vma->vm_pgoff, size, vma->vm_page_prot); return -EAGAIN; } return 0; -- cgit v1.2.3-70-g09d2 From 9b987aeb4a7bc42a3eb8361030b820b0263c31f1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Apr 2009 10:55:33 -0700 Subject: x86: fix set_fixmap to use phys_addr_t Impact: fix kprobes crash on 32-bit with RAM above 4G Use phys_addr_t for receiving a physical address argument instead of unsigned long. This allows fixmap to handle pages higher than 4GB on x86-32. 
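The failure mode being fixed is plain truncation. Illustrative values, assuming a 32-bit PAE kernel where phys_addr_t is 64 bits wide and unsigned long is only 32:

	phys_addr_t phys = 0x140000000ULL;	/* a page at 5GB, reachable with PAE */
	unsigned long old_arg = phys;		/* silently becomes 0x40000000 (1GB) */

	/* With the prototypes below taking phys_addr_t end to end, set_fixmap()
	 * and early_ioremap() receive the full address and map the correct page. */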
Signed-off-by: Masami Hiramatsu Acked-by: Mathieu Desnoyers Cc: Andrew Morton Cc: Ananth N Mavinakayanahalli Cc: systemtap-ml Cc: Gary Hade Cc: Linus Torvalds LKML-Reference: <49DE3695.6040800@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 4 ++-- arch/x86/include/asm/io.h | 6 ++++-- arch/x86/include/asm/paravirt.h | 4 ++-- arch/x86/mm/ioremap.c | 23 +++++++++++++---------- arch/x86/mm/pgtable.c | 3 ++- arch/x86/xen/mmu.c | 2 +- 6 files changed, 24 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 81937a5dc77..2d81af3974a 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -151,11 +151,11 @@ extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); void native_set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); + phys_addr_t phys, pgprot_t flags); #ifndef CONFIG_PARAVIRT static inline void __set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t flags) + phys_addr_t phys, pgprot_t flags) { native_set_fixmap(idx, phys, flags); } diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e5383e3d2f8..73739322b6d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -193,8 +193,10 @@ extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); */ extern void early_ioremap_init(void); extern void early_ioremap_reset(void); -extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); -extern void __iomem *early_memremap(unsigned long offset, unsigned long size); +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); #define IO_SPACE_LIMIT 0xffff diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7727aa8b7dd..378e3691c08 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -347,7 +347,7 @@ struct pv_mmu_ops { /* Sometimes the physical address is a pfn, and sometimes its an mfn. We can tell which is which from the index. 
*/ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, - unsigned long phys, pgprot_t flags); + phys_addr_t phys, pgprot_t flags); }; struct raw_spinlock; @@ -1432,7 +1432,7 @@ static inline void arch_leave_lazy_mmu_mode(void) void arch_flush_lazy_mmu_mode(void); static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, - unsigned long phys, pgprot_t flags) + phys_addr_t phys, pgprot_t flags) { pv_mmu_ops.set_fixmap(idx, phys, flags); } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0dfa09d69e8..09daebfdb11 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -547,7 +547,7 @@ void __init early_ioremap_reset(void) } static void __init __early_set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t flags) + phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *pte; @@ -566,7 +566,7 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, } static inline void __init early_set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t prot) + phys_addr_t phys, pgprot_t prot) { if (after_paging_init) __set_fixmap(idx, phys, prot); @@ -607,9 +607,10 @@ static int __init check_early_ioremap_leak(void) late_initcall(check_early_ioremap_leak); static void __init __iomem * -__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) { - unsigned long offset, last_addr; + unsigned long offset; + resource_size_t last_addr; unsigned int nrpages; enum fixed_addresses idx0, idx; int i, slot; @@ -625,15 +626,15 @@ __early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) } if (slot < 0) { - printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", - phys_addr, size); + printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", + (u64)phys_addr, size); WARN_ON(1); return NULL; } if (early_ioremap_debug) { - printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", - phys_addr, size, slot); + printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", + (u64)phys_addr, size, slot); dump_stack(); } @@ -680,13 +681,15 @@ __early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) } /* Remap an IO device */ -void __init __iomem *early_ioremap(unsigned long phys_addr, unsigned long size) +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) { return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); } /* Remap memory */ -void __init __iomem *early_memremap(unsigned long phys_addr, unsigned long size) +void __init __iomem * +early_memremap(resource_size_t phys_addr, unsigned long size) { return __early_ioremap(phys_addr, size, PAGE_KERNEL); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5b7c7c8464f..7aa03a5389f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -345,7 +345,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) fixmaps_set++; } -void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, + pgprot_t flags) { __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index db3802fb7b8..2a81838a9ab 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1750,7 +1750,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, } #endif /* CONFIG_X86_64 */ -static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) 
+static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { pte_t pte; -- cgit v1.2.3-70-g09d2 From 1ee4bd92a7aa49eb66c8d5672e837090d3e7b7ff Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Fri, 10 Apr 2009 22:47:17 +0200 Subject: x86: fix wrong section of pat_disable & make it static pat_disable cannot be __cpuinit anymore because it's called from pat_init and the callchain looks like this: pat_disable [cpuinit] <- pat_init <- generic_set_all <- ipi_handler <- set_mtrr <- (other non init/cpuinit functions) WARNING: arch/x86/mm/built-in.o(.text+0x449e): Section mismatch in reference from the function pat_init() to the function .cpuinit.text:pat_disable() The function pat_init() references the function __cpuinit pat_disable(). This is often because pat_init lacks a __cpuinit annotation or the annotation of pat_disable is wrong. Non CONFIG_X86_PAT version of pat_disable is static inline, so this version can be static too (and there are no callers outside of this file). Signed-off-by: Marcin Slusarz Acked-by: Sam Ravnborg LKML-Reference: <49DFB055.6070405@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 640339ee4fb..c009a241d56 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -31,7 +31,7 @@ #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; -void __cpuinit pat_disable(const char *reason) +static inline void pat_disable(const char *reason) { pat_enabled = 0; printk(KERN_INFO "%s\n", reason); -- cgit v1.2.3-70-g09d2 From 4b065046273afa01ec8e3de7da407e8d3599251d Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Wed, 8 Apr 2009 15:37:16 -0700 Subject: x86, PAT: Remove page granularity tracking for vm_insert_pfn maps This change resolves the problem of too many single page entries in pat_memtype_list and "freeing invalid memtype" errors with i915, reported here: http://marc.info/?l=linux-kernel&m=123845244713183&w=2 Remove page level granularity track and untrack of vm_insert_pfn. memtype tracking at page granularity does not scale and cleaner approach would be for the driver to request a type for a bigger IO address range or PCI io memory range for that device, either at mmap time or driver init time and just use that type during vm_insert_pfn. This patch just removes the track/untrack of vm_insert_pfn. That means we will be in same state as 2.6.28, with respect to these APIs. Newer APIs for the drivers to request a memtype for a bigger region is coming soon. [ Impact: fix Xorg startup warnings and hangs ] Reported-by: Arkadiusz Miskiewicz Tested-by: Arkadiusz Miskiewicz Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Jesse Barnes LKML-Reference: <20090408223716.GC3493@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 98 +++++++++++-------------------------------------------- 1 file changed, 19 insertions(+), 79 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index cc5e0e24e44..41c80571815 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -669,29 +669,28 @@ static void free_pfn_range(u64 paddr, unsigned long size) * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. - * Otherwise, we reserve the entire vma range, my ging through the PTEs page - * by page to get physical address and protection. 
*/ int track_pfn_vma_copy(struct vm_area_struct *vma) { - int retval = 0; - unsigned long i, j; resource_size_t paddr; unsigned long prot; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; if (!pat_enabled) return 0; + /* + * For now, only handle remap_pfn_range() vmas where + * is_linear_pfn_mapping() == TRUE. Handling of + * vm_insert_pfn() is TBD. + */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. */ - if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { WARN_ON_ONCE(1); return -EINVAL; } @@ -699,28 +698,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } - /* reserve entire vma page by page, using pfn and prot from pte */ - for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) - continue; - - pgprot = __pgprot(prot); - retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1); - if (retval) - goto cleanup_ret; - } return 0; - -cleanup_ret: - /* Reserve error: Cleanup partial reservation and return error */ - for (j = 0; j < i; j += PAGE_SIZE) { - if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) - continue; - - free_pfn_range(paddr, PAGE_SIZE); - } - - return retval; } /* @@ -730,50 +708,28 @@ cleanup_ret: * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. - * Otherwise, we look t the pfn and size and reserve only the specified range - * page by page. - * - * Note that this function can be called with caller trying to map only a - * subrange/page inside the vma. */ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { - int retval = 0; - unsigned long i, j; - resource_size_t base_paddr; resource_size_t paddr; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + unsigned long vma_size = vma->vm_end - vma->vm_start; if (!pat_enabled) return 0; + /* + * For now, only handle remap_pfn_range() vmas where + * is_linear_pfn_mapping() == TRUE. Handling of + * vm_insert_pfn() is TBD. 
+ */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } - /* reserve page by page using pfn and size */ - base_paddr = (resource_size_t)pfn << PAGE_SHIFT; - for (i = 0; i < size; i += PAGE_SIZE) { - paddr = base_paddr + i; - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0); - if (retval) - goto cleanup_ret; - } return 0; - -cleanup_ret: - /* Reserve error: Cleanup partial reservation and return error */ - for (j = 0; j < i; j += PAGE_SIZE) { - paddr = base_paddr + j; - free_pfn_range(paddr, PAGE_SIZE); - } - - return retval; } /* @@ -784,39 +740,23 @@ cleanup_ret: void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { - unsigned long i; resource_size_t paddr; - unsigned long prot; - unsigned long vma_start = vma->vm_start; - unsigned long vma_end = vma->vm_end; - unsigned long vma_size = vma_end - vma_start; + unsigned long vma_size = vma->vm_end - vma->vm_start; if (!pat_enabled) return; + /* + * For now, only handle remap_pfn_range() vmas where + * is_linear_pfn_mapping() == TRUE. Handling of + * vm_insert_pfn() is TBD. + */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; free_pfn_range(paddr, vma_size); return; } - - if (size != 0 && size != vma_size) { - /* free page by page, using pfn and size */ - paddr = (resource_size_t)pfn << PAGE_SHIFT; - for (i = 0; i < size; i += PAGE_SIZE) { - paddr = paddr + i; - free_pfn_range(paddr, PAGE_SIZE); - } - } else { - /* free entire vma, page by page, using the pfn from pte */ - for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) - continue; - - free_pfn_range(paddr, PAGE_SIZE); - } - } } pgprot_t pgprot_writecombine(pgprot_t prot) -- cgit v1.2.3-70-g09d2 From dc098551918093901d8ac8936e9d1a1b891b56ed Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 17 Apr 2009 09:22:42 -0500 Subject: x86/uv: fix init of memory-less nodes Add support for nodes that have cpus but no memory. The current code was failing to add these nodes to the nodes_present_map. v2: Fixes case caught by David Rientjes - missed support for the x2apic SRAT table. [ Impact: fix potential boot crash on memory-less UV nodes. 
] Reported-by: David Rientjes Signed-off-by: Jack Steiner LKML-Reference: <20090417142242.GA23743@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index c7d272b8574..33c5fa57e43 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -28,6 +28,7 @@ int acpi_numa __initdata; static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; +static nodemask_t cpu_nodes_parsed __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES]; static int found_add_area __initdata; @@ -141,6 +142,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) apic_id = pa->apic_id; apicid_to_node[apic_id] = node; + node_set(node, cpu_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", pxm, apic_id, node); @@ -174,6 +176,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) else apic_id = pa->apic_id; apicid_to_node[apic_id] = node; + node_set(node, cpu_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", pxm, apic_id, node); @@ -402,7 +405,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - node_possible_map = nodes_parsed; + /* Account for nodes with cpus and no memory */ + nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); /* Finally register nodes */ for_each_node_mask(i, node_possible_map) -- cgit v1.2.3-70-g09d2 From a81b6314e0aa480b8ac6dd02779d44cd0bee0a34 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 17 Apr 2009 23:31:20 +0530 Subject: x86: mm/numa_32.c calculate_numa_remap_pages should use __init calculate_numa_remap_pages() is called only by __init initmem_init() further calculate_numa_remap_pages is calling: __init find_e820_area() and __init reserve_early() So calculate_numa_remap_pages() should be __init calculate_numa_remap_pages(). WARNING: arch/x86/built-in.o(.text+0x82ea3): Section mismatch in reference from the function calculate_numa_remap_pages() to the function .init.text:find_e820_area() The function calculate_numa_remap_pages() references the function __init find_e820_area(). This is often because calculate_numa_remap_pages lacks a __init annotation or the annotation of find_e820_area is wrong. WARNING: arch/x86/built-in.o(.text+0x82f5f): Section mismatch in reference from the function calculate_numa_remap_pages() to the function .init.text:reserve_early() The function calculate_numa_remap_pages() references the function __init reserve_early(). This is often because calculate_numa_remap_pages lacks a __init annotation or the annotation of reserve_early is wrong. 
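The rule being applied, with a hypothetical helper for illustration: code that is only reachable during boot and calls __init functions should itself be __init, which both silences modpost and lets the kernel discard the text after init:

	static __init void example_early_reserve(void)
	{
		/* calling __init helpers is legitimate only from __init code;
		 * the -1ULL failure value of find_e820_area() is assumed here */
		u64 addr = find_e820_area(0, (u64)max_pfn << PAGE_SHIFT,
					  PAGE_SIZE, PAGE_SIZE);

		if (addr != -1ULL)
			reserve_early(addr, addr + PAGE_SIZE, "EXAMPLE");
	}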
[ Impact: save memory, address Section mismatch warning ] Signed-off-by: Jaswinder Singh Rajput Cc: Sam Ravnborg LKML-Reference: <1239991281.3153.4.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 3daefa04ace..d2530062fe0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -257,7 +257,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static unsigned long calculate_numa_remap_pages(void) +static __init unsigned long calculate_numa_remap_pages(void) { int nid; unsigned long size, reserve_pages = 0; -- cgit v1.2.3-70-g09d2 From 4c31e92b97b6d7e7b19ee5e54a22571ffdebb305 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 22 Apr 2009 14:19:27 -0700 Subject: x86: check boundary in setup_node_bootmem() Commit dc09855 ("x86/uv: fix init of memory-less nodes") causes a two sockets system (where node-1 doesn't have RAM installed) to crash. That commit makes node_possible include cpu nodes that do not have memory. So check boundary in setup_node_bootmem(). [ Impact: fix boot crash on RAM-less NUMA node system ] Signed-off-by: Yinghai Lu Cc: Jack Steiner LKML-Reference: <49EF89DF.9090404@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index d73aaa89237..2d05a12029d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -188,6 +188,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; + if (!end) + return; + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, -- cgit v1.2.3-70-g09d2
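For context, a sketch reconstructed from the two SRAT/NUMA patches above (not new code): acpi_scan_nodes() now registers every node in node_possible_map, including cpu-only nodes whose bootnode range is empty, which is exactly the case the new check catches:

	/* after the UV fix, node_possible_map also contains cpu-only nodes */
	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);

	for_each_node_mask(i, node_possible_map)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	/* a memory-less node has a zeroed bootnode, so setup_node_bootmem()
	 * sees end == 0 and now returns early instead of placing bootmem on
	 * an empty range */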