From 822c18f2e38cbc775792ab65ace4f9198678dec9 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Thu, 15 Jan 2009 13:50:48 -0800 Subject: alpha: fix vmalloc breakage On alpha, we have to map some stuff in the VMALLOC space very early in the boot process (to make SRM console callbacks work and so on, see arch/alpha/mm/init.c). For old VM allocator, we just manually placed a vm_struct onto the global vmlist and this worked for ages. Unfortunately, the new allocator isn't aware of this, so it constantly tries to allocate the VM space which is already in use, making vmalloc on alpha defunct. This patch forces KVA to import vmlist entries on init. [akpm@linux-foundation.org: remove unneeded check (per Johannes)] Signed-off-by: Ivan Kokshaysky Cc: Nick Piggin Cc: Johannes Weiner Cc: Richard Henderson Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c5db9a7264d..7e00b280648 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -984,6 +985,8 @@ EXPORT_SYMBOL(vm_map_ram); void __init vmalloc_init(void) { + struct vmap_area *va; + struct vm_struct *tmp; int i; for_each_possible_cpu(i) { @@ -996,6 +999,14 @@ void __init vmalloc_init(void) vbq->nr_dirty = 0; } + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = alloc_bootmem(sizeof(struct vmap_area)); + va->flags = tmp->flags | VM_VM_AREA; + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + __insert_vmap_area(va); + } vmap_initialized = true; } -- cgit v1.2.3-70-g09d2 From 46666d8ac42893f90edde7e57a11bc8749d7e89c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 15 Jan 2009 13:51:15 -0800 Subject: revert "mm: vmalloc use mutex for purge" Revert commit e97a630eb0f5b8b380fd67504de6cedebb489003 ("mm: vmalloc use mutex for purge") Bryan Donlan reports: : After testing 2.6.29-rc1 on xen-x86 with a btrfs root filesystem, I : got the OOPS quoted below and a hard freeze shortly after boot. : Boot messages and config are attached. : : ------------[ cut here ]------------ : Kernel BUG at c05ef80d [verbose debug info unavailable] : invalid opcode: 0000 [#1] SMP : last sysfs file: /sys/block/xvdc/size : Modules linked in: : : Pid: 0, comm: swapper Not tainted (2.6.29-rc1 #6) : EIP: 0061:[] EFLAGS: 00010087 CPU: 2 : EIP is at schedule+0x7cd/0x950 : EAX: d5aeca80 EBX: 00000002 ECX: 00000000 EDX: d4cb9a40 : ESI: c12f5600 EDI: d4cb9a40 EBP: d6033fa4 ESP: d6033ef4 : DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0069 : Process swapper (pid: 0, ti=d6032000 task=d6020b70 task.ti=d6032000) : Stack: : 000d85bc 00000000 000186a0 00000000 0dd11410 c0105417 c12efe00 0dc367c3 : 00000011 c0105d46 d5a5d310 deadbeef d4cb9a40 c07cc600 c05f1340 c12e0060 : deadbeef d6020b70 d6020d08 00000002 c014377d 00000000 c12f5600 00002c22 : Call Trace: : [] xen_force_evtchn_callback+0x17/0x30 : [] check_events+0x8/0x12 : [] _spin_unlock_irqrestore+0x20/0x40 : [] hrtimer_start_range_ns+0x12d/0x2e0 : [] tick_nohz_restart_sched_tick+0x146/0x160 : [] cpu_idle+0xa5/0xc0 and bisected it to this commit. Let's remove it now while we have a think about the problem. Reported-by: Bryan Donlan Tested-by: Christophe Saout Cc: Nick Piggin Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7e00b280648..75f49d312e8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -496,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { - static DEFINE_MUTEX(purge_lock); + static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; int nr = 0; @@ -507,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, * the case that isn't actually used at the moment anyway. */ if (!sync && !force_flush) { - if (!mutex_trylock(&purge_lock)) + if (!spin_trylock(&purge_lock)) return; } else - mutex_lock(&purge_lock); + spin_lock(&purge_lock); rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { @@ -542,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, __free_vmap_area(va); spin_unlock(&vmap_area_lock); } - mutex_unlock(&purge_lock); + spin_unlock(&purge_lock); } /* -- cgit v1.2.3-70-g09d2 From c296861291669f305deef19b78042330d7135017 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 18 Feb 2009 14:48:12 -0800 Subject: vmalloc: add __get_vm_area_caller() We have get_vm_area_caller() and __get_vm_area() but not __get_vm_area_caller() On powerpc, I use __get_vm_area() to separate the ranges of addresses given to vmalloc vs. ioremap (various good reasons for that) so in order to be able to implement the new caller tracking in /proc/vmallocinfo, I need a "_caller" variant of it. (akpm: needed for ongoing powerpc development, so merge it early) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Benjamin Herrenschmidt Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 4 ++++ mm/vmalloc.c | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 506e7620a98..9c0890c7a06 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -84,6 +84,10 @@ extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end); +extern struct vm_struct *__get_vm_area_caller(unsigned long size, + unsigned long flags, + unsigned long start, unsigned long end, + void *caller); extern struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8..4dd2636d0b9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1106,6 +1106,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, } EXPORT_SYMBOL_GPL(__get_vm_area); +struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + void *caller) +{ + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + caller); +} + /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area -- cgit v1.2.3-70-g09d2 From 734269521e320ad14ed39ae9b64d482b9028dcd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:07 +0900 Subject: vmalloc: call flush_cache_vunmap() from unmap_kernel_range() Impact: proper vcache flush on unmap_kernel_range() flush_cache_vunmap() should be called before pages are unmapped. Add a call to it in unmap_kernel_range(). Signed-off-by: Tejun Heo --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8..c37924a2ee3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1012,6 +1012,8 @@ void __init vmalloc_init(void) void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } -- cgit v1.2.3-70-g09d2 From f0aa6617903648077dffe5cfcf7c4458f4610fa7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: implement vm_area_register_early() Impact: allow multiple early vm areas There are places where kernel VM area needs to be allocated before vmalloc is initialized. This is done by allocating static vm_struct, initializing several fields and linking it to vmlist and later vmalloc initialization picking up these from vmlist. This is currently done manually and if there's more than one such areas, there's no defined way to arbitrate who gets which address. This patch implements vm_area_register_early(), which takes vm_area struct with flags and size initialized, assigns address to it and puts it on the vmlist. This way, multiple early vm areas can determine which addresses they should use. The only current user - alpha mm init - is converted to use it. Signed-off-by: Tejun Heo --- arch/alpha/mm/init.c | 20 +++++++++++++------- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 7 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 5d7a16eab31..df6df025ded 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -189,9 +189,21 @@ callback_init(void * kernel_end) if (alpha_using_srm) { static struct vm_struct console_remap_vm; - unsigned long vaddr = VMALLOC_START; + unsigned long nr_pages = 0; + unsigned long vaddr; unsigned long i, j; + /* calculate needed size */ + for (i = 0; i < crb->map_entries; ++i) + nr_pages += crb->map[i].count; + + /* register the vm area */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.size = nr_pages << PAGE_SHIFT; + vm_area_register_early(&console_remap_vm); + + vaddr = (unsigned long)consle_remap_vm.addr; + /* Set up the third level PTEs and update the virtual addresses of the CRB entries. */ for (i = 0; i < crb->map_entries; ++i) { @@ -213,12 +225,6 @@ callback_init(void * kernel_end) vaddr += PAGE_SIZE; } } - - /* Let vmalloc know that we've allocated some space. */ - console_remap_vm.flags = VM_ALLOC; - console_remap_vm.addr = (void *) VMALLOC_START; - console_remap_vm.size = vaddr - VMALLOC_START; - vmlist = &console_remap_vm; } callback_init_done = 1; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 506e7620a98..bbc05139229 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,5 +106,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; +extern __init void vm_area_register_early(struct vm_struct *vm); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c37924a2ee3..d206261ad9e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -982,6 +983,29 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @size: size of area to register + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm) +{ + static size_t vm_init_off __initdata; + + vm->addr = (void *)VMALLOC_START + vm_init_off; + vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + + vm->next = vmlist; + vmlist = vm; +} + void __init vmalloc_init(void) { struct vmap_area *va; -- cgit v1.2.3-70-g09d2 From 8fc48985006da4ceba24508db64ec77fc0dfe3bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: vmalloc: add un/map_kernel_range_noflush() Impact: two more public map/unmap functions Implement map_kernel_range_noflush() and unmap_kernel_range_noflush(). These functions respectively map and unmap address range in kernel VM area but doesn't do any vcache or tlb flushing. These will be used by new percpu allocator. Signed-off-by: Tejun Heo Cc: Nick Piggin --- include/linux/vmalloc.h | 3 +++ mm/vmalloc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 3 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index bbc05139229..599ba798431 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -91,6 +91,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); +extern int map_kernel_range_noflush(unsigned long start, unsigned long size, + pgprot_t prot, struct page **pages); +extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); /* Allocate/destroy a 'vmalloc' VM area. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d206261ad9e..224eca9650a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -153,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -170,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -1033,6 +1042,58 @@ void __init vmalloc_init(void) vmap_initialized = true; } +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-mapped areas + * before calling this function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. + */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; -- cgit v1.2.3-70-g09d2 From f6fcba7014f9cc535fa75ef98c008b24e49e2212 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 15:38:48 -0800 Subject: vmalloc: call flush_cache_vunmap() from unmap_kernel_range() Impact: proper vcache flush on unmap_kernel_range() flush_cache_vunmap() should be called before pages are unmapped. Add a call to it in unmap_kernel_range(). Signed-off-by: Tejun Heo Acked-by: Nick Piggin Acked-by: David S. Miller Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4dd2636d0b9..903cad46e79 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1012,6 +1012,8 @@ void __init vmalloc_init(void) void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } -- cgit v1.2.3-70-g09d2 From c0c0a29379b5848aec2e8f1c58d853d3cb7118b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: vmalloc: add @align to vm_area_register_early() Impact: allow larger alignment for early vmalloc area allocation Some early vmalloc users might want larger alignment, for example, for custom large page mapping. Add @align to vm_area_register_early(). While at it, drop docbook comment on non-existent @size. Signed-off-by: Tejun Heo Cc: Nick Piggin Cc: Ivan Kokshaysky --- arch/alpha/mm/init.c | 2 +- include/linux/vmalloc.h | 2 +- mm/percpu.c | 2 +- mm/vmalloc.c | 11 +++++++---- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index df6df025ded..91eddd8505d 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -200,7 +200,7 @@ callback_init(void * kernel_end) /* register the vm area */ console_remap_vm.flags = VM_ALLOC; console_remap_vm.size = nr_pages << PAGE_SHIFT; - vm_area_register_early(&console_remap_vm); + vm_area_register_early(&console_remap_vm, PAGE_SIZE); vaddr = (unsigned long)consle_remap_vm.addr; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 599ba798431..2f6994fdf0e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -109,6 +109,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; -extern __init void vm_area_register_early(struct vm_struct *vm); +extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/percpu.c b/mm/percpu.c index ed92caa2aa3..41e7a5f5ab1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -860,7 +860,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, /* init and register vm area */ static_vm.flags = VM_ALLOC; static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm); + vm_area_register_early(&static_vm, PAGE_SIZE); /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 224eca9650a..366ae9ea6af 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -995,7 +995,7 @@ EXPORT_SYMBOL(vm_map_ram); /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register - * @size: size of area to register + * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain @@ -1004,12 +1004,15 @@ EXPORT_SYMBOL(vm_map_ram); * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ -void __init vm_area_register_early(struct vm_struct *vm) +void __init vm_area_register_early(struct vm_struct *vm, size_t align) { static size_t vm_init_off __initdata; + unsigned long addr; + + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; - vm->addr = (void *)VMALLOC_START + vm_init_off; - vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + vm->addr = (void *)addr; vm->next = vmlist; vmlist = vm; -- cgit v1.2.3-70-g09d2 From 34754b69a6f87aa6aa2860525a82f12532f83afd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 16:04:03 +0100 Subject: x86: make vmap yell louder when it is used under irqs_disabled() Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 6 +++--- mm/vmalloc.c | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a84ac7b570e..6907b8e85d5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -498,12 +498,12 @@ void *text_poke_early(void *addr, const void *opcode, size_t len) */ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) { - unsigned long flags; char *vaddr; int nr_pages = 2; struct page *pages[2]; int i; + might_sleep(); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); @@ -517,9 +517,9 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) nr_pages = 1; vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); BUG_ON(!vaddr); - local_irq_save(flags); + local_irq_disable(); memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_restore(flags); + local_irq_enable(); vunmap(vaddr); sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4dd2636d0b9..f83a70167b9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1257,6 +1257,7 @@ EXPORT_SYMBOL(vfree); void vunmap(const void *addr) { BUG_ON(in_interrupt()); + might_sleep(); __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); @@ -1276,6 +1277,8 @@ void *vmap(struct page **pages, unsigned int count, { struct vm_struct *area; + might_sleep(); + if (count > num_physpages) return NULL; -- cgit v1.2.3-70-g09d2 From 7766970cc13e9071b356b1f2a48a9eb8675bfcce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 27 Feb 2009 14:03:03 -0800 Subject: mm: vmap fix overflow The new vmap allocator can wrap the address and get confused in the case of large allocations or VMALLOC_END near the end of address space. Problem reported by Christoph Hellwig on a 32-bit XFS workload. Signed-off-by: Nick Piggin Reported-by: Christoph Hellwig Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 903cad46e79..ed3705e4b83 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -323,6 +323,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long addr; int purged = 0; + BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); va = kmalloc_node(sizeof(struct vmap_area), @@ -334,6 +335,9 @@ retry: addr = ALIGN(vstart, align); spin_lock(&vmap_area_lock); + if (addr + size - 1 < addr) + goto overflow; + /* XXX: could have a last_hole cache */ n = vmap_area_root.rb_node; if (n) { @@ -365,6 +369,8 @@ retry: while (addr + size > first->va_start && addr + size <= vend) { addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr + size - 1 < addr) + goto overflow; n = rb_next(&first->rb_node); if (n) @@ -375,6 +381,7 @@ retry: } found: if (addr + size > vend) { +overflow: spin_unlock(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); -- cgit v1.2.3-70-g09d2 From cbb766766f3f2f6d9326c561b1020590642c6e39 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Feb 2009 14:03:04 -0800 Subject: mm: fix lazy vmap purging (use-after-free error) I just got this new warning from kmemcheck: WARNING: kmemcheck: Caught 32-bit read from freed memory (c7806a60) a06a80c7ecde70c1a04080c700000000a06709c1000000000000000000000000 f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f ^ Pid: 0, comm: swapper Not tainted (2.6.29-rc4 #230) EIP: 0060:[] EFLAGS: 00000286 CPU: 0 EIP is at __purge_vmap_area_lazy+0x117/0x140 EAX: 00070f43 EBX: c7806a40 ECX: c1677080 EDX: 00027b66 ESI: 00002001 EDI: c170df0c EBP: c170df00 ESP: c178830c DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: c7806b14 CR3: 01775000 CR4: 00000690 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: 00004000 DR7: 00000000 [] free_unmap_vmap_area_noflush+0x6e/0x70 [] remove_vm_area+0x2a/0x70 [] __vunmap+0x45/0xe0 [] vunmap+0x1e/0x30 [] text_poke+0x95/0x150 [] alternatives_smp_unlock+0x49/0x60 [] alternative_instructions+0x11b/0x124 [] check_bugs+0xbd/0xdc [] start_kernel+0x2ed/0x360 [] __init_begin+0x9e/0xa9 [] 0xffffffff It happened here: $ addr2line -e vmlinux -i c1096df7 mm/vmalloc.c:540 Code: list_for_each_entry(va, &valist, purge_list) __free_vmap_area(va); It's this instruction: mov 0x20(%ebx),%edx Which corresponds to a dereference of va->purge_list.next: (gdb) p ((struct vmap_area *) 0)->purge_list.next Cannot access memory at address 0x20 It seems that we should use "safe" list traversal here, as the element is freed inside the loop. Please verify that this is the right fix. Acked-by: Nick Piggin Signed-off-by: Vegard Nossum Cc: Pekka Enberg Cc: Ingo Molnar Cc: "Paul E. McKenney" Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ed3705e4b83..520a7598026 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -505,6 +505,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; + struct vmap_area *n_va; int nr = 0; /* @@ -544,7 +545,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry(va, &valist, purge_list) + list_for_each_entry_safe(va, n_va, &valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } -- cgit v1.2.3-70-g09d2