diff options
Diffstat (limited to 'mm/percpu.c')
-rw-r--r-- | mm/percpu.c | 213 |
1 files changed, 90 insertions, 123 deletions
diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8fbca1..3311c8919f3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,12 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of nr_cpu_ids units and the first chunk is used + * for static percpu variables in the kernel image (special boot time + * alloc/init handling necessary as these areas need to be brought up + * before allocation services are running). Unit grows as necessary + * and all units grow or shrink in unison. When a chunk is filled up, + * another chunk is allocated. ie. in vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -23,7 +23,7 @@ * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers UNIT_SIZE apart. + * percpu base registers pcpu_unit_size apart. * * There are usually many small percpu allocations many of them as * small as 4 bytes. The allocator organizes chunks into lists @@ -38,8 +38,8 @@ * region and negative allocated. Allocation inside a chunk is done * by scanning this map sequentially and serving the first matching * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks are also linked into a rb tree to ease address to chunk - * mapping during free. + * Chunks can be determined from the address using the index field + * in the page struct. The index field contains a pointer to the chunk. * * To use this allocator, arch code should do the followings. * @@ -61,7 +61,6 @@ #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> -#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> @@ -88,7 +87,6 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ - struct rb_node rb_node; /* key is chunk->vm->addr */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ struct vm_struct *vm; /* mapped vmalloc region */ @@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* optional reserved chunk, only accessible for reserved allocations */ +/* + * The first chunk which always exists. Note that unlike other + * chunks, this one can be allocated and mapped in several different + * ways and thus often doesn't live in the vmalloc area. + */ +static struct pcpu_chunk *pcpu_first_chunk; + +/* + * Optional reserved chunk. This chunk reserves part of the first + * chunk and serves it for reserved allocations. The amount of + * reserved offset is in pcpu_reserved_chunk_limit. When reserved + * area doesn't exist, the following variables contain NULL and 0 + * respectively. + */ static struct pcpu_chunk *pcpu_reserved_chunk; -/* offset limit of the reserved chunk */ static int pcpu_reserved_chunk_limit; /* @@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit; * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former * protects allocation/reclaim paths, chunks and chunk->page arrays. * The latter is a spinlock and protects the index data structures - - * chunk slots, rbtree, chunks and area maps in chunks. + * chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ -static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); @@ -188,7 +197,24 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, int page_idx) { - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + /* + * Any possible cpu id can be used here, so there's no need to + * worry about preemption or cpu hotplug. + */ + return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(), + page_idx) != NULL; +} + +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; } /** @@ -257,93 +283,34 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } } -static struct rb_node **pcpu_chunk_rb_search(void *addr, - struct rb_node **parentp) -{ - struct rb_node **p = &pcpu_addr_root.rb_node; - struct rb_node *parent = NULL; - struct pcpu_chunk *chunk; - - while (*p) { - parent = *p; - chunk = rb_entry(parent, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) - p = &(*p)->rb_left; - else if (addr > chunk->vm->addr) - p = &(*p)->rb_right; - else - break; - } - - if (parentp) - *parentp = parent; - return p; -} - /** - * pcpu_chunk_addr_search - search for chunk containing specified address - * @addr: address to search for - * - * Look for chunk which might contain @addr. More specifically, it - * searchs for the chunk with the highest start address which isn't - * beyond @addr. - * - * CONTEXT: - * pcpu_lock. + * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - struct rb_node *n, *parent; - struct pcpu_chunk *chunk; - - /* is it in the reserved chunk? */ - if (pcpu_reserved_chunk) { - void *start = pcpu_reserved_chunk->vm->addr; + void *first_start = pcpu_first_chunk->vm->addr; - if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + /* is it in the first chunk? */ + if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + /* is it in the reserved area? */ + if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; + return pcpu_first_chunk; } - /* nah... search the regular ones */ - n = *pcpu_chunk_rb_search(addr, &parent); - if (!n) { - /* no exactly matching chunk, the parent is the closest */ - n = parent; - BUG_ON(!n); - } - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) { - /* the parent was the next one, look for the previous one */ - n = rb_prev(n); - BUG_ON(!n); - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - } - - return chunk; -} - -/** - * pcpu_chunk_addr_insert - insert chunk into address rb tree - * @new: chunk to insert - * - * Insert @new into address rb tree. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) -{ - struct rb_node **p, *parent; - - p = pcpu_chunk_rb_search(new->vm->addr, &parent); - BUG_ON(*p); - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &pcpu_addr_root); + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += raw_smp_processor_id() * pcpu_unit_size; + return pcpu_get_page_chunk(vmalloc_to_page(addr)); } /** @@ -595,16 +562,16 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) * @chunk: chunk of interest * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 - * @flush: whether to flush cache and tlb or not + * @flush_tlb: whether to flush tlb or not * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * If @flush is true, vcache is flushed before unmapping and tlb * after. */ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush) + bool flush_tlb) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; /* unmap must not be done on immutable chunk */ @@ -615,9 +582,8 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * the whole region at once rather than doing it for each cpu. * This could be an overkill but is more scalable. */ - if (flush) - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); for_each_possible_cpu(cpu) unmap_kernel_range_noflush( @@ -625,7 +591,7 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, (page_end - page_start) << PAGE_SHIFT); /* ditto as flush_cache_vunmap() */ - if (flush) + if (flush_tlb) flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); } @@ -690,7 +656,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; int err; @@ -755,6 +721,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) alloc_mask, 0); if (!*pagep) goto err; + pcpu_set_page_chunk(*pagep, chunk); } } @@ -795,7 +762,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); if (!chunk->vm) { free_pcpu_chunk(chunk); return NULL; @@ -879,7 +846,6 @@ restart: spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); - pcpu_chunk_addr_insert(chunk); goto restart; area_found: @@ -968,7 +934,6 @@ static void pcpu_reclaim(struct work_struct *work) if (chunk == list_first_entry(head, struct pcpu_chunk, list)) continue; - rb_erase(&chunk->rb_node, &pcpu_addr_root); list_move(&chunk->list, &todo); } @@ -1115,9 +1080,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1147,7 +1112,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (reserved_size) { schunk->free_size = reserved_size; - pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + pcpu_reserved_chunk = schunk; + pcpu_reserved_chunk_limit = static_size + reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ @@ -1158,8 +1124,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; - pcpu_reserved_chunk_limit = static_size + schunk->free_size; - /* init dynamic chunk if necessary */ if (dyn_size) { dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); @@ -1226,13 +1190,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - if (!dchunk) { - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); - } else { - pcpu_chunk_relocate(dchunk, -1); - pcpu_chunk_addr_insert(dchunk); - } + pcpu_first_chunk = dchunk ?: schunk; + pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); @@ -1287,6 +1246,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size) { + size_t chunk_size; unsigned int cpu; /* determine parameters and allocate */ @@ -1301,19 +1261,26 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } else pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - pcpue_ptr = __alloc_bootmem_nopanic( - num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) + chunk_size = pcpue_unit_size * nr_cpu_ids; + + pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) { + pr_warning("PERCPU: failed to allocate %zu bytes for " + "embedding\n", chunk_size); return -ENOMEM; + } /* return the leftover and copy */ - for_each_possible_cpu(cpu) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + if (cpu_possible(cpu)) { + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } else + free_bootmem(__pa(ptr), pcpue_unit_size); } /* we're ready, commit */ |