summaryrefslogtreecommitdiffstats
path: root/mm/percpu.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/percpu.c')
-rw-r--r--mm/percpu.c213
1 files changed, 90 insertions, 123 deletions
diff --git a/mm/percpu.c b/mm/percpu.c
index 1aa5d8fbca1..3311c8919f3 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,12 @@
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running). Unit grows as
- * necessary and all units grow or shrink in unison. When a chunk is
- * filled up, another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of nr_cpu_ids units and the first chunk is used
+ * for static percpu variables in the kernel image (special boot time
+ * alloc/init handling necessary as these areas need to be brought up
+ * before allocation services are running). Unit grows as necessary
+ * and all units grow or shrink in unison. When a chunk is filled up,
+ * another chunk is allocated. ie. in vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
@@ -23,7 +23,7 @@
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
* c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
- * percpu base registers UNIT_SIZE apart.
+ * percpu base registers pcpu_unit_size apart.
*
* There are usually many small percpu allocations many of them as
* small as 4 bytes. The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
* region and negative allocated. Allocation inside a chunk is done
* by scanning this map sequentially and serving the first matching
* entry. This is mostly copied from the percpu_modalloc() allocator.
- * Chunks are also linked into a rb tree to ease address to chunk
- * mapping during free.
+ * Chunks can be determined from the address using the index field
+ * in the page struct. The index field contains a pointer to the chunk.
*
* To use this allocator, arch code should do the followings.
*
@@ -61,7 +61,6 @@
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
- struct rb_node rb_node; /* key is chunk->vm->addr */
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
struct vm_struct *vm; /* mapped vmalloc region */
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
-/* optional reserved chunk, only accessible for reserved allocations */
+/*
+ * The first chunk which always exists. Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+static struct pcpu_chunk *pcpu_first_chunk;
+
+/*
+ * Optional reserved chunk. This chunk reserves part of the first
+ * chunk and serves it for reserved allocations. The amount of
+ * reserved offset is in pcpu_reserved_chunk_limit. When reserved
+ * area doesn't exist, the following variables contain NULL and 0
+ * respectively.
+ */
static struct pcpu_chunk *pcpu_reserved_chunk;
-/* offset limit of the reserved chunk */
static int pcpu_reserved_chunk_limit;
/*
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
* protects allocation/reclaim paths, chunks and chunk->page arrays.
* The latter is a spinlock and protects the index data structures -
- * chunk slots, rbtree, chunks and area maps in chunks.
+ * chunk slots, chunks and area maps in chunks.
*
* During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
-static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
/* reclaim work to release fully free chunks, scheduled from free path */
static void pcpu_reclaim(struct work_struct *work);
@@ -188,7 +197,24 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
int page_idx)
{
- return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+ /*
+ * Any possible cpu id can be used here, so there's no need to
+ * worry about preemption or cpu hotplug.
+ */
+ return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
+ page_idx) != NULL;
+}
+
+/* set the pointer to a chunk in a page struct */
+static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
+{
+ page->index = (unsigned long)pcpu;
+}
+
+/* obtain pointer to a chunk from a page struct */
+static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
+{
+ return (struct pcpu_chunk *)page->index;
}
/**
@@ -257,93 +283,34 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
}
}
-static struct rb_node **pcpu_chunk_rb_search(void *addr,
- struct rb_node **parentp)
-{
- struct rb_node **p = &pcpu_addr_root.rb_node;
- struct rb_node *parent = NULL;
- struct pcpu_chunk *chunk;
-
- while (*p) {
- parent = *p;
- chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
-
- if (addr < chunk->vm->addr)
- p = &(*p)->rb_left;
- else if (addr > chunk->vm->addr)
- p = &(*p)->rb_right;
- else
- break;
- }
-
- if (parentp)
- *parentp = parent;
- return p;
-}
-
/**
- * pcpu_chunk_addr_search - search for chunk containing specified address
- * @addr: address to search for
- *
- * Look for chunk which might contain @addr. More specifically, it
- * searchs for the chunk with the highest start address which isn't
- * beyond @addr.
- *
- * CONTEXT:
- * pcpu_lock.
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
*
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
- struct rb_node *n, *parent;
- struct pcpu_chunk *chunk;
-
- /* is it in the reserved chunk? */
- if (pcpu_reserved_chunk) {
- void *start = pcpu_reserved_chunk->vm->addr;
+ void *first_start = pcpu_first_chunk->vm->addr;
- if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+ /* is it in the first chunk? */
+ if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+ /* is it in the reserved area? */
+ if (addr < first_start + pcpu_reserved_chunk_limit)
return pcpu_reserved_chunk;
+ return pcpu_first_chunk;
}
- /* nah... search the regular ones */
- n = *pcpu_chunk_rb_search(addr, &parent);
- if (!n) {
- /* no exactly matching chunk, the parent is the closest */
- n = parent;
- BUG_ON(!n);
- }
- chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-
- if (addr < chunk->vm->addr) {
- /* the parent was the next one, look for the previous one */
- n = rb_prev(n);
- BUG_ON(!n);
- chunk = rb_entry(n, struct pcpu_chunk, rb_node);
- }
-
- return chunk;
-}
-
-/**
- * pcpu_chunk_addr_insert - insert chunk into address rb tree
- * @new: chunk to insert
- *
- * Insert @new into address rb tree.
- *
- * CONTEXT:
- * pcpu_lock.
- */
-static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
-{
- struct rb_node **p, *parent;
-
- p = pcpu_chunk_rb_search(new->vm->addr, &parent);
- BUG_ON(*p);
- rb_link_node(&new->rb_node, parent, p);
- rb_insert_color(&new->rb_node, &pcpu_addr_root);
+ /*
+ * The address is relative to unit0 which might be unused and
+ * thus unmapped. Offset the address to the unit space of the
+ * current processor before looking it up in the vmalloc
+ * space. Note that any possible cpu id can be used here, so
+ * there's no need to worry about preemption or cpu hotplug.
+ */
+ addr += raw_smp_processor_id() * pcpu_unit_size;
+ return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
/**
@@ -595,16 +562,16 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
* @chunk: chunk of interest
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
- * @flush: whether to flush cache and tlb or not
+ * @flush_tlb: whether to flush tlb or not
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* If @flush is true, vcache is flushed before unmapping and tlb
* after.
*/
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
- bool flush)
+ bool flush_tlb)
{
- unsigned int last = num_possible_cpus() - 1;
+ unsigned int last = nr_cpu_ids - 1;
unsigned int cpu;
/* unmap must not be done on immutable chunk */
@@ -615,9 +582,8 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
* the whole region at once rather than doing it for each cpu.
* This could be an overkill but is more scalable.
*/
- if (flush)
- flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
- pcpu_chunk_addr(chunk, last, page_end));
+ flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+ pcpu_chunk_addr(chunk, last, page_end));
for_each_possible_cpu(cpu)
unmap_kernel_range_noflush(
@@ -625,7 +591,7 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
(page_end - page_start) << PAGE_SHIFT);
/* ditto as flush_cache_vunmap() */
- if (flush)
+ if (flush_tlb)
flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
}
@@ -690,7 +656,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
*/
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
{
- unsigned int last = num_possible_cpus() - 1;
+ unsigned int last = nr_cpu_ids - 1;
unsigned int cpu;
int err;
@@ -755,6 +721,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
alloc_mask, 0);
if (!*pagep)
goto err;
+ pcpu_set_page_chunk(*pagep, chunk);
}
}
@@ -795,7 +762,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->page = chunk->page_ar;
- chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+ chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
@@ -879,7 +846,6 @@ restart:
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
- pcpu_chunk_addr_insert(chunk);
goto restart;
area_found:
@@ -968,7 +934,6 @@ static void pcpu_reclaim(struct work_struct *work)
if (chunk == list_first_entry(head, struct pcpu_chunk, list))
continue;
- rb_erase(&chunk->rb_node, &pcpu_addr_root);
list_move(&chunk->list, &todo);
}
@@ -1115,9 +1080,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
PFN_UP(size_sum));
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
- pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+ pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
- + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+ + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1147,7 +1112,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
if (reserved_size) {
schunk->free_size = reserved_size;
- pcpu_reserved_chunk = schunk; /* not for dynamic alloc */
+ pcpu_reserved_chunk = schunk;
+ pcpu_reserved_chunk_limit = static_size + reserved_size;
} else {
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
@@ -1158,8 +1124,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;
- pcpu_reserved_chunk_limit = static_size + schunk->free_size;
-
/* init dynamic chunk if necessary */
if (dyn_size) {
dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1190,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
}
/* link the first chunk in */
- if (!dchunk) {
- pcpu_chunk_relocate(schunk, -1);
- pcpu_chunk_addr_insert(schunk);
- } else {
- pcpu_chunk_relocate(dchunk, -1);
- pcpu_chunk_addr_insert(dchunk);
- }
+ pcpu_first_chunk = dchunk ?: schunk;
+ pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
@@ -1287,6 +1246,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
ssize_t dyn_size, ssize_t unit_size)
{
+ size_t chunk_size;
unsigned int cpu;
/* determine parameters and allocate */
@@ -1301,19 +1261,26 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
} else
pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
- pcpue_ptr = __alloc_bootmem_nopanic(
- num_possible_cpus() * pcpue_unit_size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
- if (!pcpue_ptr)
+ chunk_size = pcpue_unit_size * nr_cpu_ids;
+
+ pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+ if (!pcpue_ptr) {
+ pr_warning("PERCPU: failed to allocate %zu bytes for "
+ "embedding\n", chunk_size);
return -ENOMEM;
+ }
/* return the leftover and copy */
- for_each_possible_cpu(cpu) {
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
- free_bootmem(__pa(ptr + pcpue_size),
- pcpue_unit_size - pcpue_size);
- memcpy(ptr, __per_cpu_load, static_size);
+ if (cpu_possible(cpu)) {
+ free_bootmem(__pa(ptr + pcpue_size),
+ pcpue_unit_size - pcpue_size);
+ memcpy(ptr, __per_cpu_load, static_size);
+ } else
+ free_bootmem(__pa(ptr), pcpue_unit_size);
}
/* we're ready, commit */