From 90459ce06f410b983540be56209c0abcbce23944 Mon Sep 17 00:00:00 2001
From: Bob Liu <lliubbo@gmail.com>
Date: Thu, 4 Aug 2011 11:02:33 +0200
Subject: percpu: rename pcpu_mem_alloc to pcpu_mem_zalloc

Currently pcpu_mem_alloc() is implemented always return zeroed memory.
So rename it to make user like pcpu_get_pages_and_bitmap() know don't
reinit it.

Signed-off-by: Bob Liu <lliubbo@gmail.com>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/percpu.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'mm/percpu.c')

diff --git a/mm/percpu.c b/mm/percpu.c
index bf80e55dbed..28c37a2e2de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -273,11 +273,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
 	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
 
 /**
- * pcpu_mem_alloc - allocate memory
+ * pcpu_mem_zalloc - allocate memory
  * @size: bytes to allocate
  *
  * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vmalloc() is used.  The returned
+ * kzalloc() is used; otherwise, vzalloc() is used.  The returned
  * memory is always zeroed.
  *
  * CONTEXT:
@@ -286,7 +286,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_mem_alloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size)
 {
 	if (WARN_ON_ONCE(!slab_is_available()))
 		return NULL;
@@ -302,7 +302,7 @@ static void *pcpu_mem_alloc(size_t size)
  * @ptr: memory to free
  * @size: size of the area
  *
- * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
+ * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
  */
 static void pcpu_mem_free(void *ptr, size_t size)
 {
@@ -384,7 +384,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
 	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
 	unsigned long flags;
 
-	new = pcpu_mem_alloc(new_size);
+	new = pcpu_mem_zalloc(new_size);
 	if (!new)
 		return -ENOMEM;
 
@@ -604,11 +604,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 {
 	struct pcpu_chunk *chunk;
 
-	chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
+	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
 	if (!chunk)
 		return NULL;
 
-	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
+						sizeof(chunk->map[0]));
 	if (!chunk->map) {
 		kfree(chunk);
 		return NULL;
@@ -1889,7 +1890,7 @@ void __init percpu_init_late(void)
 
 		BUILD_BUG_ON(size > PAGE_SIZE);
 
-		map = pcpu_mem_alloc(size);
+		map = pcpu_mem_zalloc(size);
 		BUG_ON(!map);
 
 		spin_lock_irqsave(&pcpu_lock, flags);
-- 
cgit v1.2.3-70-g09d2


From a855b84c3d8c73220d4d3cd392a7bee7c83de70e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Nov 2011 10:55:35 -0800
Subject: percpu: fix chunk range calculation

Percpu allocator recorded the cpus which map to the first and last
units in pcpu_first/last_unit_cpu respectively and used them to
determine the address range of a chunk - e.g. it assumed that the
first unit has the lowest address in a chunk while the last unit has
the highest address.

This simply isn't true.  Groups in a chunk can have arbitrary positive
or negative offsets from the previous one and there is no guarantee
that the first unit occupies the lowest offset while the last one the
highest.

Fix it by actually comparing unit offsets to determine cpus occupying
the lowest and highest offsets.  Also, rename pcu_first/last_unit_cpu
to pcpu_low/high_unit_cpu to avoid confusion.

The chunk address range is used to flush cache on vmalloc area
map/unmap and decide whether a given address is in the first chunk by
per_cpu_ptr_to_phys() and the bug was discovered by invalid
per_cpu_ptr_to_phys() translation for crash_note.

Kudos to Dave Young for tracking down the problem.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: WANG Cong <xiyou.wangcong@gmail.com>
Reported-by: Dave Young <dyoung@redhat.com>
Tested-by: Dave Young <dyoung@redhat.com>
LKML-Reference: <4EC21F67.10905@redhat.com>
Cc: stable @kernel.org
---
 mm/percpu-vm.c | 12 ++++++------
 mm/percpu.c    | 34 ++++++++++++++++++++--------------
 2 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'mm/percpu.c')

diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 29e3730d2ff..12a48a88c0d 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -142,8 +142,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
 				 int page_start, int page_end)
 {
 	flush_cache_vunmap(
-		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
-		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
 }
 
 static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -205,8 +205,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
 				      int page_start, int page_end)
 {
 	flush_tlb_kernel_range(
-		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
-		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
 }
 
 static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -283,8 +283,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
 				int page_start, int page_end)
 {
 	flush_cache_vmap(
-		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
-		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
+		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
 }
 
 /**
diff --git a/mm/percpu.c b/mm/percpu.c
index 28c37a2e2de..2473ff06dc7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly;
 static int pcpu_nr_slots __read_mostly;
 static size_t pcpu_chunk_struct_size __read_mostly;
 
-/* cpus with the lowest and highest unit numbers */
-static unsigned int pcpu_first_unit_cpu __read_mostly;
-static unsigned int pcpu_last_unit_cpu __read_mostly;
+/* cpus with the lowest and highest unit addresses */
+static unsigned int pcpu_low_unit_cpu __read_mostly;
+static unsigned int pcpu_high_unit_cpu __read_mostly;
 
 /* the address of the first chunk which starts with the kernel static area */
 void *pcpu_base_addr __read_mostly;
@@ -985,19 +985,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
 {
 	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
 	bool in_first_chunk = false;
-	unsigned long first_start, first_end;
+	unsigned long first_low, first_high;
 	unsigned int cpu;
 
 	/*
-	 * The following test on first_start/end isn't strictly
+	 * The following test on unit_low/high isn't strictly
 	 * necessary but will speed up lookups of addresses which
 	 * aren't in the first chunk.
 	 */
-	first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
-	first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
-				    pcpu_unit_pages);
-	if ((unsigned long)addr >= first_start &&
-	    (unsigned long)addr < first_end) {
+	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
+	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
+				     pcpu_unit_pages);
+	if ((unsigned long)addr >= first_low &&
+	    (unsigned long)addr < first_high) {
 		for_each_possible_cpu(cpu) {
 			void *start = per_cpu_ptr(base, cpu);
 
@@ -1234,7 +1234,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		unit_map[cpu] = UINT_MAX;
-	pcpu_first_unit_cpu = NR_CPUS;
+
+	pcpu_low_unit_cpu = NR_CPUS;
+	pcpu_high_unit_cpu = NR_CPUS;
 
 	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
 		const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1254,9 +1256,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 			unit_map[cpu] = unit + i;
 			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
 
-			if (pcpu_first_unit_cpu == NR_CPUS)
-				pcpu_first_unit_cpu = cpu;
-			pcpu_last_unit_cpu = cpu;
+			/* determine low/high unit_cpu */
+			if (pcpu_low_unit_cpu == NR_CPUS ||
+			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
+				pcpu_low_unit_cpu = cpu;
+			if (pcpu_high_unit_cpu == NR_CPUS ||
+			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
+				pcpu_high_unit_cpu = cpu;
 		}
 	}
 	pcpu_nr_units = unit;
-- 
cgit v1.2.3-70-g09d2


From 67589c71456b0346500629967292dea3802230b6 Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Wed, 23 Nov 2011 08:20:53 -0800
Subject: percpu: explain why per_cpu_ptr_to_phys() is more complicated than
 necessary

Add comments about current per_cpu_ptr_to_phys implementation to
explain why the logic is more complicated than necessary.

-tj: relocated comment into kerneldoc comment

Signed-off-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/percpu.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'mm/percpu.c')

diff --git a/mm/percpu.c b/mm/percpu.c
index 2473ff06dc7..3bb810a7200 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -978,6 +978,17 @@ bool is_kernel_percpu_address(unsigned long addr)
  * address.  The caller is responsible for ensuring @addr stays valid
  * until this function finishes.
  *
+ * percpu allocator has special setup for the first chunk, which currently
+ * supports either embedding in linear address space or vmalloc mapping,
+ * and, from the second one, the backing allocator (currently either vm or
+ * km) provides translation.
+ *
+ * The addr can be tranlated simply without checking if it falls into the
+ * first chunk. But the current code reflects better how percpu allocator
+ * actually works, and the verification can discover both bugs in percpu
+ * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
+ * code.
+ *
  * RETURNS:
  * The physical address for @addr.
  */
-- 
cgit v1.2.3-70-g09d2


From 9f57bd4d6dc69a4e3bf43044fa00fcd24dd363e3 Mon Sep 17 00:00:00 2001
From: Eugene Surovegin <ebs@ebshome.net>
Date: Thu, 15 Dec 2011 11:25:59 -0800
Subject: percpu: fix per_cpu_ptr_to_phys() handling of non-page-aligned
 addresses

per_cpu_ptr_to_phys() incorrectly rounds up its result for non-kmalloc
case to the page boundary, which is bogus for any non-page-aligned
address.

This affects the only in-tree user of this function - sysfs handler
for per-cpu 'crash_notes' physical address.  The trouble is that the
crash_notes per-cpu variable is not page-aligned:

crash_notes = 0xc08e8ed4
PER-CPU OFFSET VALUES:
 CPU 0: 3711f000
 CPU 1: 37129000
 CPU 2: 37133000
 CPU 3: 3713d000

So, the per-cpu addresses are:
 crash_notes on CPU 0: f7a07ed4 => phys 36b57ed4
 crash_notes on CPU 1: f7a11ed4 => phys 36b4ded4
 crash_notes on CPU 2: f7a1bed4 => phys 36b43ed4
 crash_notes on CPU 3: f7a25ed4 => phys 36b39ed4

However, /sys/devices/system/cpu/cpu*/crash_notes says:
 /sys/devices/system/cpu/cpu0/crash_notes: 36b57000
 /sys/devices/system/cpu/cpu1/crash_notes: 36b4d000
 /sys/devices/system/cpu/cpu2/crash_notes: 36b43000
 /sys/devices/system/cpu/cpu3/crash_notes: 36b39000

As you can see, all values are rounded down to a page
boundary. Consequently, this is where kexec sets up the NOTE segments,
and thus where the secondary kernel is looking for them. However, when
the first kernel crashes, it saves the notes to the unaligned
addresses, where they are not found.

Fix it by adding offset_in_page() to the translated page address.

-tj: Combined Eugene's and Petr's commit messages.

Signed-off-by: Eugene Surovegin <ebs@ebshome.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Petr Tesarik <ptesarik@suse.cz>
Cc: stable@kernel.org
---
 mm/percpu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm/percpu.c')

diff --git a/mm/percpu.c b/mm/percpu.c
index 3bb810a7200..716eb4acf2f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1023,9 +1023,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
 		if (!is_vmalloc_addr(addr))
 			return __pa(addr);
 		else
-			return page_to_phys(vmalloc_to_page(addr));
+			return page_to_phys(vmalloc_to_page(addr)) +
+			       offset_in_page(addr);
 	} else
-		return page_to_phys(pcpu_addr_to_page(addr));
+		return page_to_phys(pcpu_addr_to_page(addr)) +
+		       offset_in_page(addr);
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2