From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Dec 2008 10:39:53 +0100
Subject: performance counters: x86 support

Implement performance counters for x86 Intel CPUs.

It's simplified right now: the PERFMON CPU feature is assumed,
which is available in Core2 and later Intel CPUs.

The design is flexible to be extended to more CPU types as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d4d4cb7629e..f2fdc186724 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,6 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+	select HAVE_PERF_COUNTERS
 
 config X86_IO_APIC
 	def_bool y
-- 
cgit v1.2.3-70-g09d2


From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Dec 2008 09:00:03 +0100
Subject: perfcounters: restructure x86 counter math

Impact: restructure code

Change counter math from absolute values to clear delta logic.

We try to extract elapsed deltas from the raw hw counter - and put
that into the generic counter.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                   |   2 +-
 arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++-----------------
 include/linux/perf_counter.h       |  15 ++-
 kernel/perf_counter.c              |  68 +----------
 4 files changed, 137 insertions(+), 178 deletions(-)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f2fdc186724..fe94490bab6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,7 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b903f8df72b..5afae13d8d5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] =
 
 const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Adds the elapsed event delta to counter->count (no return value).
+ */
+static void
+x86_perf_counter_update(struct perf_counter *counter,
+			struct hw_perf_counter *hwc, int idx)
+{
+	u64 prev_raw_count, new_raw_count, delta;
+
+	WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
+	/*
+	 * Careful: an NMI might modify the previous counter value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic counter atomically:
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (counter-)time and add that to the generic counter.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count, so we do that by clipping the delta to 32 bits:
+	 */
+	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+	WARN_ON_ONCE((int)delta < 0);
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * so we install an artificial 1<<31 period regardless of
 	 * the generic counter period:
 	 */
-	if (!hwc->irq_period)
+	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
 		hwc->irq_period = 0x7FFFFFFF;
 
-	hwc->next_count	= -(s32)hwc->irq_period;
+	atomic64_set(&hwc->period_left, hwc->irq_period);
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -118,12 +160,6 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore(u64 ctrl)
-{
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
-}
-EXPORT_SYMBOL_GPL(hw_perf_restore);
-
 u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
@@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void)
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+void hw_perf_restore(u64 ctrl)
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore);
+
 static inline void
-__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct perf_counter *counter,
+			   struct hw_perf_counter *hwc, unsigned int idx)
 {
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+	int err;
+
+	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+	WARN_ON_ONCE(err);
 }
 
-static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static void
+__hw_perf_counter_set_period(struct perf_counter *counter,
+			     struct hw_perf_counter *hwc, int idx)
 {
-	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+	s32 left = atomic64_read(&hwc->period_left);
+	s32 period = hwc->irq_period;
+
+	WARN_ON_ONCE(period <= 0);
+
+	/*
+	 * If we are way outside a reasonable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+	}
 
-	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+	WARN_ON_ONCE(left <= 0);
+
+	per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+	/*
+	 * The hw counter starts counting from this counter offset,
+	 * mark it to be able to extract future deltas:
+	 */
+	atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+
+	wrmsr(hwc->counter_base + idx, -left, 0);
 }
 
-static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void
+__x86_perf_counter_enable(struct perf_counter *counter,
+			  struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
 static void x86_perf_counter_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
 
-	__hw_perf_counter_set_period(hwc, idx);
-	__x86_perf_counter_enable(hwc, idx);
-}
-
-static void __hw_perf_save_counter(struct perf_counter *counter,
-				   struct hw_perf_counter *hwc, int idx)
-{
-	s64 raw = -1;
-	s64 delta;
-
-	/*
-	 * Get the raw hw counter value:
-	 */
-	rdmsrl(hwc->counter_base + idx, raw);
-
-	/*
-	 * Rebase it to zero (it started counting at -irq_period),
-	 * to see the delta since ->prev_count:
-	 */
-	delta = (s64)hwc->irq_period + (s64)(s32)raw;
-
-	atomic64_counter_set(counter, hwc->prev_count + delta);
-
-	/*
-	 * Adjust the ->prev_count offset - if we went beyond
-	 * irq_period of units, then we got an IRQ and the counter
-	 * was set back to -irq_period:
-	 */
-	while (delta >= (s64)hwc->irq_period) {
-		hwc->prev_count += hwc->irq_period;
-		delta -= (s64)hwc->irq_period;
-	}
-
-	/*
-	 * Calculate the next raw counter value we'll write into
-	 * the counter at the next sched-in time:
-	 */
-	delta -= (s64)hwc->irq_period;
-
-	hwc->next_count = (s32)delta;
+	__hw_perf_counter_set_period(counter, hwc, idx);
+	__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
 {
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 	int cpu, idx;
 
 	if (!nr_hw_counters)
@@ -241,14 +286,14 @@ void perf_counter_print_debug(void)
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
-		next_count = per_cpu(prev_next_count[idx], cpu);
+		prev_left = per_cpu(prev_left[idx], cpu);
 
 		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
 		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
-			cpu, idx, next_count);
+		printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+			cpu, idx, prev_left);
 	}
 	local_irq_enable();
 }
@@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
-	__hw_perf_save_counter(counter, hwc, idx);
-}
 
-static void x86_perf_counter_read(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	unsigned long addr = hwc->counter_base + hwc->idx;
-	s64 offs, val = -1LL;
-	s32 val32;
-
-	/* Careful: NMI might modify the counter offset */
-	do {
-		offs = hwc->prev_count;
-		rdmsrl(addr, val);
-	} while (offs != hwc->prev_count);
-
-	val32 = (s32) val;
-	val = (s64)hwc->irq_period + (s64)val32;
-	atomic64_counter_set(counter, hwc->prev_count + val);
+	/*
+	 * Drain the remaining delta count out of a counter
+	 * that we are disabling:
+	 */
+	x86_perf_counter_update(counter, hwc, idx);
 }
 
 static void perf_store_irq_data(struct perf_counter *counter, u64 data)
@@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 }
 
 /*
- * NMI-safe enable method:
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
  */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
@@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter)
 
 	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
-	__hw_perf_save_counter(counter, hwc, idx);
-	__hw_perf_counter_set_period(hwc, idx);
+	x86_perf_counter_update(counter, hwc, idx);
+	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__x86_perf_counter_enable(hwc, idx);
+		__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 static void
 perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
 	struct perf_counter *counter, *group_leader = sibling->group_leader;
-	int bit;
-
-	/*
-	 * Store the counter's own timestamp first:
-	 */
-	perf_store_irq_data(sibling, sibling->hw_event.type);
-	perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
 	/*
-	 * Then store sibling timestamps (if any):
+	 * Store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-			/*
-			 * When counter was not in the overflow mask, we have to
-			 * read it from hardware. We read it as well, when it
-			 * has not been read yet and clear the bit in the
-			 * status mask.
-			 */
-			bit = counter->hw.idx;
-			if (!test_bit(bit, (unsigned long *) overflown) ||
-			    test_bit(bit, (unsigned long *) status)) {
-				clear_bit(bit, (unsigned long *) status);
-				perf_save_and_restart(counter);
-			}
-		}
+		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 		perf_store_irq_data(sibling, counter->hw_event.type);
-		perf_store_irq_data(sibling, atomic64_counter_read(counter));
+		perf_store_irq_data(sibling, atomic64_read(&counter->count));
 	}
 }
 
@@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void)
 	perf_counters_initialized = true;
 }
 
+static void x86_perf_counter_read(struct perf_counter *counter)
+{
+	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 	.hw_perf_counter_enable		= x86_perf_counter_enable,
 	.hw_perf_counter_disable	= x86_perf_counter_disable,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8cb095fa442..72460289c65 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -91,14 +91,16 @@ struct perf_counter_hw_event {
  * struct hw_perf_counter - performance counter hardware details:
  */
 struct hw_perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
 	u64				config;
 	unsigned long			config_base;
 	unsigned long			counter_base;
 	int				nmi;
 	unsigned int			idx;
-	u64				prev_count;
+	atomic64_t			prev_count;
 	u64				irq_period;
-	s32				next_count;
+	atomic64_t			period_left;
+#endif
 };
 
 /*
@@ -140,17 +142,15 @@ enum perf_counter_active_state {
  * struct perf_counter - performance counter kernel representation:
  */
 struct perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
 	struct list_head		list_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
 
 	enum perf_counter_active_state	state;
-#if BITS_PER_LONG == 64
 	atomic64_t			count;
-#else
-	atomic_t			count32[2];
-#endif
+
 	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
@@ -172,6 +172,7 @@ struct perf_counter {
 	struct perf_data		*irqdata;
 	struct perf_data		*usrdata;
 	struct perf_data		data[2];
+#endif
 };
 
 /**
@@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
-extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
-extern u64 atomic64_counter_read(struct perf_counter *counter);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 559130b8774..416861ce8b2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
-void __weak hw_perf_restore(u64 ctrl)	{ }
+void __weak hw_perf_restore(u64 ctrl)		{ }
 void __weak hw_perf_counter_setup(void)		{ }
 
-#if BITS_PER_LONG == 64
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 64 bit version - no complications.
- */
-static inline u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-	return (u64) atomic64_read(&counter->count);
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
-	atomic64_set(&counter->count, val);
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic64_read(&counter->count);
-}
-
-#else
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 32 bit version.
- */
-static u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-	u32 cntl, cnth;
-
-	local_irq_disable();
-	do {
-		cnth = atomic_read(&counter->count32[1]);
-		cntl = atomic_read(&counter->count32[0]);
-	} while (cnth != atomic_read(&counter->count32[1]));
-
-	local_irq_enable();
-
-	return cntl | ((u64) cnth) << 32;
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
-	u32 *val32 = (void *)&val64;
-
-	atomic_set(counter->count32 + 0, *(val32 + 0));
-	atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic_read(counter->count32 + 0) |
-		(u64) atomic_read(counter->count32 + 1) << 32;
-}
-
-#endif
-
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info)
 	ctx->nr_counters++;
 
 	if (cpuctx->active_oncpu < perf_max_counters) {
-		counter->hw_ops->hw_perf_counter_enable(counter);
 		counter->state = PERF_COUNTER_STATE_ACTIVE;
 		counter->oncpu = cpu;
 		ctx->nr_active++;
 		cpuctx->active_oncpu++;
+		counter->hw_ops->hw_perf_counter_enable(counter);
 	}
 
 	if (!ctx->task && cpuctx->max_pertask)
@@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 					 __hw_perf_counter_read, counter, 1);
 	}
 
-	return perf_counter_read_safe(counter);
+	return atomic64_read(&counter->count);
 }
 
 /*
@@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 {
 	int cpu = raw_smp_processor_id();
 
-	atomic64_counter_set(counter, cpu_clock(cpu));
+	atomic64_set(&counter->count, cpu_clock(cpu));
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
@@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	atomic64_counter_set(counter, current->se.sum_exec_runtime);
+	atomic64_set(&counter->count, current->se.sum_exec_runtime);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
cgit v1.2.3-70-g09d2


From 51b26ada79b605ed709ddcedbb6012e8f8e0ebed Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 26 Apr 2009 10:12:47 -0700
Subject: x86: unify arch/x86/boot/compressed/vmlinux_*.lds

Look at the:

	diff -u arch/x86/boot/compressed/vmlinux_*.lds

output and realize that they're basically exactly the same except for
trivial naming differences, and the fact that the 64-bit version has a
"pgtable" thing.

So unify them.

There's some trivial cleanup there (make the output format a Kconfig thing
rather than doing #ifdef's for it, and unify both 32-bit and 64-bit BSS
end to "_ebss", where 32-bit used to use the traditional "_end"), but
other than that it's really very mindless and straight conversion.

For example, I think we should aim to remove "startup_32" vs "startup_64",
and just call it "startup", and get rid of one more difference. I didn't
do that.

Also, notice the comment in the unified vmlinux.lds.S talks about
"head_64" and "startup_32" which is an odd and incorrect mix, but that was
actually what the old 64-bit only lds file had, so the confusion isn't
new, and now that mixing is arguably more accurate thanks to the
vmlinux.lds.S file being shared between the two cases ;)

[ Impact: cleanup, unification ]

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                        |  5 +++
 arch/x86/boot/compressed/Makefile       |  2 +-
 arch/x86/boot/compressed/head_32.S      |  8 ++---
 arch/x86/boot/compressed/vmlinux.lds.S  | 57 +++++++++++++++++++++++++++++++++
 arch/x86/boot/compressed/vmlinux_32.lds | 43 -------------------------
 arch/x86/boot/compressed/vmlinux_64.lds | 48 ---------------------------
 6 files changed, 67 insertions(+), 96 deletions(-)
 create mode 100644 arch/x86/boot/compressed/vmlinux.lds.S
 delete mode 100644 arch/x86/boot/compressed/vmlinux_32.lds
 delete mode 100644 arch/x86/boot/compressed/vmlinux_64.lds

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc25b9f5e4c..039c3f04aac 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,11 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 
+config OUTPUT_FORMAT
+	string
+	default "elf32-i386" if X86_32
+	default "elf64-x86-64" if X86_64
+
 config ARCH_DEFCONFIG
 	string
 	default "arch/x86/configs/i386_defconfig" if X86_32
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f857..0f4b5e2abd3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,7 @@ KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
 
-$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
 
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e..85bd3285706 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -88,9 +88,9 @@ ENTRY(startup_32)
  * where decompression in place becomes safe.
  */
 	pushl %esi
-	leal _end(%ebp), %esi
-	leal _end(%ebx), %edi
-	movl $(_end - startup_32), %ecx
+	leal _ebss(%ebp), %esi
+	leal _ebss(%ebx), %edi
+	movl $(_ebss - startup_32), %ecx
 	std
 	rep
 	movsb
@@ -121,7 +121,7 @@ relocated:
  */
 	xorl %eax,%eax
 	leal _edata(%ebx),%edi
-	leal _end(%ebx), %ecx
+	leal _ebss(%ebx), %ecx
 	subl %edi,%ecx
 	cld
 	rep
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
new file mode 100644
index 00000000000..ffcb19134bf
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -0,0 +1,57 @@
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_64
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
+SECTIONS
+{
+	/* Be careful parts of head_64.S assume startup_32 is at
+	 * address 0.
+	 */
+	. = 0;
+	.text.head : {
+		_head = . ;
+		*(.text.head)
+		_ehead = . ;
+	}
+	.rodata.compressed : {
+		*(.rodata.compressed)
+	}
+	.text :	{
+		_text = .; 	/* Text */
+		*(.text)
+		*(.text.*)
+		_etext = . ;
+	}
+	.rodata : {
+		_rodata = . ;
+		*(.rodata)	 /* read-only data */
+		*(.rodata.*)
+		_erodata = . ;
+	}
+	.data :	{
+		_data = . ;
+		*(.data)
+		*(.data.*)
+		_edata = . ;
+	}
+	.bss : {
+		_bss = . ;
+		*(.bss)
+		*(.bss.*)
+		*(COMMON)
+#ifdef CONFIG_X86_64
+		. = ALIGN(8);
+		_end_before_pgt = . ;
+		. = ALIGN(4096);
+		pgtable = . ;
+		. = . + 4096 * 6;
+#endif
+		_ebss = .;
+	}
+}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c4..00000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(startup_32)
-SECTIONS
-{
-	/* Be careful parts of head_32.S assume startup_32 is at
-	 * address 0.
-	 */
-	. = 0;
-	.text.head : {
-		_head = . ;
-		*(.text.head)
-		_ehead = . ;
-	}
-	.rodata.compressed : {
-		*(.rodata.compressed)
-	}
-	.text :	{
-		_text = .; 	/* Text */
-		*(.text)
-		*(.text.*)
-		_etext = . ;
-	}
-	.rodata : {
-		_rodata = . ;
-		*(.rodata)	 /* read-only data */
-		*(.rodata.*)
-		_erodata = . ;
-	}
-	.data :	{
-		_data = . ;
-		*(.data)
-		*(.data.*)
-		_edata = . ;
-	}
-	.bss : {
-		_bss = . ;
-		*(.bss)
-		*(.bss.*)
-		*(COMMON)
-		_end = . ;
-	}
-}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
deleted file mode 100644
index bef1ac891bc..00000000000
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ /dev/null
@@ -1,48 +0,0 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(startup_64)
-SECTIONS
-{
-	/* Be careful parts of head_64.S assume startup_32 is at
-	 * address 0.
-	 */
-	. = 0;
-	.text.head : {
-		_head = . ;
-		*(.text.head)
-		_ehead = . ;
-	}
-	.rodata.compressed : {
-		*(.rodata.compressed)
-	}
-	.text :	{
-		_text = .; 	/* Text */
-		*(.text)
-		*(.text.*)
-		_etext = . ;
-	}
-	.rodata : {
-		_rodata = . ;
-		*(.rodata)	 /* read-only data */
-		*(.rodata.*)
-		_erodata = . ;
-	}
-	.data :	{
-		_data = . ;
-		*(.data)
-		*(.data.*)
-		_edata = . ;
-	}
-	.bss : {
-		_bss = . ;
-		*(.bss)
-		*(.bss.*)
-		*(COMMON)
-		. = ALIGN(8);
-		_end_before_pgt = . ;
-		. = ALIGN(4096);
-		pgtable = . ;
-		. = . + 4096 * 6;
-		_ebss = .;
-	}
-}
-- 
cgit v1.2.3-70-g09d2


From 845adf7266a7ba6970bf982ffd96abc60d2018ab Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 21:20:51 -0700
Subject: x86: add a Kconfig symbol for when relocations are needed

We only need to build relocations when we are building a 32-bit
relocatable kernel.  Rather than unnecessarily complicating the
Makefiles, make an explicit Kbuild symbol for this.

[ Impact: permits future cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
---
 arch/x86/Kconfig | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 039c3f04aac..5aee45356b5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1513,6 +1513,11 @@ config RELOCATABLE
 	  it has been loaded at and the compile time physical address
 	  (CONFIG_PHYSICAL_START) is ignored.
 
+# Relocation on x86-32 needs some additional build support
+config X86_NEED_RELOCS
+	def_bool y
+	depends on X86_32 && RELOCATABLE
+
 config PHYSICAL_ALIGN
 	hex
 	prompt "Alignment value to which kernel should be aligned" if X86_32
-- 
cgit v1.2.3-70-g09d2


From ceefccc93932b920a8ec6f35f596db05202a12fe Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:12:16 -0700
Subject: x86: default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN to 16 MB

Default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN each to 16 MB,
so that both non-relocatable and relocatable kernels are loaded at
16 MB by a non-relocating bootloader.  This is somewhat hacky, but it
appears to be the only way to do this that does not break some
set of existing bootloaders.

We want to avoid the bottom 16 MB because of large page breakup,
memory holes, and ZONE_DMA.  Embedded systems may need to reduce this,
or update their bootloaders to be aware of the new min_alignment field.

[ Impact: performance improvement, avoids problems on some systems ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5aee45356b5..50fbb47f529 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1455,9 +1455,7 @@ config KEXEC_JUMP
 
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-	default "0x1000000" if X86_NUMAQ
-	default "0x200000" if X86_64
-	default "0x100000"
+	default "0x1000000"
 	---help---
 	  This gives the physical address where the kernel is loaded.
 
@@ -1476,15 +1474,15 @@ config PHYSICAL_START
 	  to be specifically compiled to run from a specific memory area
 	  (normally a reserved region) and this option comes handy.
 
-	  So if you are using bzImage for capturing the crash dump, leave
-	  the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
-	  Otherwise if you plan to use vmlinux for capturing the crash dump
-	  change this value to start of the reserved region (Typically 16MB
-	  0x1000000). In other words, it can be set based on the "X" value as
-	  specified in the "crashkernel=YM@XM" command line boot parameter
-	  passed to the panic-ed kernel. Typically this parameter is set as
-	  crashkernel=64M@16M. Please take a look at
-	  Documentation/kdump/kdump.txt for more details about crash dumps.
+	  So if you are using bzImage for capturing the crash dump,
+	  leave the value here unchanged to 0x1000000 and set
+	  CONFIG_RELOCATABLE=y.  Otherwise if you plan to use vmlinux
+	  for capturing the crash dump change this value to start of
+	  the reserved region.  In other words, it can be set based on
+	  the "X" value as specified in the "crashkernel=YM@XM"
+	  command line boot parameter passed to the panic-ed
+	  kernel. Please take a look at Documentation/kdump/kdump.txt
+	  for more details about crash dumps.
 
 	  Usage of bzImage for capturing the crash dump is recommended as
 	  one does not have to build two kernels. Same kernel can be used
@@ -1521,9 +1519,8 @@ config X86_NEED_RELOCS
 config PHYSICAL_ALIGN
 	hex
 	prompt "Alignment value to which kernel should be aligned" if X86_32
-	default "0x100000" if X86_32
-	default "0x200000" if X86_64
-	range 0x2000 0x400000
+	default "0x1000000"
+	range 0x2000 0x1000000
 	---help---
 	  This value puts the alignment restrictions on physical address
 	  where kernel is loaded and run from. Kernel is compiled for an
-- 
cgit v1.2.3-70-g09d2


From 26717808f93a27c22d4853c4fb17fa225f4ccc68 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 7 May 2009 14:19:34 -0700
Subject: x86: make CONFIG_RELOCATABLE the default

Remove the EXPERIMENTAL tag from CONFIG_RELOCATABLE and make it the
default.  Relocatable kernels have been used for a while now, and
should now have identical semantics to non-relocatable kernels when
loaded by a non-relocating bootloader.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/Kconfig')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 50fbb47f529..3e0f80a764a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1495,8 +1495,8 @@ config PHYSICAL_START
 	  Don't change this unless you know what you are doing.
 
 config RELOCATABLE
-	bool "Build a relocatable kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Build a relocatable kernel"
+	default y
 	---help---
 	  This builds a kernel image that retains relocation information
 	  so it can be loaded someplace besides the default 1MB.
-- 
cgit v1.2.3-70-g09d2