summaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r--arch/x86_64/kernel/Makefile3
-rw-r--r--arch/x86_64/kernel/aperture.c2
-rw-r--r--arch/x86_64/kernel/apic.c10
-rw-r--r--arch/x86_64/kernel/e820.c3
-rw-r--r--arch/x86_64/kernel/entry.S3
-rw-r--r--arch/x86_64/kernel/head.S77
-rw-r--r--arch/x86_64/kernel/head64.c14
-rw-r--r--arch/x86_64/kernel/i8259.c12
-rw-r--r--arch/x86_64/kernel/io_apic.c80
-rw-r--r--arch/x86_64/kernel/kprobes.c191
-rw-r--r--arch/x86_64/kernel/mce.c27
-rw-r--r--arch/x86_64/kernel/mce_amd.c538
-rw-r--r--arch/x86_64/kernel/mpparse.c23
-rw-r--r--arch/x86_64/kernel/pci-gart.c12
-rw-r--r--arch/x86_64/kernel/pci-nommu.c2
-rw-r--r--arch/x86_64/kernel/process.c123
-rw-r--r--arch/x86_64/kernel/ptrace.c43
-rw-r--r--arch/x86_64/kernel/reboot.c7
-rw-r--r--arch/x86_64/kernel/setup.c118
-rw-r--r--arch/x86_64/kernel/setup64.c6
-rw-r--r--arch/x86_64/kernel/signal.c17
-rw-r--r--arch/x86_64/kernel/smp.c7
-rw-r--r--arch/x86_64/kernel/smpboot.c120
-rw-r--r--arch/x86_64/kernel/suspend.c92
-rw-r--r--arch/x86_64/kernel/suspend_asm.S17
-rw-r--r--arch/x86_64/kernel/sys_x86_64.c14
-rw-r--r--arch/x86_64/kernel/time.c35
-rw-r--r--arch/x86_64/kernel/traps.c44
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c3
30 files changed, 1185 insertions, 460 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index bcdd0a805fe..fe4cbd1c4b2 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
obj-$(CONFIG_X86_MCE) += mce.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
obj-$(CONFIG_ACPI) += acpi/
obj-$(CONFIG_X86_MSR) += msr.o
@@ -27,7 +28,6 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
-obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o
@@ -41,7 +41,6 @@ CFLAGS_vsyscall.o := $(PROFILING) -g0
bootflag-y += ../../i386/kernel/bootflag.o
cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
topology-y += ../../i386/mach-default/topology.o
-swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
quirks-y += ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 962ad4823b6..c7f4fdd20f0 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -196,7 +196,7 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
void __init iommu_hole_init(void)
{
int fix, num;
- u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
+ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base, last_aper_base = 0;
int valid_agp = 0;
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index b6e7715d877..18691ce4c75 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -833,6 +833,16 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
}
+#ifdef CONFIG_X86_MCE_AMD
+void setup_threshold_lvt(unsigned long lvt_off)
+{
+ unsigned int v = 0;
+ unsigned long reg = (lvt_off << 4) + 0x500;
+ v |= THRESHOLD_APIC_VECTOR;
+ apic_write(reg, v);
+}
+#endif /* CONFIG_X86_MCE_AMD */
+
#undef APIC_DIVISOR
/*
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index ab3f87aaff7..17579a1a174 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -23,8 +23,7 @@
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/bootsetup.h>
-
-extern char _end[];
+#include <asm/sections.h>
/*
* PFN of last memory page.
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7937971d185..9ff42041bb6 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -612,6 +612,9 @@ retint_kernel:
ENTRY(thermal_interrupt)
apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+ENTRY(threshold_interrupt)
+ apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
+
#ifdef CONFIG_SMP
ENTRY(reschedule_interrupt)
apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 4592bf21fca..15290968e49 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -12,6 +12,7 @@
#include <linux/linkage.h>
#include <linux/threads.h>
+#include <linux/init.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/page.h>
@@ -70,7 +71,7 @@ startup_32:
movl %eax, %cr4
/* Setup early boot stage 4 level pagetables */
- movl $(init_level4_pgt - __START_KERNEL_map), %eax
+ movl $(boot_level4_pgt - __START_KERNEL_map), %eax
movl %eax, %cr3
/* Setup EFER (Extended Feature Enable Register) */
@@ -113,7 +114,7 @@ startup_64:
movq %rax, %cr4
/* Setup early boot stage 4 level pagetables. */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ movq $(boot_level4_pgt - __START_KERNEL_map), %rax
movq %rax, %cr3
/* Check if nx is implemented */
@@ -240,20 +241,10 @@ ljumpvector:
ENTRY(stext)
ENTRY(_stext)
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
.org 0x1000
ENTRY(init_level4_pgt)
- .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
- .fill 255,8,0
- .quad 0x000000000000a007 + __PHYSICAL_START
- .fill 254,8,0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
+ /* This gets initialized in x86_64_start_kernel */
+ .fill 512,8,0
.org 0x2000
ENTRY(level3_ident_pgt)
@@ -270,26 +261,26 @@ ENTRY(level3_kernel_pgt)
.org 0x4000
ENTRY(level2_ident_pgt)
/* 40MB for bootup. */
- .quad 0x0000000000000183
- .quad 0x0000000000200183
- .quad 0x0000000000400183
- .quad 0x0000000000600183
- .quad 0x0000000000800183
- .quad 0x0000000000A00183
- .quad 0x0000000000C00183
- .quad 0x0000000000E00183
- .quad 0x0000000001000183
- .quad 0x0000000001200183
- .quad 0x0000000001400183
- .quad 0x0000000001600183
- .quad 0x0000000001800183
- .quad 0x0000000001A00183
- .quad 0x0000000001C00183
- .quad 0x0000000001E00183
- .quad 0x0000000002000183
- .quad 0x0000000002200183
- .quad 0x0000000002400183
- .quad 0x0000000002600183
+ .quad 0x0000000000000083
+ .quad 0x0000000000200083
+ .quad 0x0000000000400083
+ .quad 0x0000000000600083
+ .quad 0x0000000000800083
+ .quad 0x0000000000A00083
+ .quad 0x0000000000C00083
+ .quad 0x0000000000E00083
+ .quad 0x0000000001000083
+ .quad 0x0000000001200083
+ .quad 0x0000000001400083
+ .quad 0x0000000001600083
+ .quad 0x0000000001800083
+ .quad 0x0000000001A00083
+ .quad 0x0000000001C00083
+ .quad 0x0000000001E00083
+ .quad 0x0000000002000083
+ .quad 0x0000000002200083
+ .quad 0x0000000002400083
+ .quad 0x0000000002600083
/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
.globl temp_boot_pmds
temp_boot_pmds:
@@ -350,6 +341,24 @@ ENTRY(wakeup_level4_pgt)
.quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
#endif
+#ifndef CONFIG_HOTPLUG_CPU
+ __INITDATA
+#endif
+ /*
+ * This default setting generates an ident mapping at address 0x100000
+ * and a mapping for the kernel that precisely maps virtual address
+ * 0xffffffff80000000 to physical address 0x000000. (always using
+ * 2Mbyte large pages provided by PAE mode)
+ */
+ .align PAGE_SIZE
+ENTRY(boot_level4_pgt)
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
+ .fill 255,8,0
+ .quad 0x000000000000a007 + __PHYSICAL_START
+ .fill 254,8,0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
+
.data
.align 16
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index cf6ab147a2a..b675c5add01 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -19,14 +19,15 @@
#include <asm/bootsetup.h>
#include <asm/setup.h>
#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
static void __init clear_bss(void)
{
- extern char __bss_start[], __bss_end[];
memset(__bss_start, 0,
- (unsigned long) __bss_end - (unsigned long) __bss_start);
+ (unsigned long) __bss_stop - (unsigned long) __bss_start);
}
#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
@@ -75,8 +76,6 @@ static void __init setup_boot_cpu_data(void)
boot_cpu_data.x86_mask = eax & 0xf;
}
-extern char _end[];
-
void __init x86_64_start_kernel(char * real_mode_data)
{
char *s;
@@ -86,6 +85,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
set_intr_gate(i, early_idt_handler);
asm volatile("lidt %0" :: "m" (idt_descr));
clear_bss();
+
+ /*
+ * switch to init_level4_pgt from boot_level4_pgt
+ */
+ memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
+ asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+
pda_init(0);
copy_bootdata(real_mode_data);
#ifdef CONFIG_SMP
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index b2a238b5a17..6e5101ad3d1 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -492,9 +492,10 @@ void invalidate_interrupt5(void);
void invalidate_interrupt6(void);
void invalidate_interrupt7(void);
void thermal_interrupt(void);
+void threshold_interrupt(void);
void i8254_timer_resume(void);
-static void setup_timer(void)
+static void setup_timer_hardware(void)
{
outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
@@ -505,17 +506,17 @@ static void setup_timer(void)
static int timer_resume(struct sys_device *dev)
{
- setup_timer();
+ setup_timer_hardware();
return 0;
}
void i8254_timer_resume(void)
{
- setup_timer();
+ setup_timer_hardware();
}
static struct sysdev_class timer_sysclass = {
- set_kset_name("timer"),
+ set_kset_name("timer_pit"),
.resume = timer_resume,
};
@@ -580,6 +581,7 @@ void __init init_IRQ(void)
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
#endif
set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+ set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
@@ -594,7 +596,7 @@ void __init init_IRQ(void)
* Set the clock to HZ Hz, we already have a valid
* vector now:
*/
- setup_timer();
+ setup_timer_hardware();
if (!acpi_ioapic)
setup_irq(2, &irq2);
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c8eee20cd51..97154ab058b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -57,7 +57,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
*/
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
/*
@@ -85,6 +85,7 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
int pin; \
struct irq_pin_list *entry = irq_2_pin + irq; \
\
+ BUG_ON(irq >= NR_IRQS); \
for (;;) { \
unsigned int reg; \
pin = entry->pin; \
@@ -127,6 +128,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
}
#endif
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -137,6 +140,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
static int first_free_entry = NR_IRQS;
struct irq_pin_list *entry = irq_2_pin + irq;
+ BUG_ON(irq >= NR_IRQS);
while (entry->next)
entry = irq_2_pin + entry->next;
@@ -144,7 +148,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
entry->next = first_free_entry;
entry = irq_2_pin + entry->next;
if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: whoops");
+ panic("io_apic.c: ran out of irq_2_pin entries!");
}
entry->apic = apic;
entry->pin = pin;
@@ -420,6 +424,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
best_guess = irq;
}
}
+ BUG_ON(best_guess >= NR_IRQS);
return best_guess;
}
@@ -610,6 +615,64 @@ static inline int irq_trigger(int idx)
return MPBIOS_trigger(idx);
}
+static int next_irq = 16;
+
+/*
+ * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+ int i, tries, vector;
+
+ BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+ if (platform_legacy_irq(gsi))
+ return gsi;
+
+ if (gsi_2_irq[gsi] != 0xFF)
+ return (int)gsi_2_irq[gsi];
+
+ tries = NR_IRQS;
+ try_again:
+ vector = assign_irq_vector(gsi);
+
+ /*
+ * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+ * use of vector and if found, return that IRQ. However, we never want
+ * to share legacy IRQs, which usually have a different trigger mode
+ * than PCI.
+ */
+ for (i = 0; i < NR_IRQS; i++)
+ if (IO_APIC_VECTOR(i) == vector)
+ break;
+ if (platform_legacy_irq(i)) {
+ if (--tries >= 0) {
+ IO_APIC_VECTOR(i) = 0;
+ goto try_again;
+ }
+ panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+ }
+ if (i < NR_IRQS) {
+ gsi_2_irq[gsi] = i;
+ printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+ gsi, vector, i);
+ return i;
+ }
+
+ i = next_irq++;
+ BUG_ON(i >= NR_IRQS);
+ gsi_2_irq[gsi] = i;
+ IO_APIC_VECTOR(i) = vector;
+ printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+ gsi, vector, i);
+ return i;
+}
+
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
@@ -639,6 +702,7 @@ static int pin_2_irq(int idx, int apic, int pin)
while (i < apic)
irq += nr_ioapic_registers[i++];
irq += pin;
+ irq = gsi_irq_sharing(irq);
break;
}
default:
@@ -648,6 +712,7 @@ static int pin_2_irq(int idx, int apic, int pin)
break;
}
}
+ BUG_ON(irq >= NR_IRQS);
/*
* PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -663,6 +728,7 @@ static int pin_2_irq(int idx, int apic, int pin)
}
}
}
+ BUG_ON(irq >= NR_IRQS);
return irq;
}
@@ -690,8 +756,8 @@ int assign_irq_vector(int irq)
{
static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
- BUG_ON(irq >= NR_IRQ_VECTORS);
- if (IO_APIC_VECTOR(irq) > 0)
+ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
return IO_APIC_VECTOR(irq);
next:
current_vector += 8;
@@ -699,9 +765,8 @@ next:
goto next;
if (current_vector >= FIRST_SYSTEM_VECTOR) {
- offset++;
- if (!(offset%8))
- return -ENOSPC;
+ /* If we run out of vectors on large boxen, must share them. */
+ offset = (offset + 1) % 8;
current_vector = FIRST_DEVICE_VECTOR + offset;
}
@@ -1917,6 +1982,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
entry.polarity = active_high_low;
entry.mask = 1; /* Disabled (masked) */
+ irq = gsi_irq_sharing(irq);
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index df08c43276a..dddeb678b44 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -34,7 +34,6 @@
#include <linux/config.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>
-#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/preempt.h>
@@ -44,17 +43,10 @@
#include <asm/kdebug.h>
static DECLARE_MUTEX(kprobe_mutex);
-
-static struct kprobe *current_kprobe;
-static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
-static struct kprobe *kprobe_prev;
-static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev;
-static struct pt_regs jprobe_saved_regs;
-static long *jprobe_saved_rsp;
void jprobe_return_end(void);
-/* copy of the kernel stack at the probe fire time */
-static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
/*
* returns non-zero if opcode modifies the interrupt flag.
@@ -77,9 +69,9 @@ static inline int is_IF_modifier(kprobe_opcode_t *insn)
int __kprobes arch_prepare_kprobe(struct kprobe *p)
{
/* insn: must be on special executable page on x86_64. */
- up(&kprobe_mutex);
- p->ainsn.insn = get_insn_slot();
down(&kprobe_mutex);
+ p->ainsn.insn = get_insn_slot();
+ up(&kprobe_mutex);
if (!p->ainsn.insn) {
return -ENOMEM;
}
@@ -231,34 +223,35 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
void __kprobes arch_remove_kprobe(struct kprobe *p)
{
- up(&kprobe_mutex);
- free_insn_slot(p->ainsn.insn);
down(&kprobe_mutex);
+ free_insn_slot(p->ainsn.insn);
+ up(&kprobe_mutex);
}
-static inline void save_previous_kprobe(void)
+static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- kprobe_prev = current_kprobe;
- kprobe_status_prev = kprobe_status;
- kprobe_old_rflags_prev = kprobe_old_rflags;
- kprobe_saved_rflags_prev = kprobe_saved_rflags;
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+ kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
+ kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
}
-static inline void restore_previous_kprobe(void)
+static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- current_kprobe = kprobe_prev;
- kprobe_status = kprobe_status_prev;
- kprobe_old_rflags = kprobe_old_rflags_prev;
- kprobe_saved_rflags = kprobe_saved_rflags_prev;
+ __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
+ kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
}
-static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs)
+static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
- current_kprobe = p;
- kprobe_saved_rflags = kprobe_old_rflags
+ __get_cpu_var(current_kprobe) = p;
+ kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
= (regs->eflags & (TF_MASK | IF_MASK));
if (is_IF_modifier(p->ainsn.insn))
- kprobe_saved_rflags &= ~IF_MASK;
+ kcb->kprobe_saved_rflags &= ~IF_MASK;
}
static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -272,6 +265,7 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
regs->rip = (unsigned long)p->ainsn.insn;
}
+/* Called with kretprobe_lock held */
void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
struct pt_regs *regs)
{
@@ -292,32 +286,30 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
}
}
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
- */
int __kprobes kprobe_handler(struct pt_regs *regs)
{
struct kprobe *p;
int ret = 0;
kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
+ struct kprobe_ctlblk *kcb;
- /* We're in an interrupt, but this is clear and BUG()-safe. */
+ /*
+ * We don't want to be preempted for the entire
+ * duration of kprobe processing
+ */
preempt_disable();
+ kcb = get_kprobe_ctlblk();
/* Check we're not actually recursing */
if (kprobe_running()) {
- /* We *are* holding lock here, so this is safe.
- Disarm the probe we just hit, and ignore it. */
p = get_kprobe(addr);
if (p) {
- if (kprobe_status == KPROBE_HIT_SS &&
+ if (kcb->kprobe_status == KPROBE_HIT_SS &&
*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
regs->eflags &= ~TF_MASK;
- regs->eflags |= kprobe_saved_rflags;
- unlock_kprobes();
+ regs->eflags |= kcb->kprobe_saved_rflags;
goto no_kprobe;
- } else if (kprobe_status == KPROBE_HIT_SSDONE) {
+ } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
/* TODO: Provide re-entrancy from
* post_kprobes_handler() and avoid exception
* stack corruption while single-stepping on
@@ -325,6 +317,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
*/
arch_disarm_kprobe(p);
regs->rip = (unsigned long)p->addr;
+ reset_current_kprobe();
ret = 1;
} else {
/* We have reentered the kprobe_handler(), since
@@ -334,27 +327,24 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
* of the new probe without calling any user
* handlers.
*/
- save_previous_kprobe();
- set_current_kprobe(p, regs);
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
p->nmissed++;
prepare_singlestep(p, regs);
- kprobe_status = KPROBE_REENTER;
+ kcb->kprobe_status = KPROBE_REENTER;
return 1;
}
} else {
- p = current_kprobe;
+ p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
goto ss_probe;
}
}
- /* If it's not ours, can't be delete race, (we hold lock). */
goto no_kprobe;
}
- lock_kprobes();
p = get_kprobe(addr);
if (!p) {
- unlock_kprobes();
if (*addr != BREAKPOINT_INSTRUCTION) {
/*
* The breakpoint instruction was removed right
@@ -372,8 +362,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
goto no_kprobe;
}
- kprobe_status = KPROBE_HIT_ACTIVE;
- set_current_kprobe(p, regs);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (p->pre_handler && p->pre_handler(p, regs))
/* handler has already set things up, so skip ss setup */
@@ -381,7 +371,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
ss_probe:
prepare_singlestep(p, regs);
- kprobe_status = KPROBE_HIT_SS;
+ kcb->kprobe_status = KPROBE_HIT_SS;
return 1;
no_kprobe:
@@ -409,9 +399,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
struct kretprobe_instance *ri = NULL;
struct hlist_head *head;
struct hlist_node *node, *tmp;
- unsigned long orig_ret_address = 0;
+ unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+ spin_lock_irqsave(&kretprobe_lock, flags);
head = kretprobe_inst_table_head(current);
/*
@@ -450,13 +441,14 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
regs->rip = orig_ret_address;
- unlock_kprobes();
+ reset_current_kprobe();
+ spin_unlock_irqrestore(&kretprobe_lock, flags);
preempt_enable_no_resched();
/*
* By returning a non-zero value, we are telling
- * kprobe_handler() that we have handled unlocking
- * and re-enabling preemption.
+ * kprobe_handler() that we don't want the post_handler
+ * to run (and have re-enabled preemption)
*/
return 1;
}
@@ -483,7 +475,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
*/
-static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
+static void __kprobes resume_execution(struct kprobe *p,
+ struct pt_regs *regs, struct kprobe_ctlblk *kcb)
{
unsigned long *tos = (unsigned long *)regs->rsp;
unsigned long next_rip = 0;
@@ -498,7 +491,7 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
switch (*insn) {
case 0x9c: /* pushfl */
*tos &= ~(TF_MASK | IF_MASK);
- *tos |= kprobe_old_rflags;
+ *tos |= kcb->kprobe_old_rflags;
break;
case 0xc3: /* ret/lret */
case 0xcb:
@@ -537,30 +530,28 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
}
}
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function. And we hold kprobe lock.
- */
int __kprobes post_kprobe_handler(struct pt_regs *regs)
{
- if (!kprobe_running())
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (!cur)
return 0;
- if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) {
- kprobe_status = KPROBE_HIT_SSDONE;
- current_kprobe->post_handler(current_kprobe, regs, 0);
+ if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ cur->post_handler(cur, regs, 0);
}
- resume_execution(current_kprobe, regs);
- regs->eflags |= kprobe_saved_rflags;
+ resume_execution(cur, regs, kcb);
+ regs->eflags |= kcb->kprobe_saved_rflags;
/* Restore the original saved kprobes variables and continue. */
- if (kprobe_status == KPROBE_REENTER) {
- restore_previous_kprobe();
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe(kcb);
goto out;
- } else {
- unlock_kprobes();
}
+ reset_current_kprobe();
out:
preempt_enable_no_resched();
@@ -575,18 +566,19 @@ out:
return 1;
}
-/* Interrupts disabled, kprobe_lock held. */
int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
- if (current_kprobe->fault_handler
- && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
return 1;
- if (kprobe_status & KPROBE_HIT_SS) {
- resume_execution(current_kprobe, regs);
- regs->eflags |= kprobe_old_rflags;
+ if (kcb->kprobe_status & KPROBE_HIT_SS) {
+ resume_execution(cur, regs, kcb);
+ regs->eflags |= kcb->kprobe_old_rflags;
- unlock_kprobes();
+ reset_current_kprobe();
preempt_enable_no_resched();
}
return 0;
@@ -599,39 +591,41 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
struct die_args *args = (struct die_args *)data;
+ int ret = NOTIFY_DONE;
+
switch (val) {
case DIE_INT3:
if (kprobe_handler(args->regs))
- return NOTIFY_STOP;
+ ret = NOTIFY_STOP;
break;
case DIE_DEBUG:
if (post_kprobe_handler(args->regs))
- return NOTIFY_STOP;
+ ret = NOTIFY_STOP;
break;
case DIE_GPF:
- if (kprobe_running() &&
- kprobe_fault_handler(args->regs, args->trapnr))
- return NOTIFY_STOP;
- break;
case DIE_PAGE_FAULT:
+ /* kprobe_running() needs smp_processor_id() */
+ preempt_disable();
if (kprobe_running() &&
kprobe_fault_handler(args->regs, args->trapnr))
- return NOTIFY_STOP;
+ ret = NOTIFY_STOP;
+ preempt_enable();
break;
default:
break;
}
- return NOTIFY_DONE;
+ return ret;
}
int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- jprobe_saved_regs = *regs;
- jprobe_saved_rsp = (long *) regs->rsp;
- addr = (unsigned long)jprobe_saved_rsp;
+ kcb->jprobe_saved_regs = *regs;
+ kcb->jprobe_saved_rsp = (long *) regs->rsp;
+ addr = (unsigned long)(kcb->jprobe_saved_rsp);
/*
* As Linus pointed out, gcc assumes that the callee
* owns the argument space and could overwrite it, e.g.
@@ -639,7 +633,8 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
* we also save and restore enough stack bytes to cover
* the argument area.
*/
- memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
+ memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
+ MIN_STACK_SIZE(addr));
regs->eflags &= ~IF_MASK;
regs->rip = (unsigned long)(jp->entry);
return 1;
@@ -647,36 +642,40 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
void __kprobes jprobe_return(void)
{
- preempt_enable_no_resched();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
asm volatile (" xchg %%rbx,%%rsp \n"
" int3 \n"
" .globl jprobe_return_end \n"
" jprobe_return_end: \n"
" nop \n"::"b"
- (jprobe_saved_rsp):"memory");
+ (kcb->jprobe_saved_rsp):"memory");
}
int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->rip - 1);
- unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
+ unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
struct jprobe *jp = container_of(p, struct jprobe, kp);
if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
- if ((long *)regs->rsp != jprobe_saved_rsp) {
+ if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
struct pt_regs *saved_regs =
- container_of(jprobe_saved_rsp, struct pt_regs, rsp);
+ container_of(kcb->jprobe_saved_rsp,
+ struct pt_regs, rsp);
printk("current rsp %p does not match saved rsp %p\n",
- (long *)regs->rsp, jprobe_saved_rsp);
+ (long *)regs->rsp, kcb->jprobe_saved_rsp);
printk("Saved registers for jprobe %p\n", jp);
show_registers(saved_regs);
printk("Current registers\n");
show_registers(regs);
BUG();
}
- *regs = jprobe_saved_regs;
- memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
+ *regs = kcb->jprobe_saved_regs;
+ memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
MIN_STACK_SIZE(stack_addr));
+ preempt_enable_no_resched();
return 1;
}
return 0;
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 08203b07f4b..183dc610542 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -37,7 +37,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
-static int mce_bootlog;
+static int mce_bootlog = 1;
/*
* Lockless MCE logging infrastructure.
@@ -54,9 +54,12 @@ void mce_log(struct mce *mce)
{
unsigned next, entry;
mce->finished = 0;
- smp_wmb();
+ wmb();
for (;;) {
entry = rcu_dereference(mcelog.next);
+ /* The rmb forces the compiler to reload next in each
+ iteration */
+ rmb();
for (;;) {
/* When the buffer fills up discard new entries. Assume
that the earlier errors are the more interesting. */
@@ -69,6 +72,7 @@ void mce_log(struct mce *mce)
entry++;
continue;
}
+ break;
}
smp_rmb();
next = entry + 1;
@@ -76,9 +80,9 @@ void mce_log(struct mce *mce)
break;
}
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- smp_wmb();
+ wmb();
mcelog.entry[entry].finished = 1;
- smp_wmb();
+ wmb();
if (!test_and_set_bit(0, &console_logged))
notify_user = 1;
@@ -343,7 +347,11 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
clear_bit(10, &bank[4]);
+ /* Lots of broken BIOS around that don't clear them
+ by default and leave crap in there. Don't log. */
+ mce_bootlog = 0;
}
+
}
static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
@@ -352,6 +360,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
break;
+ case X86_VENDOR_AMD:
+ mce_amd_feature_init(c);
+ break;
default:
break;
}
@@ -491,16 +502,16 @@ static int __init mcheck_disable(char *str)
/* mce=off disables machine check. Note you can reenable it later
using sysfs.
mce=TOLERANCELEVEL (number, see above)
- mce=bootlog Log MCEs from before booting. Disabled by default to work
- around buggy BIOS that leave bogus MCEs. */
+ mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
if (*str == '=')
str++;
if (!strcmp(str, "off"))
mce_dont_init = 1;
- else if (!strcmp(str, "bootlog"))
- mce_bootlog = 1;
+ else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+ mce_bootlog = str[0] == 'b';
else if (isdigit(str[0]))
get_option(&str, &tolerant);
else
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
new file mode 100644
index 00000000000..1f76175ace0
--- /dev/null
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -0,0 +1,538 @@
+/*
+ * (c) 2005 Advanced Micro Devices, Inc.
+ * Your use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Written by Jacob Shin - AMD, Inc.
+ *
+ * Support : jacob.shin@amd.com
+ *
+ * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F.
+ * MC4_MISC0 exists per physical processor.
+ *
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kobject.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/sysdev.h>
+#include <linux/sysfs.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/percpu.h>
+
+#define PFX "mce_threshold: "
+#define VERSION "version 1.00.9"
+#define NR_BANKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_OVERFLOW 0x0001000000000000L
+
+struct threshold_bank {
+ unsigned int cpu;
+ u8 bank;
+ u8 interrupt_enable;
+ u16 threshold_limit;
+ struct kobject kobj;
+};
+
+static struct threshold_bank threshold_defaults = {
+ .interrupt_enable = 0,
+ .threshold_limit = THRESHOLD_MAX,
+};
+
+#ifdef CONFIG_SMP
+static unsigned char shared_bank[NR_BANKS] = {
+ 0, 0, 0, 0, 1
+};
+#endif
+
+static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+
+/*
+ * CPU Initialization
+ */
+
+/* must be called with correct cpu affinity */
+static void threshold_restart_bank(struct threshold_bank *b,
+ int reset, u16 old_limit)
+{
+ u32 mci_misc_hi, mci_misc_lo;
+
+ rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+
+ if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ reset = 1; /* limit cannot be lower than err count */
+
+ if (reset) { /* reset err count and overflow bit */
+ mci_misc_hi =
+ (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+ (THRESHOLD_MAX - b->threshold_limit);
+ } else if (old_limit) { /* change limit w/o reset */
+ int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+ (old_limit - b->threshold_limit);
+ mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+ (new_count & THRESHOLD_MAX);
+ }
+
+ b->interrupt_enable ?
+ (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+ (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+
+ mci_misc_hi |= MASK_COUNT_EN_HI;
+ wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+}
+
+void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+ int bank;
+ u32 mci_misc_lo, mci_misc_hi;
+ unsigned int cpu = smp_processor_id();
+
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi);
+
+ /* !valid, !counter present, bios locked */
+ if (!(mci_misc_hi & MASK_VALID_HI) ||
+ !(mci_misc_hi & MASK_VALID_HI >> 1) ||
+ (mci_misc_hi & MASK_VALID_HI >> 2))
+ continue;
+
+ per_cpu(bank_map, cpu) |= (1 << bank);
+
+#ifdef CONFIG_SMP
+ if (shared_bank[bank] && cpu_core_id[cpu])
+ continue;
+#endif
+
+ setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20);
+ threshold_defaults.cpu = cpu;
+ threshold_defaults.bank = bank;
+ threshold_restart_bank(&threshold_defaults, 0, 0);
+ }
+}
+
+/*
+ * APIC Interrupt Handler
+ */
+
+/*
+ * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
+ * the interrupt goes off when error_count reaches threshold_limit.
+ * the handler will simply log mcelog w/ software defined bank number.
+ */
+asmlinkage void mce_threshold_interrupt(void)
+{
+ int bank;
+ struct mce m;
+
+ ack_APIC_irq();
+ irq_enter();
+
+ memset(&m, 0, sizeof(m));
+ rdtscll(m.tsc);
+ m.cpu = smp_processor_id();
+
+ /* assume first bank caused it */
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ m.bank = MCE_THRESHOLD_BASE + bank;
+ rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc);
+
+ if (m.misc & MASK_OVERFLOW) {
+ mce_log(&m);
+ goto out;
+ }
+ }
+ out:
+ irq_exit();
+}
+
+/*
+ * Sysfs Interface
+ */
+
+static struct sysdev_class threshold_sysclass = {
+ set_kset_name("threshold"),
+};
+
+static DEFINE_PER_CPU(struct sys_device, device_threshold);
+
+struct threshold_attr {
+ struct attribute attr;
+ ssize_t(*show) (struct threshold_bank *, char *);
+ ssize_t(*store) (struct threshold_bank *, const char *, size_t count);
+};
+
+static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+
+static cpumask_t affinity_set(unsigned int cpu)
+{
+ cpumask_t oldmask = current->cpus_allowed;
+ cpumask_t newmask = CPU_MASK_NONE;
+ cpu_set(cpu, newmask);
+ set_cpus_allowed(current, newmask);
+ return oldmask;
+}
+
+static void affinity_restore(cpumask_t oldmask)
+{
+ set_cpus_allowed(current, oldmask);
+}
+
+#define SHOW_FIELDS(name) \
+ static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \
+ { \
+ return sprintf(buf, "%lx\n", (unsigned long) b->name); \
+ }
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t store_interrupt_enable(struct threshold_bank *b,
+ const char *buf, size_t count)
+{
+ char *end;
+ cpumask_t oldmask;
+ unsigned long new = simple_strtoul(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ b->interrupt_enable = !!new;
+
+ oldmask = affinity_set(b->cpu);
+ threshold_restart_bank(b, 0, 0);
+ affinity_restore(oldmask);
+
+ return end - buf;
+}
+
+static ssize_t store_threshold_limit(struct threshold_bank *b,
+ const char *buf, size_t count)
+{
+ char *end;
+ cpumask_t oldmask;
+ u16 old;
+ unsigned long new = simple_strtoul(buf, &end, 0);
+ if (end == buf)
+ return -EINVAL;
+ if (new > THRESHOLD_MAX)
+ new = THRESHOLD_MAX;
+ if (new < 1)
+ new = 1;
+ old = b->threshold_limit;
+ b->threshold_limit = new;
+
+ oldmask = affinity_set(b->cpu);
+ threshold_restart_bank(b, 0, old);
+ affinity_restore(oldmask);
+
+ return end - buf;
+}
+
+static ssize_t show_error_count(struct threshold_bank *b, char *buf)
+{
+ u32 high, low;
+ cpumask_t oldmask;
+ oldmask = affinity_set(b->cpu);
+ rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */
+ affinity_restore(oldmask);
+ return sprintf(buf, "%x\n",
+ (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+}
+
+static ssize_t store_error_count(struct threshold_bank *b,
+ const char *buf, size_t count)
+{
+ cpumask_t oldmask;
+ oldmask = affinity_set(b->cpu);
+ threshold_restart_bank(b, 1, 0);
+ affinity_restore(oldmask);
+ return 1;
+}
+
+#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+};
+
+#define ATTR_FIELDS(name) \
+ static struct threshold_attr name = \
+ THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
+
+ATTR_FIELDS(interrupt_enable);
+ATTR_FIELDS(threshold_limit);
+ATTR_FIELDS(error_count);
+
+static struct attribute *default_attrs[] = {
+ &interrupt_enable.attr,
+ &threshold_limit.attr,
+ &error_count.attr,
+ NULL
+};
+
+#define to_bank(k) container_of(k,struct threshold_bank,kobj)
+#define to_attr(a) container_of(a,struct threshold_attr,attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct threshold_bank *b = to_bank(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+ ret = a->show ? a->show(b, buf) : -EIO;
+ return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct threshold_bank *b = to_bank(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+ ret = a->store ? a->store(b, buf, count) : -EIO;
+ return ret;
+}
+
+static struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+};
+
+static struct kobj_type threshold_ktype = {
+ .sysfs_ops = &threshold_ops,
+ .default_attrs = default_attrs,
+};
+
+/* symlinks sibling shared banks to first core. first core owns dir/files. */
+static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
+{
+ int err = 0;
+ struct threshold_bank *b = 0;
+
+#ifdef CONFIG_SMP
+ if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */
+ char name[16];
+ unsigned lcpu = first_cpu(cpu_core_map[cpu]);
+ if (cpu_core_id[lcpu])
+ goto out; /* first core not up yet */
+
+ b = per_cpu(threshold_banks, lcpu)[bank];
+ if (!b)
+ goto out;
+ sprintf(name, "bank%i", bank);
+ err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj,
+ &b->kobj, name);
+ if (err)
+ goto out;
+ per_cpu(threshold_banks, cpu)[bank] = b;
+ goto out;
+ }
+#endif
+
+ b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+ if (!b) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memset(b, 0, sizeof(struct threshold_bank));
+
+ b->cpu = cpu;
+ b->bank = bank;
+ b->interrupt_enable = 0;
+ b->threshold_limit = THRESHOLD_MAX;
+ kobject_set_name(&b->kobj, "bank%i", bank);
+ b->kobj.parent = &per_cpu(device_threshold, cpu).kobj;
+ b->kobj.ktype = &threshold_ktype;
+
+ err = kobject_register(&b->kobj);
+ if (err) {
+ kfree(b);
+ goto out;
+ }
+ per_cpu(threshold_banks, cpu)[bank] = b;
+ out:
+ return err;
+}
+
+/* create dir/files for all valid threshold banks */
+static __cpuinit int threshold_create_device(unsigned int cpu)
+{
+ int bank;
+ int err = 0;
+
+ per_cpu(device_threshold, cpu).id = cpu;
+ per_cpu(device_threshold, cpu).cls = &threshold_sysclass;
+ err = sysdev_register(&per_cpu(device_threshold, cpu));
+ if (err)
+ goto out;
+
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ err = threshold_create_bank(cpu, bank);
+ if (err)
+ goto out;
+ }
+ out:
+ return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * let's be hotplug friendly.
+ * in case of multiple core processors, the first core always takes ownership
+ * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
+ */
+
+/* cpu hotplug call removes all symlinks before first core dies */
+static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
+{
+ struct threshold_bank *b;
+ char name[16];
+
+ b = per_cpu(threshold_banks, cpu)[bank];
+ if (!b)
+ return;
+ if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
+ sprintf(name, "bank%i", bank);
+ sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name);
+ per_cpu(threshold_banks, cpu)[bank] = 0;
+ } else {
+ kobject_unregister(&b->kobj);
+ kfree(per_cpu(threshold_banks, cpu)[bank]);
+ }
+}
+
+static __cpuinit void threshold_remove_device(unsigned int cpu)
+{
+ int bank;
+
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ threshold_remove_bank(cpu, bank);
+ }
+ sysdev_unregister(&per_cpu(device_threshold, cpu));
+}
+
+/* link all existing siblings when first core comes up */
+static __cpuinit int threshold_create_symlinks(unsigned int cpu)
+{
+ int bank, err = 0;
+ unsigned int lcpu = 0;
+
+ if (cpu_core_id[cpu])
+ return 0;
+ for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
+ if (lcpu == cpu)
+ continue;
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ if (!shared_bank[bank])
+ continue;
+ err = threshold_create_bank(lcpu, bank);
+ }
+ }
+ return err;
+}
+
+/* remove all symlinks before first core dies. */
+static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
+{
+ int bank;
+ unsigned int lcpu = 0;
+ if (cpu_core_id[cpu])
+ return;
+ for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
+ if (lcpu == cpu)
+ continue;
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ continue;
+ if (!shared_bank[bank])
+ continue;
+ threshold_remove_bank(lcpu, bank);
+ }
+ }
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+static __cpuinit void threshold_create_symlinks(unsigned int cpu)
+{
+}
+static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
+{
+}
+static void threshold_remove_device(unsigned int cpu)
+{
+}
+#endif
+
+/* get notified when a cpu comes on/off */
+static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ /* cpu was unsigned int to begin with */
+ unsigned int cpu = (unsigned long)hcpu;
+
+ if (cpu >= NR_CPUS)
+ goto out;
+
+ switch (action) {
+ case CPU_ONLINE:
+ threshold_create_device(cpu);
+ threshold_create_symlinks(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ threshold_remove_symlinks(cpu);
+ break;
+ case CPU_DOWN_FAILED:
+ threshold_create_symlinks(cpu);
+ break;
+ case CPU_DEAD:
+ threshold_remove_device(cpu);
+ break;
+ default:
+ break;
+ }
+ out:
+ return NOTIFY_OK;
+}
+
+static struct notifier_block threshold_cpu_notifier = {
+ .notifier_call = threshold_cpu_callback,
+};
+
+static __init int threshold_init_device(void)
+{
+ int err;
+ int lcpu = 0;
+
+ err = sysdev_class_register(&threshold_sysclass);
+ if (err)
+ goto out;
+
+ /* to hit CPUs online before the notifier is up */
+ for_each_online_cpu(lcpu) {
+ err = threshold_create_device(lcpu);
+ if (err)
+ goto out;
+ }
+ register_cpu_notifier(&threshold_cpu_notifier);
+
+ out:
+ return err;
+}
+
+device_initcall(threshold_init_device);
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index f16d38d09da..1105250bf02 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -42,7 +42,7 @@ int acpi_found_madt;
* Various Linux-internal data structures created from the
* MP-table.
*/
-int apic_version [MAX_APICS];
+unsigned char apic_version [MAX_APICS];
unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
@@ -65,7 +65,9 @@ unsigned long mp_lapic_addr = 0;
/* Processor that is doing the boot up */
unsigned int boot_cpu_id = -1U;
/* Internal processor count */
-static unsigned int num_processors = 0;
+unsigned int num_processors __initdata = 0;
+
+unsigned disabled_cpus __initdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
@@ -106,11 +108,14 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __init MP_processor_info (struct mpc_config_processor *m)
{
- int ver, cpu;
+ int cpu;
+ unsigned char ver;
static int found_bsp=0;
- if (!(m->mpc_cpuflag & CPU_ENABLED))
+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+ disabled_cpus++;
return;
+ }
printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
m->mpc_apicid,
@@ -129,12 +134,14 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
}
cpu = num_processors++;
-
- if (m->mpc_apicid > MAX_APICS) {
+
+#if MAX_APICS < 255
+ if ((int)m->mpc_apicid > MAX_APICS) {
printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
m->mpc_apicid, MAX_APICS);
return;
}
+#endif
ver = m->mpc_apicver;
physid_set(m->mpc_apicid, phys_cpu_present_map);
@@ -218,7 +225,7 @@ static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ if (++mp_irq_entries >= MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
@@ -549,7 +556,7 @@ void __init get_smp_config (void)
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) {
+ if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
smp_found_config = 0;
printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index cf0a0315d58..2e28e855ec3 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -187,7 +187,7 @@ static void flush_gart(struct device *dev)
/* Allocate DMA memory on node near device */
noinline
-static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
+static void *dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
struct page *page;
int node;
@@ -204,7 +204,7 @@ static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
*/
void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
- unsigned gfp)
+ gfp_t gfp)
{
void *memory;
unsigned long dma_mask = 0;
@@ -220,6 +220,12 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
uses the normal dma_mask for alloc_coherent. */
dma_mask &= *dev->dma_mask;
+ /* Why <=? Even when the mask is smaller than 4GB it is often larger
+ than 16MB and in this case we have a chance of finding fitting memory
+ in the next higher zone first. If not retry with true GFP_DMA. -AK */
+ if (dma_mask <= 0xffffffff)
+ gfp |= GFP_DMA32;
+
again:
memory = dma_alloc_pages(dev, gfp, get_order(size));
if (memory == NULL)
@@ -245,7 +251,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
if (!(gfp & GFP_DMA)) {
- gfp |= GFP_DMA;
+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
return NULL;
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 67d90b89af0..5a981dca87f 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -24,7 +24,7 @@ EXPORT_SYMBOL(iommu_sac_force);
*/
void *dma_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, unsigned gfp)
+ dma_addr_t *dma_handle, gfp_t gfp)
{
void *ret;
u64 mask;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index b5a89c0bdf5..7519fc520eb 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -86,12 +86,22 @@ EXPORT_SYMBOL(enable_hlt);
*/
void default_idle(void)
{
+ local_irq_enable();
+
if (!atomic_read(&hlt_counter)) {
- local_irq_disable();
- if (!need_resched())
- safe_halt();
- else
- local_irq_enable();
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ smp_mb__after_clear_bit();
+ while (!need_resched()) {
+ local_irq_disable();
+ if (!need_resched())
+ safe_halt();
+ else
+ local_irq_enable();
+ }
+ set_thread_flag(TIF_POLLING_NRFLAG);
+ } else {
+ while (!need_resched())
+ cpu_relax();
}
}
@@ -102,30 +112,16 @@ void default_idle(void)
*/
static void poll_idle (void)
{
- int oldval;
-
local_irq_enable();
- /*
- * Deal with another CPU just having chosen a thread to
- * run here:
- */
- oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
- if (!oldval) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- asm volatile(
- "2:"
- "testl %0,%1;"
- "rep; nop;"
- "je 2b;"
- : :
- "i" (_TIF_NEED_RESCHED),
- "m" (current_thread_info()->flags));
- clear_thread_flag(TIF_POLLING_NRFLAG);
- } else {
- set_need_resched();
- }
+ asm volatile(
+ "2:"
+ "testl %0,%1;"
+ "rep; nop;"
+ "je 2b;"
+ : :
+ "i" (_TIF_NEED_RESCHED),
+ "m" (current_thread_info()->flags));
}
void cpu_idle_wait(void)
@@ -148,7 +144,8 @@ void cpu_idle_wait(void)
do {
ssleep(1);
for_each_online_cpu(cpu) {
- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+ if (cpu_isset(cpu, map) &&
+ !per_cpu(cpu_idle_state, cpu))
cpu_clear(cpu, map);
}
cpus_and(map, map, cpu_online_map);
@@ -187,6 +184,8 @@ static inline void play_dead(void)
*/
void cpu_idle (void)
{
+ set_thread_flag(TIF_POLLING_NRFLAG);
+
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
@@ -204,7 +203,9 @@ void cpu_idle (void)
idle();
}
+ preempt_enable_no_resched();
schedule();
+ preempt_disable();
}
}
@@ -219,15 +220,12 @@ static void mwait_idle(void)
{
local_irq_enable();
- if (!need_resched()) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- do {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- if (need_resched())
- break;
- __mwait(0, 0);
- } while (!need_resched());
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ while (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (need_resched())
+ break;
+ __mwait(0, 0);
}
}
@@ -278,7 +276,8 @@ void __show_regs(struct pt_regs * regs)
system_utsname.version);
printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
printk_address(regs->rip);
- printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
+ printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+ regs->eflags);
printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -352,13 +351,6 @@ void flush_thread(void)
struct task_struct *tsk = current;
struct thread_info *t = current_thread_info();
- /*
- * Remove function-return probe instances associated with this task
- * and put them back on the free list. Do not insert an exit probe for
- * this function, it will be disabled by kprobe_flush_task if you do.
- */
- kprobe_flush_task(tsk);
-
if (t->flags & _TIF_ABI_PENDING)
t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
@@ -430,15 +422,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
struct pt_regs * childregs;
struct task_struct *me = current;
- childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
-
+ childregs = ((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
*childregs = *regs;
childregs->rax = 0;
childregs->rsp = rsp;
- if (rsp == ~0UL) {
+ if (rsp == ~0UL)
childregs->rsp = (unsigned long)childregs;
- }
p->thread.rsp = (unsigned long) childregs;
p->thread.rsp0 = (unsigned long) (childregs+1);
@@ -460,7 +451,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES);
}
/*
@@ -497,7 +489,8 @@ out:
* - fold all the options into a flag word and test it with a single test.
* - could test fs/gs bitsliced
*/
-struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
@@ -568,7 +561,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
prev->userrsp = read_pda(oldrsp);
write_pda(oldrsp, next->userrsp);
write_pda(pcurrent, next_p);
- write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
+ write_pda(kernelstack,
+ (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
/*
* Now maybe reload the debug registers
@@ -649,7 +643,9 @@ asmlinkage long sys_fork(struct pt_regs *regs)
return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
-asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
if (!newsp)
newsp = regs->rsp;
@@ -685,7 +681,8 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
fp = *(u64 *)(p->thread.rsp);
do {
- if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
+ if (fp < (unsigned long)stack ||
+ fp > (unsigned long)stack+THREAD_SIZE)
return 0;
rip = *(u64 *)(fp+8);
if (!in_sched_functions(rip))
@@ -720,8 +717,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
- load_gs_index(0);
- ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+ load_gs_index(0);
+ ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
}
}
put_cpu();
@@ -738,7 +735,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
- asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
+ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
@@ -748,8 +745,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
- asm volatile("movl %0,%%fs" :: "r" (0));
- ret = checking_wrmsrl(MSR_FS_BASE, addr);
+ asm volatile("movl %0,%%fs" :: "r" (0));
+ ret = checking_wrmsrl(MSR_FS_BASE, addr);
}
}
put_cpu();
@@ -758,9 +755,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
unsigned long base;
if (task->thread.fsindex == FS_TLS_SEL)
base = read_32bit_tls(task, FS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_FS_BASE, base);
- } else
+ else
base = task->thread.fs;
ret = put_user(base, (unsigned long __user *)addr);
break;
@@ -769,9 +766,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
unsigned long base;
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
- else if (doit) {
+ else if (doit)
rdmsrl(MSR_KERNEL_GS_BASE, base);
- } else
+ else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index bbf64b59a21..a87b6cebe80 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -313,48 +313,11 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
}
-asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
{
- struct task_struct *child;
long i, ret;
unsigned ui;
- /* This lock_kernel fixes a subtle race with suid exec */
- lock_kernel();
- ret = -EPERM;
- if (request == PTRACE_TRACEME) {
- /* are we already being traced? */
- if (current->ptrace & PT_PTRACED)
- goto out;
- ret = security_ptrace(current->parent, current);
- if (ret)
- goto out;
- /* set the ptrace bit in the process flags. */
- current->ptrace |= PT_PTRACED;
- ret = 0;
- goto out;
- }
- ret = -ESRCH;
- read_lock(&tasklist_lock);
- child = find_task_by_pid(pid);
- if (child)
- get_task_struct(child);
- read_unlock(&tasklist_lock);
- if (!child)
- goto out;
-
- ret = -EPERM;
- if (pid == 1) /* you may not mess with init */
- goto out_tsk;
-
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
- goto out_tsk;
- }
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
- if (ret < 0)
- goto out_tsk;
-
switch (request) {
/* when I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
@@ -608,10 +571,6 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data
ret = ptrace_request(child, request, addr, data);
break;
}
-out_tsk:
- put_task_struct(child);
-out:
- unlock_kernel();
return ret;
}
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 47f95687905..75235ed2b31 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -77,6 +77,7 @@ static inline void kb_wait(void)
void machine_shutdown(void)
{
+ unsigned long flags;
/* Stop the cpus and apics */
#ifdef CONFIG_SMP
int reboot_cpu_id;
@@ -98,7 +99,7 @@ void machine_shutdown(void)
smp_send_stop();
#endif
- local_irq_disable();
+ local_irq_save(flags);
#ifndef CONFIG_SMP
disable_local_APIC();
@@ -106,7 +107,7 @@ void machine_shutdown(void)
disable_IO_APIC();
- local_irq_enable();
+ local_irq_restore(flags);
}
void machine_emergency_restart(void)
@@ -120,7 +121,7 @@ void machine_emergency_restart(void)
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
case BOOT_KBD:
- for (i=0; i<100; i++) {
+ for (i=0; i<10; i++) {
kb_wait();
udelay(50);
outb(0xfe,0x64); /* pulse reset low */
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 238f73e1a83..750e01dcbdf 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -60,6 +60,7 @@
#include <asm/setup.h>
#include <asm/mach_apic.h>
#include <asm/numa.h>
+#include <asm/sections.h>
/*
* Machine setup..
@@ -103,7 +104,6 @@ struct edid_info edid_info;
struct e820map e820;
extern int root_mountflags;
-extern char _text, _etext, _edata, _end;
char command_line[COMMAND_LINE_SIZE];
@@ -412,7 +412,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long bootmap_size, bootmap;
- memory_present(0, start_pfn, end_pfn);
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
if (bootmap == -1L)
@@ -571,6 +570,8 @@ void __init setup_arch(char **cmdline_p)
init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+ zap_low_mappings(0);
+
#ifdef CONFIG_ACPI
/*
* Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
@@ -657,8 +658,6 @@ void __init setup_arch(char **cmdline_p)
}
#endif
- sparse_init();
-
paging_init();
check_ioapic();
@@ -793,7 +792,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
bits = 0;
- while ((1 << bits) < c->x86_num_cores)
+ while ((1 << bits) < c->x86_max_cores)
bits++;
/* Low order bits define the core id (index of core in socket) */
@@ -823,16 +822,14 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
if (!node_online(node))
node = nearby_node(apicid);
}
- cpu_to_node[cpu] = node;
+ numa_set_node(cpu, node);
printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
- cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
+ cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
#endif
#endif
}
-#define HWCR 0xc0010015
-
static int __init init_amd(struct cpuinfo_x86 *c)
{
int r;
@@ -841,14 +838,18 @@ static int __init init_amd(struct cpuinfo_x86 *c)
#ifdef CONFIG_SMP
unsigned long value;
- // Disable TLB flush filter by setting HWCR.FFDIS:
- // bit 6 of msr C001_0015
- //
- // Errata 63 for SH-B3 steppings
- // Errata 122 for all(?) steppings
- rdmsrl(HWCR, value);
- value |= 1 << 6;
- wrmsrl(HWCR, value);
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 15) {
+ rdmsrl(MSR_K8_HWCR, value);
+ value |= 1 << 6;
+ wrmsrl(MSR_K8_HWCR, value);
+ }
#endif
/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
@@ -873,9 +874,9 @@ static int __init init_amd(struct cpuinfo_x86 *c)
display_cacheinfo(c);
if (c->extended_cpuid_level >= 0x80000008) {
- c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
- if (c->x86_num_cores & (c->x86_num_cores - 1))
- c->x86_num_cores = 1;
+ c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+ if (c->x86_max_cores & (c->x86_max_cores - 1))
+ c->x86_max_cores = 1;
amd_detect_cmp(c);
}
@@ -887,54 +888,44 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
- int index_msb, tmp;
+ int index_msb, core_bits;
int cpu = smp_processor_id();
-
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+ c->apicid = phys_pkg_id(0);
+
if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
- cpuid(1, &eax, &ebx, &ecx, &edx);
smp_num_siblings = (ebx & 0xff0000) >> 16;
-
+
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1) {
- index_msb = 31;
- /*
- * At this point we only support two siblings per
- * processor package.
- */
+ } else if (smp_num_siblings > 1 ) {
+
if (smp_num_siblings > NR_CPUS) {
printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
smp_num_siblings = 1;
return;
}
- tmp = smp_num_siblings;
- while ((tmp & 0x80000000 ) == 0) {
- tmp <<=1 ;
- index_msb--;
- }
- if (smp_num_siblings & (smp_num_siblings - 1))
- index_msb++;
+
+ index_msb = get_count_order(smp_num_siblings);
phys_proc_id[cpu] = phys_pkg_id(index_msb);
-
+
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
phys_proc_id[cpu]);
- smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
- tmp = smp_num_siblings;
- index_msb = 31;
- while ((tmp & 0x80000000) == 0) {
- tmp <<=1 ;
- index_msb--;
- }
- if (smp_num_siblings & (smp_num_siblings - 1))
- index_msb++;
+ index_msb = get_count_order(smp_num_siblings) ;
+
+ core_bits = get_count_order(c->x86_max_cores);
- cpu_core_id[cpu] = phys_pkg_id(index_msb);
+ cpu_core_id[cpu] = phys_pkg_id(index_msb) &
+ ((1 << core_bits) - 1);
- if (c->x86_num_cores > 1)
+ if (c->x86_max_cores > 1)
printk(KERN_INFO "CPU: Processor Core ID: %d\n",
cpu_core_id[cpu]);
}
@@ -965,16 +956,15 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
static void srat_detect_node(void)
{
#ifdef CONFIG_NUMA
- unsigned apicid, node;
+ unsigned node;
int cpu = smp_processor_id();
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
- apicid = phys_proc_id[cpu];
- node = apicid_to_node[apicid];
+ node = apicid_to_node[hard_smp_processor_id()];
if (node == NUMA_NO_NODE)
node = 0;
- cpu_to_node[cpu] = node;
+ numa_set_node(cpu, node);
if (acpi_numa > 0)
printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
@@ -992,13 +982,18 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
unsigned eax = cpuid_eax(0x80000008);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+ /* CPUID workaround for Intel 0F34 CPU */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ c->x86 == 0xF && c->x86_model == 0x3 &&
+ c->x86_mask == 0x4)
+ c->x86_phys_bits = 36;
}
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
if (c->x86 >= 15)
set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
- c->x86_num_cores = intel_num_cpu_cores(c);
+ c->x86_max_cores = intel_num_cpu_cores(c);
srat_detect_node();
}
@@ -1036,7 +1031,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_clflush_size = 64;
c->x86_cache_alignment = c->x86_clflush_size;
- c->x86_num_cores = 1;
+ c->x86_max_cores = 1;
c->extended_cpuid_level = 0;
memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -1059,10 +1054,10 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86 = (tfms >> 8) & 0xf;
c->x86_model = (tfms >> 4) & 0xf;
c->x86_mask = tfms & 0xf;
- if (c->x86 == 0xf) {
+ if (c->x86 == 0xf)
c->x86 += (tfms >> 20) & 0xff;
+ if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
- }
if (c->x86_capability[0] & (1<<19))
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
} else {
@@ -1212,7 +1207,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* Intel-defined (#2) */
- "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est",
+ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est",
"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -1270,13 +1265,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
#ifdef CONFIG_SMP
- if (smp_num_siblings * c->x86_num_cores > 1) {
+ if (smp_num_siblings * c->x86_max_cores > 1) {
int cpu = c - cpu_data;
seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
- seq_printf(m, "siblings\t: %d\n",
- c->x86_num_cores * smp_num_siblings);
+ seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
- seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
}
#endif
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index bd33be24a38..06dc354375c 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -87,6 +87,10 @@ void __init setup_per_cpu_areas(void)
int i;
unsigned long size;
+#ifdef CONFIG_HOTPLUG_CPU
+ prefill_possible_map();
+#endif
+
/* Copy section for each CPU (we discard the original) */
size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
#ifdef CONFIG_MODULES
@@ -137,7 +141,6 @@ void pda_init(int cpu)
panic("cannot allocate irqstack for cpu %d", cpu);
}
- asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
pda->irqstackptr += IRQSTACKSIZE-64;
}
@@ -193,6 +196,7 @@ void __cpuinit cpu_init (void)
/* CPU 0 is initialised in head64.c */
if (cpu != 0) {
pda_init(cpu);
+ zap_low_mappings(cpu);
} else
estacks = boot_exception_stacks;
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index d642fbf3da2..5876df116c9 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -110,6 +110,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
COPY(r14);
COPY(r15);
+ /* Kernel saves and restores only the CS segment register on signals,
+ * which is the bare minimum needed to allow mixed 32/64-bit code.
+ * App's signal handler can save/restore other segments if needed. */
+ {
+ unsigned cs;
+ err |= __get_user(cs, &sc->cs);
+ regs->cs = cs | 3; /* Force into user mode */
+ }
+
{
unsigned int tmpflags;
err |= __get_user(tmpflags, &sc->eflags);
@@ -187,6 +196,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
{
int err = 0;
+ err |= __put_user(regs->cs, &sc->cs);
err |= __put_user(0, &sc->gs);
err |= __put_user(0, &sc->fs);
@@ -318,7 +328,14 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->rsp = (unsigned long)frame;
+ /* Set up the CS register to run signal handlers in 64-bit mode,
+ even if the handler happens to be interrupting 32-bit code. */
+ regs->cs = __USER_CS;
+
+ /* This, by contrast, has nothing to do with segment registers -
+ see include/asm-x86_64/uaccess.h for details. */
set_fs(USER_DS);
+
regs->eflags &= ~TF_MASK;
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 9db9dda161b..cfc3d9dccbd 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -28,8 +28,6 @@
#include <asm/proto.h>
#include <asm/apicdef.h>
-#define __cpuinit __init
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
@@ -452,13 +450,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
void smp_stop_cpu(void)
{
+ unsigned long flags;
/*
* Remove this CPU:
*/
cpu_clear(smp_processor_id(), cpu_online_map);
- local_irq_disable();
+ local_irq_save(flags);
disable_local_APIC();
- local_irq_enable();
+ local_irq_restore(flags);
}
static void smp_really_stop_cpu(void *dummy)
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index e12d7baeb33..683c33f7b96 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -64,9 +64,8 @@
int smp_num_siblings = 1;
/* Package ID of each logical CPU */
u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+/* core ID of each logical CPU */
u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
-EXPORT_SYMBOL(phys_proc_id);
-EXPORT_SYMBOL(cpu_core_id);
/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
@@ -89,7 +88,10 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
/* Set when the idlers are all forked */
int smp_threads_ready;
+/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+
+/* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_core_map);
@@ -436,30 +438,59 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
static inline void set_cpu_sibling_map(int cpu)
{
int i;
+ struct cpuinfo_x86 *c = cpu_data;
+
+ cpu_set(cpu, cpu_sibling_setup_map);
if (smp_num_siblings > 1) {
- for_each_cpu(i) {
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i] &&
+ cpu_core_id[cpu] == cpu_core_id[i]) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
- if (current_cpu_data.x86_num_cores > 1) {
- for_each_cpu(i) {
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
- cpu_set(i, cpu_core_map[cpu]);
- cpu_set(cpu, cpu_core_map[i]);
- }
- }
- } else {
+ if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ c[cpu].booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ /*
+ * Does this new cpu bringup a new core?
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (first_cpu(cpu_sibling_map[i]) == i)
+ c[cpu].booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ c[i].booted_cores++;
+ } else if (i != cpu && !c[cpu].booted_cores)
+ c[cpu].booted_cores = c[i].booted_cores;
+ }
}
}
@@ -474,6 +505,7 @@ void __cpuinit start_secondary(void)
* things done here to the most necessary things.
*/
cpu_init();
+ preempt_disable();
smp_callin();
/* otherwise gcc will move up the smp_processor_id before the cpu_init */
@@ -880,6 +912,9 @@ static __init void disable_smp(void)
}
#ifdef CONFIG_HOTPLUG_CPU
+
+int additional_cpus __initdata = -1;
+
/*
* cpu_possible_map should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
@@ -888,14 +923,38 @@ static __init void disable_smp(void)
* cpu_present_map on the other hand can change dynamically.
* In case when cpu_hotplug is not compiled, then we resort to current
* behaviour, which is cpu_possible == cpu_present.
- * If cpu-hotplug is supported, then we need to preallocate for all
- * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
* - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - otherwise use half of the available CPUs or 2, whatever is more.
+ * - The user can overwrite it with additional_cpus=NUM
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
*/
-static void prefill_possible_map(void)
+__init void prefill_possible_map(void)
{
int i;
- for (i = 0; i < NR_CPUS; i++)
+ int possible;
+
+ if (additional_cpus == -1) {
+ if (disabled_cpus > 0) {
+ additional_cpus = disabled_cpus;
+ } else {
+ additional_cpus = num_processors / 2;
+ if (additional_cpus == 0)
+ additional_cpus = 2;
+ }
+ }
+ possible = num_processors + additional_cpus;
+ if (possible > NR_CPUS)
+ possible = NR_CPUS;
+
+ printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+ possible,
+ max_t(int, possible - num_processors, 0));
+
+ for (i = 0; i < possible; i++)
cpu_set(i, cpu_possible_map);
}
#endif
@@ -966,10 +1025,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
nmi_watchdog_default();
current_cpu_data = boot_cpu_data;
current_thread_info()->cpu = 0; /* needed? */
-
-#ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
-#endif
+ set_cpu_sibling_map(0);
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1013,8 +1069,6 @@ void __init smp_prepare_boot_cpu(void)
int me = smp_processor_id();
cpu_set(me, cpu_online_map);
cpu_set(me, cpu_callout_map);
- cpu_set(0, cpu_sibling_map[0]);
- cpu_set(0, cpu_core_map[0]);
per_cpu(cpu_state, me) = CPU_ONLINE;
}
@@ -1067,9 +1121,6 @@ int __cpuinit __cpu_up(unsigned int cpu)
*/
void __init smp_cpus_done(unsigned int max_cpus)
{
-#ifndef CONFIG_HOTPLUG_CPU
- zap_low_mappings();
-#endif
smp_cleanup_boot();
#ifdef CONFIG_X86_IO_APIC
@@ -1086,15 +1137,24 @@ void __init smp_cpus_done(unsigned int max_cpus)
static void remove_siblinginfo(int cpu)
{
int sibling;
+ struct cpuinfo_x86 *c = cpu_data;
+ for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ /*
+ * last thread sibling in this cpu core going down
+ */
+ if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+ c[sibling].booted_cores--;
+ }
+
for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
cpu_clear(cpu, cpu_sibling_map[sibling]);
- for_each_cpu_mask(sibling, cpu_core_map[cpu])
- cpu_clear(cpu, cpu_core_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
phys_proc_id[cpu] = BAD_APICID;
cpu_core_id[cpu] = BAD_APICID;
+ cpu_clear(cpu, cpu_sibling_setup_map);
}
void remove_cpu_from_maps(void)
@@ -1158,6 +1218,12 @@ void __cpu_die(unsigned int cpu)
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
+static __init int setup_additional_cpus(char *s)
+{
+ return get_option(&s, &additional_cpus);
+}
+__setup("additional_cpus=", setup_additional_cpus);
+
#else /* ... !CONFIG_HOTPLUG_CPU */
int __cpu_disable(void)
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebb9abf3ce6..fd2bef78088 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -11,6 +11,8 @@
#include <linux/smp.h>
#include <linux/suspend.h>
#include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
struct saved_context saved_context;
@@ -61,13 +63,12 @@ void save_processor_state(void)
__save_processor_state(&saved_context);
}
-static void
-do_fpu_end(void)
+static void do_fpu_end(void)
{
- /* restore FPU regs if necessary */
- /* Do it out of line so that gcc does not move cr0 load to some stupid place */
- kernel_fpu_end();
- mxcsr_feature_mask_init();
+ /*
+ * Restore FPU regs if necessary
+ */
+ kernel_fpu_end();
}
void __restore_processor_state(struct saved_context *ctxt)
@@ -140,4 +141,83 @@ void fix_processor_context(void)
}
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/* Defined in arch/x86_64/kernel/suspend_asm.S */
+extern int restore_image(void);
+
+pgd_t *temp_level4_pgt;
+
+static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+ long i, j;
+
+ i = pud_index(address);
+ pud = pud + i;
+ for (; i < PTRS_PER_PUD; pud++, i++) {
+ unsigned long paddr;
+ pmd_t *pmd;
+
+ paddr = address + i*PUD_SIZE;
+ if (paddr >= end)
+ break;
+
+ pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+ unsigned long pe;
+
+ if (paddr >= end)
+ break;
+ pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+ pe &= __supported_pte_mask;
+ set_pmd(pmd, __pmd(pe));
+ }
+ }
+ return 0;
+}
+
+static int set_up_temporary_mappings(void)
+{
+ unsigned long start, end, next;
+ int error;
+
+ temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
+ if (!temp_level4_pgt)
+ return -ENOMEM;
+
+ /* It is safe to reuse the original kernel mapping */
+ set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+ init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+ /* Set up the direct mapping from scratch */
+ start = (unsigned long)pfn_to_kaddr(0);
+ end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+ for (; start < end; start = next) {
+ pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+ if (!pud)
+ return -ENOMEM;
+ next = start + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+ if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+ return error;
+ set_pgd(temp_level4_pgt + pgd_index(start),
+ mk_kernel_pgd(__pa(pud)));
+ }
+ return 0;
+}
+
+int swsusp_arch_resume(void)
+{
+ int error;
+ /* We have got enough memory and from now on we cannot recover */
+ if ((error = set_up_temporary_mappings()))
+ return error;
+ restore_image();
+ return 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 4d659e97df1..320b6fb00cc 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -39,12 +39,13 @@ ENTRY(swsusp_arch_suspend)
call swsusp_save
ret
-ENTRY(swsusp_arch_resume)
- /* set up cr3 */
- leaq init_level4_pgt(%rip),%rax
- subq $__START_KERNEL_map,%rax
- movq %rax,%cr3
-
+ENTRY(restore_image)
+ /* switch to temporary page tables */
+ movq $__PAGE_OFFSET, %rdx
+ movq temp_level4_pgt(%rip), %rax
+ subq %rdx, %rax
+ movq %rax, %cr3
+ /* Flush TLB */
movq mmu_cr4_features(%rip), %rax
movq %rax, %rdx
andq $~(1<<7), %rdx # PGE
@@ -69,6 +70,10 @@ loop:
movq pbe_next(%rdx), %rdx
jmp loop
done:
+ /* go back to the original page tables */
+ leaq init_level4_pgt(%rip), %rax
+ subq $__START_KERNEL_map, %rax
+ movq %rax, %cr3
/* Flush TLB, including "global" things (vmalloc) */
movq mmu_cr4_features(%rip), %rax
movq %rax, %rdx
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index cc7821c6885..6449ea8fe75 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -154,17 +154,3 @@ asmlinkage long sys_uname(struct new_utsname __user * name)
err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}
-
-asmlinkage long sys_time64(long __user * tloc)
-{
- struct timeval now;
- int i;
-
- do_gettimeofday(&now);
- i = now.tv_sec;
- if (tloc) {
- if (put_user(i,tloc))
- i = -EFAULT;
- }
- return i;
-}
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 2373cb8b862..fdaddc4e528 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -42,10 +42,6 @@
#include <asm/apic.h>
#endif
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
#ifdef CONFIG_CPU_FREQ
static void cpufreq_delayed_get(void);
#endif
@@ -481,9 +477,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
static unsigned int cyc2ns_scale;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
{
- cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+ cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
@@ -655,7 +651,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
vxtime.tsc_quot = (1000L << 32) / cpu_khz;
}
- set_cyc2ns_scale(cpu_khz_ref / 1000);
+ set_cyc2ns_scale(cpu_khz_ref);
return 0;
}
@@ -939,7 +935,7 @@ void __init time_init(void)
rdtscll_sync(&vxtime.last_tsc);
setup_irq(0, &irq0);
- set_cyc2ns_scale(cpu_khz / 1000);
+ set_cyc2ns_scale(cpu_khz);
#ifndef CONFIG_SMP
time_init_gtod();
@@ -959,9 +955,6 @@ static __init int unsynchronized_tsc(void)
are handled in the OEM check above. */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
return 0;
- /* All in a single socket - should be synchronized */
- if (cpus_weight(cpu_core_map[0]) == num_online_cpus())
- return 0;
#endif
/* Assume multi socket systems are not synchronized */
return num_online_cpus() > 1;
@@ -1096,6 +1089,7 @@ static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
static unsigned long PIE_count;
static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp; /* cached comparator register */
int is_hpet_enabled(void)
{
@@ -1132,10 +1126,12 @@ int hpet_rtc_timer_init(void)
cnt = hpet_readl(HPET_COUNTER);
cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
hpet_writel(cnt, HPET_T1_CMP);
+ hpet_t1_cmp = cnt;
local_irq_restore(flags);
cfg = hpet_readl(HPET_T1_CFG);
- cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
return 1;
@@ -1145,8 +1141,12 @@ static void hpet_rtc_timer_reinit(void)
{
unsigned int cfg, cnt;
- if (!(PIE_on | AIE_on | UIE_on))
+ if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
return;
+ }
if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
hpet_rtc_int_freq = PIE_freq;
@@ -1154,15 +1154,10 @@ static void hpet_rtc_timer_reinit(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
/* It is more accurate to use the comparator value than current count.*/
- cnt = hpet_readl(HPET_T1_CMP);
+ cnt = hpet_t1_cmp;
cnt += hpet_tick*HZ/hpet_rtc_int_freq;
hpet_writel(cnt, HPET_T1_CMP);
-
- cfg = hpet_readl(HPET_T1_CFG);
- cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T1_CFG);
-
- return;
+ hpet_t1_cmp = cnt;
}
/*
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index b5e09e6b553..bf337f49318 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -428,19 +428,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
{
conditional_sti(regs);
-#ifdef CONFIG_CHECKING
- {
- unsigned long gs;
- struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
- rdmsrl(MSR_GS_BASE, gs);
- if (gs != (unsigned long)pda) {
- wrmsrl(MSR_GS_BASE, pda);
- printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda,
- regs->rip);
- }
- }
-#endif
-
if (user_mode(regs)) {
struct task_struct *tsk = current;
@@ -513,20 +500,6 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
{
conditional_sti(regs);
-#ifdef CONFIG_CHECKING
- {
- unsigned long gs;
- struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
- rdmsrl(MSR_GS_BASE, gs);
- if (gs != (unsigned long)pda) {
- wrmsrl(MSR_GS_BASE, pda);
- oops_in_progress++;
- printk("general protection handler: wrong gs %lx expected %p\n", gs, pda);
- oops_in_progress--;
- }
- }
-#endif
-
if (user_mode(regs)) {
struct task_struct *tsk = current;
@@ -665,19 +638,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
struct task_struct *tsk = current;
siginfo_t info;
-#ifdef CONFIG_CHECKING
- {
- /* RED-PEN interaction with debugger - could destroy gs */
- unsigned long gs;
- struct x8664_pda *pda = cpu_pda + safe_smp_processor_id();
- rdmsrl(MSR_GS_BASE, gs);
- if (gs != (unsigned long)pda) {
- wrmsrl(MSR_GS_BASE, pda);
- printk("debug handler: wrong gs %lx expected %p\n", gs, pda);
- }
- }
-#endif
-
get_debugreg(condition, 6);
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
@@ -888,6 +848,10 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+{
+}
+
/*
* 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 6dd642cad2e..58b19215b4b 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -50,7 +50,7 @@ SECTIONS
*(.bss.page_aligned)
*(.bss)
}
- __bss_end = .;
+ __bss_stop = .;
. = ALIGN(PAGE_SIZE);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index fd99ddd009b..4a54221e10b 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -203,3 +203,6 @@ EXPORT_SYMBOL(flush_tlb_page);
#endif
EXPORT_SYMBOL(cpu_khz);
+
+EXPORT_SYMBOL(load_gs_index);
+