diff options
Diffstat (limited to 'arch/x86_64/kernel')
30 files changed, 1185 insertions, 460 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index bcdd0a805fe..fe4cbd1c4b2 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -11,6 +11,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o @@ -27,7 +28,6 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o -obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o @@ -41,7 +41,6 @@ CFLAGS_vsyscall.o := $(PROFILING) -g0 bootflag-y += ../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/mach-default/topology.o -swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 962ad4823b6..c7f4fdd20f0 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -196,7 +196,7 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) void __init iommu_hole_init(void) { int fix, num; - u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; + u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; int valid_agp = 0; diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index b6e7715d877..18691ce4c75 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -833,6 +833,16 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +#ifdef CONFIG_X86_MCE_AMD +void setup_threshold_lvt(unsigned long lvt_off) +{ + unsigned int v = 0; + unsigned long reg = (lvt_off << 4) + 0x500; + v |= THRESHOLD_APIC_VECTOR; + apic_write(reg, v); +} +#endif /* CONFIG_X86_MCE_AMD */ + #undef APIC_DIVISOR /* diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index ab3f87aaff7..17579a1a174 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -23,8 +23,7 @@ #include <asm/e820.h> #include <asm/proto.h> #include <asm/bootsetup.h> - -extern char _end[]; +#include <asm/sections.h> /* * PFN of last memory page. diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 7937971d185..9ff42041bb6 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -612,6 +612,9 @@ retint_kernel: ENTRY(thermal_interrupt) apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +ENTRY(threshold_interrupt) + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt + #ifdef CONFIG_SMP ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 4592bf21fca..15290968e49 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -12,6 +12,7 @@ #include <linux/linkage.h> #include <linux/threads.h> +#include <linux/init.h> #include <asm/desc.h> #include <asm/segment.h> #include <asm/page.h> @@ -70,7 +71,7 @@ startup_32: movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(init_level4_pgt - __START_KERNEL_map), %eax + movl $(boot_level4_pgt - __START_KERNEL_map), %eax movl %eax, %cr3 /* Setup EFER (Extended Feature Enable Register) */ @@ -113,7 +114,7 @@ startup_64: movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax + movq $(boot_level4_pgt - __START_KERNEL_map), %rax movq %rax, %cr3 /* Check if nx is implemented */ @@ -240,20 +241,10 @@ ljumpvector: ENTRY(stext) ENTRY(_stext) - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ .org 0x1000 ENTRY(init_level4_pgt) - .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ - .fill 255,8,0 - .quad 0x000000000000a007 + __PHYSICAL_START - .fill 254,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ + /* This gets initialized in x86_64_start_kernel */ + .fill 512,8,0 .org 0x2000 ENTRY(level3_ident_pgt) @@ -270,26 +261,26 @@ ENTRY(level3_kernel_pgt) .org 0x4000 ENTRY(level2_ident_pgt) /* 40MB for bootup. */ - .quad 0x0000000000000183 - .quad 0x0000000000200183 - .quad 0x0000000000400183 - .quad 0x0000000000600183 - .quad 0x0000000000800183 - .quad 0x0000000000A00183 - .quad 0x0000000000C00183 - .quad 0x0000000000E00183 - .quad 0x0000000001000183 - .quad 0x0000000001200183 - .quad 0x0000000001400183 - .quad 0x0000000001600183 - .quad 0x0000000001800183 - .quad 0x0000000001A00183 - .quad 0x0000000001C00183 - .quad 0x0000000001E00183 - .quad 0x0000000002000183 - .quad 0x0000000002200183 - .quad 0x0000000002400183 - .quad 0x0000000002600183 + .quad 0x0000000000000083 + .quad 0x0000000000200083 + .quad 0x0000000000400083 + .quad 0x0000000000600083 + .quad 0x0000000000800083 + .quad 0x0000000000A00083 + .quad 0x0000000000C00083 + .quad 0x0000000000E00083 + .quad 0x0000000001000083 + .quad 0x0000000001200083 + .quad 0x0000000001400083 + .quad 0x0000000001600083 + .quad 0x0000000001800083 + .quad 0x0000000001A00083 + .quad 0x0000000001C00083 + .quad 0x0000000001E00083 + .quad 0x0000000002000083 + .quad 0x0000000002200083 + .quad 0x0000000002400083 + .quad 0x0000000002600083 /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ .globl temp_boot_pmds temp_boot_pmds: @@ -350,6 +341,24 @@ ENTRY(wakeup_level4_pgt) .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif +#ifndef CONFIG_HOTPLUG_CPU + __INITDATA +#endif + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ + .align PAGE_SIZE +ENTRY(boot_level4_pgt) + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ + .fill 255,8,0 + .quad 0x000000000000a007 + __PHYSICAL_START + .fill 254,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ + .data .align 16 diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index cf6ab147a2a..b675c5add01 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -19,14 +19,15 @@ #include <asm/bootsetup.h> #include <asm/setup.h> #include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/sections.h> /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) { - extern char __bss_start[], __bss_end[]; memset(__bss_start, 0, - (unsigned long) __bss_end - (unsigned long) __bss_start); + (unsigned long) __bss_stop - (unsigned long) __bss_start); } #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ @@ -75,8 +76,6 @@ static void __init setup_boot_cpu_data(void) boot_cpu_data.x86_mask = eax & 0xf; } -extern char _end[]; - void __init x86_64_start_kernel(char * real_mode_data) { char *s; @@ -86,6 +85,13 @@ void __init x86_64_start_kernel(char * real_mode_data) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); clear_bss(); + + /* + * switch to init_level4_pgt from boot_level4_pgt + */ + memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); + pda_init(0); copy_bootdata(real_mode_data); #ifdef CONFIG_SMP diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index b2a238b5a17..6e5101ad3d1 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -492,9 +492,10 @@ void invalidate_interrupt5(void); void invalidate_interrupt6(void); void invalidate_interrupt7(void); void thermal_interrupt(void); +void threshold_interrupt(void); void i8254_timer_resume(void); -static void setup_timer(void) +static void setup_timer_hardware(void) { outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); @@ -505,17 +506,17 @@ static void setup_timer(void) static int timer_resume(struct sys_device *dev) { - setup_timer(); + setup_timer_hardware(); return 0; } void i8254_timer_resume(void) { - setup_timer(); + setup_timer_hardware(); } static struct sysdev_class timer_sysclass = { - set_kset_name("timer"), + set_kset_name("timer_pit"), .resume = timer_resume, }; @@ -580,6 +581,7 @@ void __init init_IRQ(void) set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); #endif set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ @@ -594,7 +596,7 @@ void __init init_IRQ(void) * Set the clock to HZ Hz, we already have a valid * vector now: */ - setup_timer(); + setup_timer_hardware(); if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index c8eee20cd51..97154ab058b 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -57,7 +57,7 @@ int nr_ioapic_registers[MAX_IO_APICS]; * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) /* @@ -85,6 +85,7 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; int pin; \ struct irq_pin_list *entry = irq_2_pin + irq; \ \ + BUG_ON(irq >= NR_IRQS); \ for (;;) { \ unsigned int reg; \ pin = entry->pin; \ @@ -127,6 +128,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) } #endif +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -137,6 +140,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; + BUG_ON(irq >= NR_IRQS); while (entry->next) entry = irq_2_pin + entry->next; @@ -144,7 +148,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry->next = first_free_entry; entry = irq_2_pin + entry->next; if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); + panic("io_apic.c: ran out of irq_2_pin entries!"); } entry->apic = apic; entry->pin = pin; @@ -420,6 +424,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } + BUG_ON(best_guess >= NR_IRQS); return best_guess; } @@ -610,6 +615,64 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +static int next_irq = 16; + +/* + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number + * from ACPI, which can reach 800 in large boxen. + * + * Compact the sparse GSI space into a sequential IRQ series and reuse + * vectors if possible. + */ +int gsi_irq_sharing(int gsi) +{ + int i, tries, vector; + + BUG_ON(gsi >= NR_IRQ_VECTORS); + + if (platform_legacy_irq(gsi)) + return gsi; + + if (gsi_2_irq[gsi] != 0xFF) + return (int)gsi_2_irq[gsi]; + + tries = NR_IRQS; + try_again: + vector = assign_irq_vector(gsi); + + /* + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous + * use of vector and if found, return that IRQ. However, we never want + * to share legacy IRQs, which usually have a different trigger mode + * than PCI. + */ + for (i = 0; i < NR_IRQS; i++) + if (IO_APIC_VECTOR(i) == vector) + break; + if (platform_legacy_irq(i)) { + if (--tries >= 0) { + IO_APIC_VECTOR(i) = 0; + goto try_again; + } + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); + } + if (i < NR_IRQS) { + gsi_2_irq[gsi] = i; + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; + } + + i = next_irq++; + BUG_ON(i >= NR_IRQS); + gsi_2_irq[gsi] = i; + IO_APIC_VECTOR(i) = vector; + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; +} + static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -639,6 +702,7 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; + irq = gsi_irq_sharing(irq); break; } default: @@ -648,6 +712,7 @@ static int pin_2_irq(int idx, int apic, int pin) break; } } + BUG_ON(irq >= NR_IRQS); /* * PCI IRQ command line redirection. Yes, limits are hardcoded. @@ -663,6 +728,7 @@ static int pin_2_irq(int idx, int apic, int pin) } } } + BUG_ON(irq >= NR_IRQS); return irq; } @@ -690,8 +756,8 @@ int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - BUG_ON(irq >= NR_IRQ_VECTORS); - if (IO_APIC_VECTOR(irq) > 0) + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); next: current_vector += 8; @@ -699,9 +765,8 @@ next: goto next; if (current_vector >= FIRST_SYSTEM_VECTOR) { - offset++; - if (!(offset%8)) - return -ENOSPC; + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; current_vector = FIRST_DEVICE_VECTOR + offset; } @@ -1917,6 +1982,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a entry.polarity = active_high_low; entry.mask = 1; /* Disabled (masked) */ + irq = gsi_irq_sharing(irq); /* * IRQs < 16 are already in the irq_2_pin[] map */ diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index df08c43276a..dddeb678b44 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -34,7 +34,6 @@ #include <linux/config.h> #include <linux/kprobes.h> #include <linux/ptrace.h> -#include <linux/spinlock.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/preempt.h> @@ -44,17 +43,10 @@ #include <asm/kdebug.h> static DECLARE_MUTEX(kprobe_mutex); - -static struct kprobe *current_kprobe; -static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; -static struct kprobe *kprobe_prev; -static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; -static struct pt_regs jprobe_saved_regs; -static long *jprobe_saved_rsp; void jprobe_return_end(void); -/* copy of the kernel stack at the probe fire time */ -static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); /* * returns non-zero if opcode modifies the interrupt flag. @@ -77,9 +69,9 @@ static inline int is_IF_modifier(kprobe_opcode_t *insn) int __kprobes arch_prepare_kprobe(struct kprobe *p) { /* insn: must be on special executable page on x86_64. */ - up(&kprobe_mutex); - p->ainsn.insn = get_insn_slot(); down(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + up(&kprobe_mutex); if (!p->ainsn.insn) { return -ENOMEM; } @@ -231,34 +223,35 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - up(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); down(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + up(&kprobe_mutex); } -static inline void save_previous_kprobe(void) +static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) { - kprobe_prev = current_kprobe; - kprobe_status_prev = kprobe_status; - kprobe_old_rflags_prev = kprobe_old_rflags; - kprobe_saved_rflags_prev = kprobe_saved_rflags; + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; + kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; } -static inline void restore_previous_kprobe(void) +static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) { - current_kprobe = kprobe_prev; - kprobe_status = kprobe_status_prev; - kprobe_old_rflags = kprobe_old_rflags_prev; - kprobe_saved_rflags = kprobe_saved_rflags_prev; + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; + kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; } -static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) { - current_kprobe = p; - kprobe_saved_rflags = kprobe_old_rflags + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags = (regs->eflags & (TF_MASK | IF_MASK)); if (is_IF_modifier(p->ainsn.insn)) - kprobe_saved_rflags &= ~IF_MASK; + kcb->kprobe_saved_rflags &= ~IF_MASK; } static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -272,6 +265,7 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } +/* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { @@ -292,32 +286,30 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, } } -/* - * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. - */ int __kprobes kprobe_handler(struct pt_regs *regs) { struct kprobe *p; int ret = 0; kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); + struct kprobe_ctlblk *kcb; - /* We're in an interrupt, but this is clear and BUG()-safe. */ + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ preempt_disable(); + kcb = get_kprobe_ctlblk(); /* Check we're not actually recursing */ if (kprobe_running()) { - /* We *are* holding lock here, so this is safe. - Disarm the probe we just hit, and ignore it. */ p = get_kprobe(addr); if (p) { - if (kprobe_status == KPROBE_HIT_SS && + if (kcb->kprobe_status == KPROBE_HIT_SS && *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { regs->eflags &= ~TF_MASK; - regs->eflags |= kprobe_saved_rflags; - unlock_kprobes(); + regs->eflags |= kcb->kprobe_saved_rflags; goto no_kprobe; - } else if (kprobe_status == KPROBE_HIT_SSDONE) { + } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* TODO: Provide re-entrancy from * post_kprobes_handler() and avoid exception * stack corruption while single-stepping on @@ -325,6 +317,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs) */ arch_disarm_kprobe(p); regs->rip = (unsigned long)p->addr; + reset_current_kprobe(); ret = 1; } else { /* We have reentered the kprobe_handler(), since @@ -334,27 +327,24 @@ int __kprobes kprobe_handler(struct pt_regs *regs) * of the new probe without calling any user * handlers. */ - save_previous_kprobe(); - set_current_kprobe(p, regs); + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); p->nmissed++; prepare_singlestep(p, regs); - kprobe_status = KPROBE_REENTER; + kcb->kprobe_status = KPROBE_REENTER; return 1; } } else { - p = current_kprobe; + p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { goto ss_probe; } } - /* If it's not ours, can't be delete race, (we hold lock). */ goto no_kprobe; } - lock_kprobes(); p = get_kprobe(addr); if (!p) { - unlock_kprobes(); if (*addr != BREAKPOINT_INSTRUCTION) { /* * The breakpoint instruction was removed right @@ -372,8 +362,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs) goto no_kprobe; } - kprobe_status = KPROBE_HIT_ACTIVE; - set_current_kprobe(p, regs); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (p->pre_handler && p->pre_handler(p, regs)) /* handler has already set things up, so skip ss setup */ @@ -381,7 +371,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs) ss_probe: prepare_singlestep(p, regs); - kprobe_status = KPROBE_HIT_SS; + kcb->kprobe_status = KPROBE_HIT_SS; return 1; no_kprobe: @@ -409,9 +399,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) struct kretprobe_instance *ri = NULL; struct hlist_head *head; struct hlist_node *node, *tmp; - unsigned long orig_ret_address = 0; + unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); /* @@ -450,13 +441,14 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); regs->rip = orig_ret_address; - unlock_kprobes(); + reset_current_kprobe(); + spin_unlock_irqrestore(&kretprobe_lock, flags); preempt_enable_no_resched(); /* * By returning a non-zero value, we are telling - * kprobe_handler() that we have handled unlocking - * and re-enabling preemption. + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) */ return 1; } @@ -483,7 +475,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * that is atop the stack is the address following the copied instruction. * We need to make it the address following the original instruction. */ -static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) +static void __kprobes resume_execution(struct kprobe *p, + struct pt_regs *regs, struct kprobe_ctlblk *kcb) { unsigned long *tos = (unsigned long *)regs->rsp; unsigned long next_rip = 0; @@ -498,7 +491,7 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) switch (*insn) { case 0x9c: /* pushfl */ *tos &= ~(TF_MASK | IF_MASK); - *tos |= kprobe_old_rflags; + *tos |= kcb->kprobe_old_rflags; break; case 0xc3: /* ret/lret */ case 0xcb: @@ -537,30 +530,28 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) } } -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. And we hold kprobe lock. - */ int __kprobes post_kprobe_handler(struct pt_regs *regs) { - if (!kprobe_running()) + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) return 0; - if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { - kprobe_status = KPROBE_HIT_SSDONE; - current_kprobe->post_handler(current_kprobe, regs, 0); + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); } - resume_execution(current_kprobe, regs); - regs->eflags |= kprobe_saved_rflags; + resume_execution(cur, regs, kcb); + regs->eflags |= kcb->kprobe_saved_rflags; /* Restore the original saved kprobes variables and continue. */ - if (kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(); + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); goto out; - } else { - unlock_kprobes(); } + reset_current_kprobe(); out: preempt_enable_no_resched(); @@ -575,18 +566,19 @@ out: return 1; } -/* Interrupts disabled, kprobe_lock held. */ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) { - if (current_kprobe->fault_handler - && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) return 1; - if (kprobe_status & KPROBE_HIT_SS) { - resume_execution(current_kprobe, regs); - regs->eflags |= kprobe_old_rflags; + if (kcb->kprobe_status & KPROBE_HIT_SS) { + resume_execution(cur, regs, kcb); + regs->eflags |= kcb->kprobe_old_rflags; - unlock_kprobes(); + reset_current_kprobe(); preempt_enable_no_resched(); } return 0; @@ -599,39 +591,41 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + switch (val) { case DIE_INT3: if (kprobe_handler(args->regs)) - return NOTIFY_STOP; + ret = NOTIFY_STOP; break; case DIE_DEBUG: if (post_kprobe_handler(args->regs)) - return NOTIFY_STOP; + ret = NOTIFY_STOP; break; case DIE_GPF: - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - return NOTIFY_STOP; - break; case DIE_PAGE_FAULT: + /* kprobe_running() needs smp_processor_id() */ + preempt_disable(); if (kprobe_running() && kprobe_fault_handler(args->regs, args->trapnr)) - return NOTIFY_STOP; + ret = NOTIFY_STOP; + preempt_enable(); break; default: break; } - return NOTIFY_DONE; + return ret; } int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct jprobe *jp = container_of(p, struct jprobe, kp); unsigned long addr; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - jprobe_saved_regs = *regs; - jprobe_saved_rsp = (long *) regs->rsp; - addr = (unsigned long)jprobe_saved_rsp; + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_rsp = (long *) regs->rsp; + addr = (unsigned long)(kcb->jprobe_saved_rsp); /* * As Linus pointed out, gcc assumes that the callee * owns the argument space and could overwrite it, e.g. @@ -639,7 +633,8 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) * we also save and restore enough stack bytes to cover * the argument area. */ - memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); + memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, + MIN_STACK_SIZE(addr)); regs->eflags &= ~IF_MASK; regs->rip = (unsigned long)(jp->entry); return 1; @@ -647,36 +642,40 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) void __kprobes jprobe_return(void) { - preempt_enable_no_resched(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + asm volatile (" xchg %%rbx,%%rsp \n" " int3 \n" " .globl jprobe_return_end \n" " jprobe_return_end: \n" " nop \n"::"b" - (jprobe_saved_rsp):"memory"); + (kcb->jprobe_saved_rsp):"memory"); } int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); u8 *addr = (u8 *) (regs->rip - 1); - unsigned long stack_addr = (unsigned long)jprobe_saved_rsp; + unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); struct jprobe *jp = container_of(p, struct jprobe, kp); if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((long *)regs->rsp != jprobe_saved_rsp) { + if ((long *)regs->rsp != kcb->jprobe_saved_rsp) { struct pt_regs *saved_regs = - container_of(jprobe_saved_rsp, struct pt_regs, rsp); + container_of(kcb->jprobe_saved_rsp, + struct pt_regs, rsp); printk("current rsp %p does not match saved rsp %p\n", - (long *)regs->rsp, jprobe_saved_rsp); + (long *)regs->rsp, kcb->jprobe_saved_rsp); printk("Saved registers for jprobe %p\n", jp); show_registers(saved_regs); printk("Current registers\n"); show_registers(regs); BUG(); } - *regs = jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, + *regs = kcb->jprobe_saved_regs; + memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); + preempt_enable_no_resched(); return 1; } return 0; diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 08203b07f4b..183dc610542 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -37,7 +37,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; static unsigned long console_logged; static int notify_user; static int rip_msr; -static int mce_bootlog; +static int mce_bootlog = 1; /* * Lockless MCE logging infrastructure. @@ -54,9 +54,12 @@ void mce_log(struct mce *mce) { unsigned next, entry; mce->finished = 0; - smp_wmb(); + wmb(); for (;;) { entry = rcu_dereference(mcelog.next); + /* The rmb forces the compiler to reload next in each + iteration */ + rmb(); for (;;) { /* When the buffer fills up discard new entries. Assume that the earlier errors are the more interesting. */ @@ -69,6 +72,7 @@ void mce_log(struct mce *mce) entry++; continue; } + break; } smp_rmb(); next = entry + 1; @@ -76,9 +80,9 @@ void mce_log(struct mce *mce) break; } memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - smp_wmb(); + wmb(); mcelog.entry[entry].finished = 1; - smp_wmb(); + wmb(); if (!test_and_set_bit(0, &console_logged)) notify_user = 1; @@ -343,7 +347,11 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ clear_bit(10, &bank[4]); + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ + mce_bootlog = 0; } + } static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) @@ -352,6 +360,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: mce_intel_feature_init(c); break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; default: break; } @@ -491,16 +502,16 @@ static int __init mcheck_disable(char *str) /* mce=off disables machine check. Note you can reenable it later using sysfs. mce=TOLERANCELEVEL (number, see above) - mce=bootlog Log MCEs from before booting. Disabled by default to work - around buggy BIOS that leave bogus MCEs. */ + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ static int __init mcheck_enable(char *str) { if (*str == '=') str++; if (!strcmp(str, "off")) mce_dont_init = 1; - else if (!strcmp(str, "bootlog")) - mce_bootlog = 1; + else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) + mce_bootlog = str[0] == 'b'; else if (isdigit(str[0])) get_option(&str, &tolerant); else diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c new file mode 100644 index 00000000000..1f76175ace0 --- /dev/null +++ b/arch/x86_64/kernel/mce_amd.c @@ -0,0 +1,538 @@ +/* + * (c) 2005 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Written by Jacob Shin - AMD, Inc. + * + * Support : jacob.shin@amd.com + * + * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F. + * MC4_MISC0 exists per physical processor. + * + */ + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kobject.h> +#include <linux/notifier.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/sysdev.h> +#include <linux/sysfs.h> +#include <asm/apic.h> +#include <asm/mce.h> +#include <asm/msr.h> +#include <asm/percpu.h> + +#define PFX "mce_threshold: " +#define VERSION "version 1.00.9" +#define NR_BANKS 5 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_OVERFLOW 0x0001000000000000L + +struct threshold_bank { + unsigned int cpu; + u8 bank; + u8 interrupt_enable; + u16 threshold_limit; + struct kobject kobj; +}; + +static struct threshold_bank threshold_defaults = { + .interrupt_enable = 0, + .threshold_limit = THRESHOLD_MAX, +}; + +#ifdef CONFIG_SMP +static unsigned char shared_bank[NR_BANKS] = { + 0, 0, 0, 0, 1 +}; +#endif + +static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + +/* + * CPU Initialization + */ + +/* must be called with correct cpu affinity */ +static void threshold_restart_bank(struct threshold_bank *b, + int reset, u16 old_limit) +{ + u32 mci_misc_hi, mci_misc_lo; + + rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + + if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + reset = 1; /* limit cannot be lower than err count */ + + if (reset) { /* reset err count and overflow bit */ + mci_misc_hi = + (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - b->threshold_limit); + } else if (old_limit) { /* change limit w/o reset */ + int new_count = (mci_misc_hi & THRESHOLD_MAX) + + (old_limit - b->threshold_limit); + mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + b->interrupt_enable ? + (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (mci_misc_hi &= ~MASK_INT_TYPE_HI); + + mci_misc_hi |= MASK_COUNT_EN_HI; + wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); +} + +void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + int bank; + u32 mci_misc_lo, mci_misc_hi; + unsigned int cpu = smp_processor_id(); + + for (bank = 0; bank < NR_BANKS; ++bank) { + rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi); + + /* !valid, !counter present, bios locked */ + if (!(mci_misc_hi & MASK_VALID_HI) || + !(mci_misc_hi & MASK_VALID_HI >> 1) || + (mci_misc_hi & MASK_VALID_HI >> 2)) + continue; + + per_cpu(bank_map, cpu) |= (1 << bank); + +#ifdef CONFIG_SMP + if (shared_bank[bank] && cpu_core_id[cpu]) + continue; +#endif + + setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20); + threshold_defaults.cpu = cpu; + threshold_defaults.bank = bank; + threshold_restart_bank(&threshold_defaults, 0, 0); + } +} + +/* + * APIC Interrupt Handler + */ + +/* + * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. + * the interrupt goes off when error_count reaches threshold_limit. + * the handler will simply log mcelog w/ software defined bank number. + */ +asmlinkage void mce_threshold_interrupt(void) +{ + int bank; + struct mce m; + + ack_APIC_irq(); + irq_enter(); + + memset(&m, 0, sizeof(m)); + rdtscll(m.tsc); + m.cpu = smp_processor_id(); + + /* assume first bank caused it */ + for (bank = 0; bank < NR_BANKS; ++bank) { + m.bank = MCE_THRESHOLD_BASE + bank; + rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc); + + if (m.misc & MASK_OVERFLOW) { + mce_log(&m); + goto out; + } + } + out: + irq_exit(); +} + +/* + * Sysfs Interface + */ + +static struct sysdev_class threshold_sysclass = { + set_kset_name("threshold"), +}; + +static DEFINE_PER_CPU(struct sys_device, device_threshold); + +struct threshold_attr { + struct attribute attr; + ssize_t(*show) (struct threshold_bank *, char *); + ssize_t(*store) (struct threshold_bank *, const char *, size_t count); +}; + +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + +static cpumask_t affinity_set(unsigned int cpu) +{ + cpumask_t oldmask = current->cpus_allowed; + cpumask_t newmask = CPU_MASK_NONE; + cpu_set(cpu, newmask); + set_cpus_allowed(current, newmask); + return oldmask; +} + +static void affinity_restore(cpumask_t oldmask) +{ + set_cpus_allowed(current, oldmask); +} + +#define SHOW_FIELDS(name) \ + static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \ + { \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ + } +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t store_interrupt_enable(struct threshold_bank *b, + const char *buf, size_t count) +{ + char *end; + cpumask_t oldmask; + unsigned long new = simple_strtoul(buf, &end, 0); + if (end == buf) + return -EINVAL; + b->interrupt_enable = !!new; + + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 0, 0); + affinity_restore(oldmask); + + return end - buf; +} + +static ssize_t store_threshold_limit(struct threshold_bank *b, + const char *buf, size_t count) +{ + char *end; + cpumask_t oldmask; + u16 old; + unsigned long new = simple_strtoul(buf, &end, 0); + if (end == buf) + return -EINVAL; + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + old = b->threshold_limit; + b->threshold_limit = new; + + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 0, old); + affinity_restore(oldmask); + + return end - buf; +} + +static ssize_t show_error_count(struct threshold_bank *b, char *buf) +{ + u32 high, low; + cpumask_t oldmask; + oldmask = affinity_set(b->cpu); + rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */ + affinity_restore(oldmask); + return sprintf(buf, "%x\n", + (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); +} + +static ssize_t store_error_count(struct threshold_bank *b, + const char *buf, size_t count) +{ + cpumask_t oldmask; + oldmask = affinity_set(b->cpu); + threshold_restart_bank(b, 1, 0); + affinity_restore(oldmask); + return 1; +} + +#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; + +#define ATTR_FIELDS(name) \ + static struct threshold_attr name = \ + THRESHOLD_ATTR(name, 0644, show_## name, store_## name) + +ATTR_FIELDS(interrupt_enable); +ATTR_FIELDS(threshold_limit); +ATTR_FIELDS(error_count); + +static struct attribute *default_attrs[] = { + &interrupt_enable.attr, + &threshold_limit.attr, + &error_count.attr, + NULL +}; + +#define to_bank(k) container_of(k,struct threshold_bank,kobj) +#define to_attr(a) container_of(a,struct threshold_attr,attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_bank *b = to_bank(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + ret = a->show ? a->show(b, buf) : -EIO; + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_bank *b = to_bank(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + ret = a->store ? a->store(b, buf, count) : -EIO; + return ret; +} + +static struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, +}; + +/* symlinks sibling shared banks to first core. first core owns dir/files. */ +static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) +{ + int err = 0; + struct threshold_bank *b = 0; + +#ifdef CONFIG_SMP + if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */ + char name[16]; + unsigned lcpu = first_cpu(cpu_core_map[cpu]); + if (cpu_core_id[lcpu]) + goto out; /* first core not up yet */ + + b = per_cpu(threshold_banks, lcpu)[bank]; + if (!b) + goto out; + sprintf(name, "bank%i", bank); + err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj, + &b->kobj, name); + if (err) + goto out; + per_cpu(threshold_banks, cpu)[bank] = b; + goto out; + } +#endif + + b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + memset(b, 0, sizeof(struct threshold_bank)); + + b->cpu = cpu; + b->bank = bank; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + kobject_set_name(&b->kobj, "bank%i", bank); + b->kobj.parent = &per_cpu(device_threshold, cpu).kobj; + b->kobj.ktype = &threshold_ktype; + + err = kobject_register(&b->kobj); + if (err) { + kfree(b); + goto out; + } + per_cpu(threshold_banks, cpu)[bank] = b; + out: + return err; +} + +/* create dir/files for all valid threshold banks */ +static __cpuinit int threshold_create_device(unsigned int cpu) +{ + int bank; + int err = 0; + + per_cpu(device_threshold, cpu).id = cpu; + per_cpu(device_threshold, cpu).cls = &threshold_sysclass; + err = sysdev_register(&per_cpu(device_threshold, cpu)); + if (err) + goto out; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto out; + } + out: + return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * let's be hotplug friendly. + * in case of multiple core processors, the first core always takes ownership + * of shared sysfs dir/files, and rest of the cores will be symlinked to it. + */ + +/* cpu hotplug call removes all symlinks before first core dies */ +static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) +{ + struct threshold_bank *b; + char name[16]; + + b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) + return; + if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) { + sprintf(name, "bank%i", bank); + sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name); + per_cpu(threshold_banks, cpu)[bank] = 0; + } else { + kobject_unregister(&b->kobj); + kfree(per_cpu(threshold_banks, cpu)[bank]); + } +} + +static __cpuinit void threshold_remove_device(unsigned int cpu) +{ + int bank; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + threshold_remove_bank(cpu, bank); + } + sysdev_unregister(&per_cpu(device_threshold, cpu)); +} + +/* link all existing siblings when first core comes up */ +static __cpuinit int threshold_create_symlinks(unsigned int cpu) +{ + int bank, err = 0; + unsigned int lcpu = 0; + + if (cpu_core_id[cpu]) + return 0; + for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { + if (lcpu == cpu) + continue; + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + if (!shared_bank[bank]) + continue; + err = threshold_create_bank(lcpu, bank); + } + } + return err; +} + +/* remove all symlinks before first core dies. */ +static __cpuinit void threshold_remove_symlinks(unsigned int cpu) +{ + int bank; + unsigned int lcpu = 0; + if (cpu_core_id[cpu]) + return; + for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { + if (lcpu == cpu) + continue; + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) + continue; + if (!shared_bank[bank]) + continue; + threshold_remove_bank(lcpu, bank); + } + } +} +#else /* !CONFIG_HOTPLUG_CPU */ +static __cpuinit void threshold_create_symlinks(unsigned int cpu) +{ +} +static __cpuinit void threshold_remove_symlinks(unsigned int cpu) +{ +} +static void threshold_remove_device(unsigned int cpu) +{ +} +#endif + +/* get notified when a cpu comes on/off */ +static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + /* cpu was unsigned int to begin with */ + unsigned int cpu = (unsigned long)hcpu; + + if (cpu >= NR_CPUS) + goto out; + + switch (action) { + case CPU_ONLINE: + threshold_create_device(cpu); + threshold_create_symlinks(cpu); + break; + case CPU_DOWN_PREPARE: + threshold_remove_symlinks(cpu); + break; + case CPU_DOWN_FAILED: + threshold_create_symlinks(cpu); + break; + case CPU_DEAD: + threshold_remove_device(cpu); + break; + default: + break; + } + out: + return NOTIFY_OK; +} + +static struct notifier_block threshold_cpu_notifier = { + .notifier_call = threshold_cpu_callback, +}; + +static __init int threshold_init_device(void) +{ + int err; + int lcpu = 0; + + err = sysdev_class_register(&threshold_sysclass); + if (err) + goto out; + + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + err = threshold_create_device(lcpu); + if (err) + goto out; + } + register_cpu_notifier(&threshold_cpu_notifier); + + out: + return err; +} + +device_initcall(threshold_init_device); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index f16d38d09da..1105250bf02 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -42,7 +42,7 @@ int acpi_found_madt; * Various Linux-internal data structures created from the * MP-table. */ -int apic_version [MAX_APICS]; +unsigned char apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; @@ -65,7 +65,9 @@ unsigned long mp_lapic_addr = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_id = -1U; /* Internal processor count */ -static unsigned int num_processors = 0; +unsigned int num_processors __initdata = 0; + +unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; @@ -106,11 +108,14 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver, cpu; + int cpu; + unsigned char ver; static int found_bsp=0; - if (!(m->mpc_cpuflag & CPU_ENABLED)) + if (!(m->mpc_cpuflag & CPU_ENABLED)) { + disabled_cpus++; return; + } printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", m->mpc_apicid, @@ -129,12 +134,14 @@ static void __init MP_processor_info (struct mpc_config_processor *m) } cpu = num_processors++; - - if (m->mpc_apicid > MAX_APICS) { + +#if MAX_APICS < 255 + if ((int)m->mpc_apicid > MAX_APICS) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", m->mpc_apicid, MAX_APICS); return; } +#endif ver = m->mpc_apicver; physid_set(m->mpc_apicid, phys_cpu_present_map); @@ -218,7 +225,7 @@ static void __init MP_intsrc_info (struct mpc_config_intsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries == MAX_IRQ_SOURCES) + if (++mp_irq_entries >= MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } @@ -549,7 +556,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { + if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index cf0a0315d58..2e28e855ec3 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -187,7 +187,7 @@ static void flush_gart(struct device *dev) /* Allocate DMA memory on node near device */ noinline -static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) +static void *dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { struct page *page; int node; @@ -204,7 +204,7 @@ static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) */ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned gfp) + gfp_t gfp) { void *memory; unsigned long dma_mask = 0; @@ -220,6 +220,12 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, uses the normal dma_mask for alloc_coherent. */ dma_mask &= *dev->dma_mask; + /* Why <=? Even when the mask is smaller than 4GB it is often larger + than 16MB and in this case we have a chance of finding fitting memory + in the next higher zone first. If not retry with true GFP_DMA. -AK */ + if (dma_mask <= 0xffffffff) + gfp |= GFP_DMA32; + again: memory = dma_alloc_pages(dev, gfp, get_order(size)); if (memory == NULL) @@ -245,7 +251,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } if (!(gfp & GFP_DMA)) { - gfp |= GFP_DMA; + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } return NULL; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 67d90b89af0..5a981dca87f 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -24,7 +24,7 @@ EXPORT_SYMBOL(iommu_sac_force); */ void *dma_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, unsigned gfp) + dma_addr_t *dma_handle, gfp_t gfp) { void *ret; u64 mask; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index b5a89c0bdf5..7519fc520eb 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -86,12 +86,22 @@ EXPORT_SYMBOL(enable_hlt); */ void default_idle(void) { + local_irq_enable(); + if (!atomic_read(&hlt_counter)) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); + while (!need_resched()) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + else + local_irq_enable(); + } + set_thread_flag(TIF_POLLING_NRFLAG); + } else { + while (!need_resched()) + cpu_relax(); } } @@ -102,30 +112,16 @@ void default_idle(void) */ static void poll_idle (void) { - int oldval; - local_irq_enable(); - /* - * Deal with another CPU just having chosen a thread to - * run here: - */ - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - asm volatile( - "2:" - "testl %0,%1;" - "rep; nop;" - "je 2b;" - : : - "i" (_TIF_NEED_RESCHED), - "m" (current_thread_info()->flags)); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); - } + asm volatile( + "2:" + "testl %0,%1;" + "rep; nop;" + "je 2b;" + : : + "i" (_TIF_NEED_RESCHED), + "m" (current_thread_info()->flags)); } void cpu_idle_wait(void) @@ -148,7 +144,8 @@ void cpu_idle_wait(void) do { ssleep(1); for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + if (cpu_isset(cpu, map) && + !per_cpu(cpu_idle_state, cpu)) cpu_clear(cpu, map); } cpus_and(map, map, cpu_online_map); @@ -187,6 +184,8 @@ static inline void play_dead(void) */ void cpu_idle (void) { + set_thread_flag(TIF_POLLING_NRFLAG); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -204,7 +203,9 @@ void cpu_idle (void) idle(); } + preempt_enable_no_resched(); schedule(); + preempt_disable(); } } @@ -219,15 +220,12 @@ static void mwait_idle(void) { local_irq_enable(); - if (!need_resched()) { - set_thread_flag(TIF_POLLING_NRFLAG); - do { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - if (need_resched()) - break; - __mwait(0, 0); - } while (!need_resched()); - clear_thread_flag(TIF_POLLING_NRFLAG); + while (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (need_resched()) + break; + __mwait(0, 0); } } @@ -278,7 +276,8 @@ void __show_regs(struct pt_regs * regs) system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); + printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", @@ -352,13 +351,6 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. - */ - kprobe_flush_task(tsk); - if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); @@ -430,15 +422,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, struct pt_regs * childregs; struct task_struct *me = current; - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; - + childregs = ((struct pt_regs *) + (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; *childregs = *regs; childregs->rax = 0; childregs->rsp = rsp; - if (rsp == ~0UL) { + if (rsp == ~0UL) childregs->rsp = (unsigned long)childregs; - } p->thread.rsp = (unsigned long) childregs; p->thread.rsp0 = (unsigned long) (childregs+1); @@ -460,7 +451,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); } /* @@ -497,7 +489,8 @@ out: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced */ -struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -568,7 +561,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct * prev->userrsp = read_pda(oldrsp); write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); - write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); + write_pda(kernelstack, + (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); /* * Now maybe reload the debug registers @@ -649,7 +643,9 @@ asmlinkage long sys_fork(struct pt_regs *regs) return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); } -asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +asmlinkage long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) newsp = regs->rsp; @@ -685,7 +681,8 @@ unsigned long get_wchan(struct task_struct *p) return 0; fp = *(u64 *)(p->thread.rsp); do { - if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) + if (fp < (unsigned long)stack || + fp > (unsigned long)stack+THREAD_SIZE) return 0; rip = *(u64 *)(fp+8); if (!in_sched_functions(rip)) @@ -720,8 +717,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) task->thread.gsindex = 0; task->thread.gs = addr; if (doit) { - load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + load_gs_index(0); + ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); } } put_cpu(); @@ -738,7 +735,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) set_32bit_tls(task, FS_TLS, addr); if (doit) { load_TLS(&task->thread, cpu); - asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; @@ -748,8 +745,8 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (doit) { /* set the selector to 0 to not confuse __switch_to */ - asm volatile("movl %0,%%fs" :: "r" (0)); - ret = checking_wrmsrl(MSR_FS_BASE, addr); + asm volatile("movl %0,%%fs" :: "r" (0)); + ret = checking_wrmsrl(MSR_FS_BASE, addr); } } put_cpu(); @@ -758,9 +755,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); - else if (doit) { + else if (doit) rdmsrl(MSR_FS_BASE, base); - } else + else base = task->thread.fs; ret = put_user(base, (unsigned long __user *)addr); break; @@ -769,9 +766,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) unsigned long base; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); - else if (doit) { + else if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); - } else + else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index bbf64b59a21..a87b6cebe80 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -313,48 +313,11 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno) } -asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data) +long arch_ptrace(struct task_struct *child, long request, long addr, long data) { - struct task_struct *child; long i, ret; unsigned ui; - /* This lock_kernel fixes a subtle race with suid exec */ - lock_kernel(); - ret = -EPERM; - if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; - ret = 0; - goto out; - } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out_tsk; - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - goto out_tsk; - } - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out_tsk; - switch (request) { /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ @@ -608,10 +571,6 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data ret = ptrace_request(child, request, addr, data); break; } -out_tsk: - put_task_struct(child); -out: - unlock_kernel(); return ret; } diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 47f95687905..75235ed2b31 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -77,6 +77,7 @@ static inline void kb_wait(void) void machine_shutdown(void) { + unsigned long flags; /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; @@ -98,7 +99,7 @@ void machine_shutdown(void) smp_send_stop(); #endif - local_irq_disable(); + local_irq_save(flags); #ifndef CONFIG_SMP disable_local_APIC(); @@ -106,7 +107,7 @@ void machine_shutdown(void) disable_IO_APIC(); - local_irq_enable(); + local_irq_restore(flags); } void machine_emergency_restart(void) @@ -120,7 +121,7 @@ void machine_emergency_restart(void) /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { case BOOT_KBD: - for (i=0; i<100; i++) { + for (i=0; i<10; i++) { kb_wait(); udelay(50); outb(0xfe,0x64); /* pulse reset low */ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 238f73e1a83..750e01dcbdf 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -60,6 +60,7 @@ #include <asm/setup.h> #include <asm/mach_apic.h> #include <asm/numa.h> +#include <asm/sections.h> /* * Machine setup.. @@ -103,7 +104,6 @@ struct edid_info edid_info; struct e820map e820; extern int root_mountflags; -extern char _text, _etext, _edata, _end; char command_line[COMMAND_LINE_SIZE]; @@ -412,7 +412,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long bootmap_size, bootmap; - memory_present(0, start_pfn, end_pfn); bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); if (bootmap == -1L) @@ -571,6 +570,8 @@ void __init setup_arch(char **cmdline_p) init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + zap_low_mappings(0); + #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). @@ -657,8 +658,6 @@ void __init setup_arch(char **cmdline_p) } #endif - sparse_init(); - paging_init(); check_ioapic(); @@ -793,7 +792,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) #endif bits = 0; - while ((1 << bits) < c->x86_num_cores) + while ((1 << bits) < c->x86_max_cores) bits++; /* Low order bits define the core id (index of core in socket) */ @@ -823,16 +822,14 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) if (!node_online(node)) node = nearby_node(apicid); } - cpu_to_node[cpu] = node; + numa_set_node(cpu, node); printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", - cpu, c->x86_num_cores, node, cpu_core_id[cpu]); + cpu, c->x86_max_cores, node, cpu_core_id[cpu]); #endif #endif } -#define HWCR 0xc0010015 - static int __init init_amd(struct cpuinfo_x86 *c) { int r; @@ -841,14 +838,18 @@ static int __init init_amd(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP unsigned long value; - // Disable TLB flush filter by setting HWCR.FFDIS: - // bit 6 of msr C001_0015 - // - // Errata 63 for SH-B3 steppings - // Errata 122 for all(?) steppings - rdmsrl(HWCR, value); - value |= 1 << 6; - wrmsrl(HWCR, value); + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + if (c->x86 == 15) { + rdmsrl(MSR_K8_HWCR, value); + value |= 1 << 6; + wrmsrl(MSR_K8_HWCR, value); + } #endif /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; @@ -873,9 +874,9 @@ static int __init init_amd(struct cpuinfo_x86 *c) display_cacheinfo(c); if (c->extended_cpuid_level >= 0x80000008) { - c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - if (c->x86_num_cores & (c->x86_num_cores - 1)) - c->x86_num_cores = 1; + c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + if (c->x86_max_cores & (c->x86_max_cores - 1)) + c->x86_max_cores = 1; amd_detect_cmp(c); } @@ -887,54 +888,44 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; - int index_msb, tmp; + int index_msb, core_bits; int cpu = smp_processor_id(); - + + cpuid(1, &eax, &ebx, &ecx, &edx); + + c->apicid = phys_pkg_id(0); + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; - cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; - + if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - index_msb = 31; - /* - * At this point we only support two siblings per - * processor package. - */ + } else if (smp_num_siblings > 1 ) { + if (smp_num_siblings > NR_CPUS) { printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } - tmp = smp_num_siblings; - while ((tmp & 0x80000000 ) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; + + index_msb = get_count_order(smp_num_siblings); phys_proc_id[cpu] = phys_pkg_id(index_msb); - + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", phys_proc_id[cpu]); - smp_num_siblings = smp_num_siblings / c->x86_num_cores; + smp_num_siblings = smp_num_siblings / c->x86_max_cores; - tmp = smp_num_siblings; - index_msb = 31; - while ((tmp & 0x80000000) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; + index_msb = get_count_order(smp_num_siblings) ; + + core_bits = get_count_order(c->x86_max_cores); - cpu_core_id[cpu] = phys_pkg_id(index_msb); + cpu_core_id[cpu] = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); - if (c->x86_num_cores > 1) + if (c->x86_max_cores > 1) printk(KERN_INFO "CPU: Processor Core ID: %d\n", cpu_core_id[cpu]); } @@ -965,16 +956,15 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) static void srat_detect_node(void) { #ifdef CONFIG_NUMA - unsigned apicid, node; + unsigned node; int cpu = smp_processor_id(); /* Don't do the funky fallback heuristics the AMD version employs for now. */ - apicid = phys_proc_id[cpu]; - node = apicid_to_node[apicid]; + node = apicid_to_node[hard_smp_processor_id()]; if (node == NUMA_NO_NODE) node = 0; - cpu_to_node[cpu] = node; + numa_set_node(cpu, node); if (acpi_numa > 0) printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); @@ -992,13 +982,18 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + /* CPUID workaround for Intel 0F34 CPU */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 0xF && c->x86_model == 0x3 && + c->x86_mask == 0x4) + c->x86_phys_bits = 36; } if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 >= 15) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - c->x86_num_cores = intel_num_cpu_cores(c); + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } @@ -1036,7 +1031,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_model_id[0] = '\0'; /* Unset */ c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; - c->x86_num_cores = 1; + c->x86_max_cores = 1; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -1059,10 +1054,10 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) { + if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - } if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { @@ -1212,7 +1207,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est", "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1270,13 +1265,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); #ifdef CONFIG_SMP - if (smp_num_siblings * c->x86_num_cores > 1) { + if (smp_num_siblings * c->x86_max_cores > 1) { int cpu = c - cpu_data; seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); - seq_printf(m, "siblings\t: %d\n", - c->x86_num_cores * smp_num_siblings); + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); - seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index bd33be24a38..06dc354375c 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -87,6 +87,10 @@ void __init setup_per_cpu_areas(void) int i; unsigned long size; +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif + /* Copy section for each CPU (we discard the original) */ size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); #ifdef CONFIG_MODULES @@ -137,7 +141,6 @@ void pda_init(int cpu) panic("cannot allocate irqstack for cpu %d", cpu); } - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); pda->irqstackptr += IRQSTACKSIZE-64; } @@ -193,6 +196,7 @@ void __cpuinit cpu_init (void) /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); + zap_low_mappings(cpu); } else estacks = boot_exception_stacks; diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index d642fbf3da2..5876df116c9 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -110,6 +110,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned COPY(r14); COPY(r15); + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. */ + { + unsigned cs; + err |= __get_user(cs, &sc->cs); + regs->cs = cs | 3; /* Force into user mode */ + } + { unsigned int tmpflags; err |= __get_user(tmpflags, &sc->eflags); @@ -187,6 +196,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo { int err = 0; + err |= __put_user(regs->cs, &sc->cs); err |= __put_user(0, &sc->gs); err |= __put_user(0, &sc->fs); @@ -318,7 +328,14 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->rsp = (unsigned long)frame; + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. */ + regs->cs = __USER_CS; + + /* This, by contrast, has nothing to do with segment registers - + see include/asm-x86_64/uaccess.h for details. */ set_fs(USER_DS); + regs->eflags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 9db9dda161b..cfc3d9dccbd 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -28,8 +28,6 @@ #include <asm/proto.h> #include <asm/apicdef.h> -#define __cpuinit __init - /* * Smarter SMP flushing macros. * c/o Linus Torvalds. @@ -452,13 +450,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, void smp_stop_cpu(void) { + unsigned long flags; /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } static void smp_really_stop_cpu(void *dummy) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index e12d7baeb33..683c33f7b96 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -64,9 +64,8 @@ int smp_num_siblings = 1; /* Package ID of each logical CPU */ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +/* core ID of each logical CPU */ u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(phys_proc_id); -EXPORT_SYMBOL(cpu_core_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -89,7 +88,10 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; /* Set when the idlers are all forked */ int smp_threads_ready; +/* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; + +/* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); @@ -436,30 +438,59 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* representing cpus for which sibling maps can be computed */ +static cpumask_t cpu_sibling_setup_map; + static inline void set_cpu_sibling_map(int cpu) { int i; + struct cpuinfo_x86 *c = cpu_data; + + cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu(i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (phys_proc_id[cpu] == phys_proc_id[i] && + cpu_core_id[cpu] == cpu_core_id[i]) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } - if (current_cpu_data.x86_num_cores > 1) { - for_each_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - } - } - } else { + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; + c[cpu].booted_cores = 1; + return; + } + + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + /* + * Does this new cpu bringup a new core? + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (first_cpu(cpu_sibling_map[i]) == i) + c[cpu].booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + c[i].booted_cores++; + } else if (i != cpu && !c[cpu].booted_cores) + c[cpu].booted_cores = c[i].booted_cores; + } } } @@ -474,6 +505,7 @@ void __cpuinit start_secondary(void) * things done here to the most necessary things. */ cpu_init(); + preempt_disable(); smp_callin(); /* otherwise gcc will move up the smp_processor_id before the cpu_init */ @@ -880,6 +912,9 @@ static __init void disable_smp(void) } #ifdef CONFIG_HOTPLUG_CPU + +int additional_cpus __initdata = -1; + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -888,14 +923,38 @@ static __init void disable_smp(void) * cpu_present_map on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current * behaviour, which is cpu_possible == cpu_present. - * If cpu-hotplug is supported, then we need to preallocate for all - * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - otherwise use half of the available CPUs or 2, whatever is more. + * - The user can overwrite it with additional_cpus=NUM + * We do this because additional CPUs waste a lot of memory. + * -AK */ -static void prefill_possible_map(void) +__init void prefill_possible_map(void) { int i; - for (i = 0; i < NR_CPUS; i++) + int possible; + + if (additional_cpus == -1) { + if (disabled_cpus > 0) { + additional_cpus = disabled_cpus; + } else { + additional_cpus = num_processors / 2; + if (additional_cpus == 0) + additional_cpus = 2; + } + } + possible = num_processors + additional_cpus; + if (possible > NR_CPUS) + possible = NR_CPUS; + + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + possible, + max_t(int, possible - num_processors, 0)); + + for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); } #endif @@ -966,10 +1025,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#endif + set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1013,8 +1069,6 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); per_cpu(cpu_state, me) = CPU_ONLINE; } @@ -1067,9 +1121,6 @@ int __cpuinit __cpu_up(unsigned int cpu) */ void __init smp_cpus_done(unsigned int max_cpus) { -#ifndef CONFIG_HOTPLUG_CPU - zap_low_mappings(); -#endif smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC @@ -1086,15 +1137,24 @@ void __init smp_cpus_done(unsigned int max_cpus) static void remove_siblinginfo(int cpu) { int sibling; + struct cpuinfo_x86 *c = cpu_data; + for_each_cpu_mask(sibling, cpu_core_map[cpu]) { + cpu_clear(cpu, cpu_core_map[sibling]); + /* + * last thread sibling in this cpu core going down + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) + c[sibling].booted_cores--; + } + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) cpu_clear(cpu, cpu_sibling_map[sibling]); - for_each_cpu_mask(sibling, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); phys_proc_id[cpu] = BAD_APICID; cpu_core_id[cpu] = BAD_APICID; + cpu_clear(cpu, cpu_sibling_setup_map); } void remove_cpu_from_maps(void) @@ -1158,6 +1218,12 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } +static __init int setup_additional_cpus(char *s) +{ + return get_option(&s, &additional_cpus); +} +__setup("additional_cpus=", setup_additional_cpus); + #else /* ... !CONFIG_HOTPLUG_CPU */ int __cpu_disable(void) diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ebb9abf3ce6..fd2bef78088 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -11,6 +11,8 @@ #include <linux/smp.h> #include <linux/suspend.h> #include <asm/proto.h> +#include <asm/page.h> +#include <asm/pgtable.h> struct saved_context saved_context; @@ -61,13 +63,12 @@ void save_processor_state(void) __save_processor_state(&saved_context); } -static void -do_fpu_end(void) +static void do_fpu_end(void) { - /* restore FPU regs if necessary */ - /* Do it out of line so that gcc does not move cr0 load to some stupid place */ - kernel_fpu_end(); - mxcsr_feature_mask_init(); + /* + * Restore FPU regs if necessary + */ + kernel_fpu_end(); } void __restore_processor_state(struct saved_context *ctxt) @@ -140,4 +141,83 @@ void fix_processor_context(void) } +#ifdef CONFIG_SOFTWARE_SUSPEND +/* Defined in arch/x86_64/kernel/suspend_asm.S */ +extern int restore_image(void); + +pgd_t *temp_level4_pgt; + +static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) + break; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) + break; + pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + } + return 0; +} + +static int set_up_temporary_mappings(void) +{ + unsigned long start, end, next; + int error; + + temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!temp_level4_pgt) + return -ENOMEM; + + /* It is safe to reuse the original kernel mapping */ + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + init_level4_pgt[pgd_index(__START_KERNEL_map)]); + + /* Set up the direct mapping from scratch */ + start = (unsigned long)pfn_to_kaddr(0); + end = (unsigned long)pfn_to_kaddr(end_pfn); + + for (; start < end; start = next) { + pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + next = start + PGDIR_SIZE; + if (next > end) + next = end; + if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) + return error; + set_pgd(temp_level4_pgt + pgd_index(start), + mk_kernel_pgd(__pa(pud))); + } + return 0; +} + +int swsusp_arch_resume(void) +{ + int error; + /* We have got enough memory and from now on we cannot recover */ + if ((error = set_up_temporary_mappings())) + return error; + restore_image(); + return 0; +} +#endif /* CONFIG_SOFTWARE_SUSPEND */ diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index 4d659e97df1..320b6fb00cc 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S @@ -39,12 +39,13 @@ ENTRY(swsusp_arch_suspend) call swsusp_save ret -ENTRY(swsusp_arch_resume) - /* set up cr3 */ - leaq init_level4_pgt(%rip),%rax - subq $__START_KERNEL_map,%rax - movq %rax,%cr3 - +ENTRY(restore_image) + /* switch to temporary page tables */ + movq $__PAGE_OFFSET, %rdx + movq temp_level4_pgt(%rip), %rax + subq %rdx, %rax + movq %rax, %cr3 + /* Flush TLB */ movq mmu_cr4_features(%rip), %rax movq %rax, %rdx andq $~(1<<7), %rdx # PGE @@ -69,6 +70,10 @@ loop: movq pbe_next(%rdx), %rdx jmp loop done: + /* go back to the original page tables */ + leaq init_level4_pgt(%rip), %rax + subq $__START_KERNEL_map, %rax + movq %rax, %cr3 /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax movq %rax, %rdx diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c index cc7821c6885..6449ea8fe75 100644 --- a/arch/x86_64/kernel/sys_x86_64.c +++ b/arch/x86_64/kernel/sys_x86_64.c @@ -154,17 +154,3 @@ asmlinkage long sys_uname(struct new_utsname __user * name) err |= copy_to_user(&name->machine, "i686", 5); return err ? -EFAULT : 0; } - -asmlinkage long sys_time64(long __user * tloc) -{ - struct timeval now; - int i; - - do_gettimeofday(&now); - i = now.tv_sec; - if (tloc) { - if (put_user(i,tloc)) - i = -EFAULT; - } - return i; -} diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 2373cb8b862..fdaddc4e528 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -42,10 +42,6 @@ #include <asm/apic.h> #endif -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - #ifdef CONFIG_CPU_FREQ static void cpufreq_delayed_get(void); #endif @@ -481,9 +477,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) static unsigned int cyc2ns_scale; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +static inline void set_cyc2ns_scale(unsigned long cpu_khz) { - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; + cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) @@ -655,7 +651,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, vxtime.tsc_quot = (1000L << 32) / cpu_khz; } - set_cyc2ns_scale(cpu_khz_ref / 1000); + set_cyc2ns_scale(cpu_khz_ref); return 0; } @@ -939,7 +935,7 @@ void __init time_init(void) rdtscll_sync(&vxtime.last_tsc); setup_irq(0, &irq0); - set_cyc2ns_scale(cpu_khz / 1000); + set_cyc2ns_scale(cpu_khz); #ifndef CONFIG_SMP time_init_gtod(); @@ -959,9 +955,6 @@ static __init int unsynchronized_tsc(void) are handled in the OEM check above. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) return 0; - /* All in a single socket - should be synchronized */ - if (cpus_weight(cpu_core_map[0]) == num_online_cpus()) - return 0; #endif /* Assume multi socket systems are not synchronized */ return num_online_cpus() > 1; @@ -1096,6 +1089,7 @@ static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; static unsigned long PIE_count; static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ +static unsigned int hpet_t1_cmp; /* cached comparator register */ int is_hpet_enabled(void) { @@ -1132,10 +1126,12 @@ int hpet_rtc_timer_init(void) cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; local_irq_restore(flags); cfg = hpet_readl(HPET_T1_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_writel(cfg, HPET_T1_CFG); return 1; @@ -1145,8 +1141,12 @@ static void hpet_rtc_timer_reinit(void) { unsigned int cfg, cnt; - if (!(PIE_on | AIE_on | UIE_on)) + if (unlikely(!(PIE_on | AIE_on | UIE_on))) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); return; + } if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) hpet_rtc_int_freq = PIE_freq; @@ -1154,15 +1154,10 @@ static void hpet_rtc_timer_reinit(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_readl(HPET_T1_CMP); + cnt = hpet_t1_cmp; cnt += hpet_tick*HZ/hpet_rtc_int_freq; hpet_writel(cnt, HPET_T1_CMP); - - cfg = hpet_readl(HPET_T1_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - return; + hpet_t1_cmp = cnt; } /* diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b5e09e6b553..bf337f49318 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -428,19 +428,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { conditional_sti(regs); -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, - regs->rip); - } - } -#endif - if (user_mode(regs)) { struct task_struct *tsk = current; @@ -513,20 +500,6 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, { conditional_sti(regs); -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - oops_in_progress++; - printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); - oops_in_progress--; - } - } -#endif - if (user_mode(regs)) { struct task_struct *tsk = current; @@ -665,19 +638,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, struct task_struct *tsk = current; siginfo_t info; -#ifdef CONFIG_CHECKING - { - /* RED-PEN interaction with debugger - could destroy gs */ - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("debug handler: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, @@ -888,6 +848,10 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 6dd642cad2e..58b19215b4b 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -50,7 +50,7 @@ SECTIONS *(.bss.page_aligned) *(.bss) } - __bss_end = .; + __bss_stop = .; . = ALIGN(PAGE_SIZE); . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index fd99ddd009b..4a54221e10b 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -203,3 +203,6 @@ EXPORT_SYMBOL(flush_tlb_page); #endif EXPORT_SYMBOL(cpu_khz); + +EXPORT_SYMBOL(load_gs_index); + |