From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: dynamic enabling/disabling of function calls This patch adds a feature to dynamically replace the ftrace code with the jmps to allow a kernel with ftrace configured to run as fast as it can without it configured. The way this works, is on bootup (if ftrace is enabled), a ftrace function is registered to record the instruction pointer of all places that call the function. Later, if there's still any code to patch, a kthread is awoken (rate limited to at most once a second) that performs a stop_machine, and replaces all the code that was called with a jmp over the call to ftrace. It only replaces what was found the previous time. Typically the system reaches equilibrium quickly after bootup and there's no code patching needed at all. e.g. call ftrace /* 5 bytes */ is replaced with jmp 3f /* jmp is 2 bytes and we jump 3 forward */ 3: When we want to enable ftrace for function tracing, the IP recording is removed, and stop_machine is called again to replace all the locations of that were recorded back to the call of ftrace. When it is disabled, we replace the code back to the jmp. Allocation is done by the kthread. If the ftrace recording function is called, and we don't have any record slots available, then we simply skip that call. Once a second a new page (if needed) is allocated for recording new ftrace function calls. A large batch is allocated at boot up to get most of the calls there. Because we do this via stop_machine, we don't have to worry about another CPU executing a ftrace call as we modify it. But we do need to worry about NMI's so all functions that might be called via nmi must be annotated with notrace_nmi. When this code is configured in, the NMI code will not call notrace. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..e142091524b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o -- cgit v1.2.3-70-g09d2 From 7fa09f24b477ad41b821713eba757b3aa7a2864a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:32 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for x86/kernel directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the x86/kernel directory. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e142091524b..739d49acd2f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,6 +6,13 @@ extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc_64.o = -pg +CFLAGS_REMOVE_tsc_32.o = -pg +CFLAGS_REMOVE_rtc.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: -- cgit v1.2.3-70-g09d2 From 40bd21740012132afb62d78ac3e6b82372b2fbc2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:44:02 +0200 Subject: x86: automatical unification of i8259.c Make conversion of i8259 very mechanical -- i8259 was generated by --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259.c | 368 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/i8259_32.c | 295 ------------------------------------ arch/x86/kernel/i8259_64.c | 309 ------------------------------------- include/asm-x86/i8259.h | 2 + 5 files changed, 371 insertions(+), 605 deletions(-) create mode 100644 arch/x86/kernel/i8259.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..f71a76ef259 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 00000000000..2decba6b010 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,368 @@ +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#else /* CONFIG_X86_64 */ +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#endif /* ! CONFIG_X86_64 */ +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { +#ifndef CONFIG_X86_64 + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); +#else /* CONFIG_X86_64 */ + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); +#endif /* CONFIG_X86_64 */ + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + .name = "i8259", + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); +#endif /* CONFIG_X86_64 */ + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + +#endif /* CONFIG_X86_64 */ + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index fe631967d62..d66914287ee 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,302 +21,7 @@ #include #include -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index fa57a156850..a95d2bd27e9 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -87,315 +87,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = #undef IRQ #undef IRQLIST_16 -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); - /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG - "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); - /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); - /* 8259A-2 is a slave on master's IR2 */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); - /* (slave's support for AEOI in flat mode is to be investigated) */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); - - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h index 45d4df3e51e..2f98df91f1f 100644 --- a/include/asm-x86/i8259.h +++ b/include/asm-x86/i8259.h @@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port) udelay(2); } +extern struct irq_chip i8259A_chip; + #endif /* __ASM_I8259_H__ */ -- cgit v1.2.3-70-g09d2 From f20b11e716936f85850654c49e06a1f955b9e5fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 15:59:58 +0200 Subject: x86: rename the i8259_32/64.c leftovers to initirq_32/64.c The leftovers of the i8259 unification have nothing to do with i8259 at all. They contain interrupt init code and the i8259_xx name is just misleading now. Rename them to initirq_32/64.c Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259_32.c | 114 ------------------------ arch/x86/kernel/i8259_64.c | 203 ------------------------------------------- arch/x86/kernel/irqinit_32.c | 114 ++++++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 203 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 318 insertions(+), 318 deletions(-) delete mode 100644 arch/x86/kernel/i8259_32.c delete mode 100644 arch/x86/kernel/i8259_64.c create mode 100644 arch/x86/kernel/irqinit_32.c create mode 100644 arch/x86/kernel/irqinit_64.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f71a76ef259..2a53ad2cb45 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c deleted file mode 100644 index d66914287ee..00000000000 --- a/arch/x86/kernel/i8259_32.c +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - extern void math_error(void __user *); - outb(0,0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .mask = CPU_MASK_NONE, - .name = "fpu", -}; - -void __init init_ISA_irqs (void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < 16; i++) { - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); - } - - /* setup after call gates are initialised (usually add in - * the architecture specific gates) - */ - intr_init_hook(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c deleted file mode 100644 index a95d2bd27e9..00000000000 --- a/arch/x86/kernel/i8259_64.c +++ /dev/null @@ -1,203 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -void __init init_ISA_irqs (void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -#endif - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c new file mode 100644 index 00000000000..d66914287ee --- /dev/null +++ b/arch/x86/kernel/irqinit_32.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .mask = CPU_MASK_NONE, + .name = "fpu", +}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < 16; i++) { + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c new file mode 100644 index 00000000000..a95d2bd27e9 --- /dev/null +++ b/arch/x86/kernel/irqinit_64.c @@ -0,0 +1,203 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) + +#undef BUILD_16_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +/* for the irq vectors */ +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { + IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) +}; + +#undef IRQ +#undef IRQLIST_16 + + + + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +void __init init_ISA_irqs (void) +{ + int i; + + init_bsp_APIC(); + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef CONFIG_SMP + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} -- cgit v1.2.3-70-g09d2 From 21fd5132b223a10bdf17713dd0bf321cbd6471d2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:44:02 +0200 Subject: x86: automatical unification of i8259.c Make conversion of i8259 very mechanical -- i8259 was generated by diff -D, with too different parts left in i8259_32 and i8259_64.c. Only "by hand" changes were removal of #ifdef from middle of the comment (prevented compilation) and removal of one static to allow splitting into files. Of course, it will need some cleanups now, and those will follow. Signed-of-by: Pavel Machek --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259.c | 368 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/i8259_32.c | 295 ------------------------------------ arch/x86/kernel/i8259_64.c | 309 ------------------------------------- include/asm-x86/i8259.h | 2 + 5 files changed, 371 insertions(+), 605 deletions(-) create mode 100644 arch/x86/kernel/i8259.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..f71a76ef259 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 00000000000..2decba6b010 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,368 @@ +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#else /* CONFIG_X86_64 */ +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#endif /* ! CONFIG_X86_64 */ +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { +#ifndef CONFIG_X86_64 + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); +#else /* CONFIG_X86_64 */ + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); +#endif /* CONFIG_X86_64 */ + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + .name = "i8259", + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); +#endif /* CONFIG_X86_64 */ + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + +#endif /* CONFIG_X86_64 */ + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index fe631967d62..d66914287ee 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,302 +21,7 @@ #include #include -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index 1870e0e8655..b44095efcf8 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -101,315 +101,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = #undef IRQ #undef IRQLIST_16 -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); - /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG - "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); - /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); - /* 8259A-2 is a slave on master's IR2 */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); - /* (slave's support for AEOI in flat mode is to be investigated) */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); - - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h index 45d4df3e51e..2f98df91f1f 100644 --- a/include/asm-x86/i8259.h +++ b/include/asm-x86/i8259.h @@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port) udelay(2); } +extern struct irq_chip i8259A_chip; + #endif /* __ASM_I8259_H__ */ -- cgit v1.2.3-70-g09d2 From ec42418f1973e1f77da1fcca3be59c0acb878a9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 15:59:58 +0200 Subject: x86: rename the i8259_32/64.c leftovers to irqinit_32/64.c The leftovers of the i8259 unification have nothing to do with i8259 at all. They contain interrupt init code and the i8259_xx name is just misleading now. Rename them to irqinit_32/64.c Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259_32.c | 114 ----------------------- arch/x86/kernel/i8259_64.c | 217 ------------------------------------------- arch/x86/kernel/irqinit_32.c | 114 +++++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 217 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 332 insertions(+), 332 deletions(-) delete mode 100644 arch/x86/kernel/i8259_32.c delete mode 100644 arch/x86/kernel/i8259_64.c create mode 100644 arch/x86/kernel/irqinit_32.c create mode 100644 arch/x86/kernel/irqinit_64.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f71a76ef259..2a53ad2cb45 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c deleted file mode 100644 index d66914287ee..00000000000 --- a/arch/x86/kernel/i8259_32.c +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - extern void math_error(void __user *); - outb(0,0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .mask = CPU_MASK_NONE, - .name = "fpu", -}; - -void __init init_ISA_irqs (void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < 16; i++) { - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); - } - - /* setup after call gates are initialised (usually add in - * the architecture specific gates) - */ - intr_init_hook(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c deleted file mode 100644 index 31f49e8f46a..00000000000 --- a/arch/x86/kernel/i8259_64.c +++ /dev/null @@ -1,217 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -static void __init init_ISA_irqs (void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -#endif - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c new file mode 100644 index 00000000000..d66914287ee --- /dev/null +++ b/arch/x86/kernel/irqinit_32.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .mask = CPU_MASK_NONE, + .name = "fpu", +}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < 16; i++) { + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c new file mode 100644 index 00000000000..31f49e8f46a --- /dev/null +++ b/arch/x86/kernel/irqinit_64.c @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define IRQ_NAME2(nr) nr##_interrupt(void) +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +/* + * SMP has a few special interrupts for IPI messages + */ + +#define BUILD_IRQ(nr) \ + asmlinkage void IRQ_NAME(nr); \ + asm("\n.p2align\n" \ + "IRQ" #nr "_interrupt:\n\t" \ + "push $~(" #nr ") ; " \ + "jmp common_interrupt"); + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) + +#undef BUILD_16_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +/* for the irq vectors */ +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { + IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) +}; + +#undef IRQ +#undef IRQLIST_16 + + + + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +static void __init init_ISA_irqs (void) +{ + int i; + + init_bsp_APIC(); + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef CONFIG_SMP + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} -- cgit v1.2.3-70-g09d2 From b79cd8f1268bab57ff85b19d131f7f23deab2dee Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 11 May 2008 00:30:15 -0700 Subject: x86: make e820.c to have common functions remove the duplicated copy of these functions. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 475 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 411 +-------------------------------------- arch/x86/kernel/e820_64.c | 444 ------------------------------------------ arch/x86/kernel/setup_32.c | 2 +- include/asm-x86/e820.h | 14 ++ include/asm-x86/e820_32.h | 12 -- include/asm-x86/e820_64.h | 12 -- 8 files changed, 496 insertions(+), 876 deletions(-) create mode 100644 arch/x86/kernel/e820.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..5369a4e36f1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o -obj-y += bootflag.o e820_$(BITS).o +obj-y += bootflag.o e820_$(BITS).o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c new file mode 100644 index 00000000000..2cb686f60d0 --- /dev/null +++ b/arch/x86/kernel/e820.c @@ -0,0 +1,475 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * Venkatesh Pallipadi + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct e820map e820; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* + * if start is now at or beyond end, we're done, full + * coverage + */ + if (start >= end) + return 1; + } + return 0; +} + +/* + * Add a memory region to the kernel e820 map. + */ +void __init add_memory_region(u64 start, u64 size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); + switch (e820.map[i].type) { + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)\n"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)\n"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820MAX] __initdata; + static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following + (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i = 0; i < old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i = 0; i < 2 * old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i = 0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing = 1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + + /* loop through change-points, determining affect on the new bios map */ + for (chgidx = 0; chgidx < chg_nr; chgidx++) { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* + * continue building up new bios map based on this + * information + */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* + * move forward only if the new size + * was non-zero + */ + if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ + if (++new_bios_entry >= E820MAX) + break; + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr = change_point[chgidx]->addr; + } + last_type = current_type; + } + } + /* retain count for new bios entries */ + new_nr = new_bios_entry; + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +int __init copy_e820_map(struct e820entry *biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + add_memory_region(start, size, type); + } while (biosmap++, --nr_map); + return 0; +} + +u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + if (ei->type != old_type) + continue; + /* totally covered? */ + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + add_memory_region(final_start, final_end - final_start, + new_type); + real_updated_size += final_end - final_start; + } + return real_updated_size; +} + +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + int found = 0; + + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + +#ifdef CONFIG_X86_64 + if (!found) { + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); + } +#endif + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} + diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 751d517d802..dfe25751038 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -16,21 +16,6 @@ #include #include -struct e820map e820; -struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ -}; -static struct change_member change_point_list[2*E820MAX] __initdata; -static struct change_member *change_point[2*E820MAX] __initdata; -static struct e820entry *overlap_list[E820MAX] __initdata; -static struct e820entry new_bios[E820MAX] __initdata; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - static struct resource system_rom_resource = { .name = "System ROM", .start = 0xf0000, @@ -254,223 +239,6 @@ void __init e820_mark_nosave_regions(void) } #endif -void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) -{ - int x; - - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} /* add_memory_region */ - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) { - return -1; - } - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; iaddr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; /* true number of change-points */ - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if > , swap */ - /* or, if current= & last=, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; ipbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; itype > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -int __init copy_e820_map(struct e820entry *biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - u64 start = biosmap->addr; - u64 size = biosmap->size; - u64 end = start + size; - u32 type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - add_memory_region(start, size, type); - } while (biosmap++, --nr_map); - - return 0; -} - /* * Find the highest page frame number we have available */ @@ -535,86 +303,12 @@ void __init register_bootmem_low_pages(unsigned long max_low_pfn) } } -void __init e820_register_memory(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; - - /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. - */ - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - } - } - if (start < last) - last = start; - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -} - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; - } - } -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; int i; - print_memory_map("limit_regions start"); + e820_print_map("limit_regions start"); for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -633,62 +327,10 @@ void __init limit_regions(unsigned long long size) e820.nr_map = i + 1; e820.map[i].size -= current_addr - size; } - print_memory_map("limit_regions endfor"); + e820_print_map("limit_regions endfor"); return; } - print_memory_map("limit_regions endfunc"); -} - -/* - * This function checks if any part of the range is mapped - * with type. - */ -int -e820_any_mapped(u64 start, u64 end, unsigned type) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - const struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - - /* - * This function checks if the entire range is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init -e820_all_mapped(unsigned long s, unsigned long e, unsigned type) -{ - u64 start = s; - u64 end = e; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full - * coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; + e820_print_map("limit_regions endfunc"); } /* Overridden in paravirt.c if CONFIG_PARAVIRT */ @@ -700,7 +342,7 @@ char * __init __attribute__((weak)) memory_setup(void) void __init setup_memory_map(void) { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); + e820_print_map(memory_setup()); } static int __initdata user_defined_memmap; @@ -783,55 +425,12 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); -u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) -{ - int i; - u64 real_updated_size = 0; - - BUG_ON(old_type == new_type); - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 final_start, final_end; - if (ei->type != old_type) - continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { - ei->type = new_type; - real_updated_size += ei->size; - continue; - } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); - if (final_start >= final_end) - continue; - add_memory_region(final_start, final_end - final_start, - new_type); - real_updated_size += final_end - final_start; - } - - return real_updated_size; -} void __init finish_e820_parsing(void) { if (user_defined_memmap) { printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); + e820_print_map("user"); } } -void __init update_e820(void) -{ - u8 nr_map; - - nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) - return; - e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); - print_memory_map("modified"); -} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index c45b4dea405..28a14eb65d3 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -29,8 +29,6 @@ #include #include -struct e820map e820; - /* * PFN of last memory page. */ @@ -176,62 +174,6 @@ again: } return changed; } -/* - * This function checks if any part of the range is mapped - * with type. - */ -int -e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - -/* - * This function checks if the entire range is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init e820_all_mapped(unsigned long start, unsigned long end, - unsigned type) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* - * if start is now at or beyond end, we're done, full - * coverage - */ - if (start >= end) - return 1; - } - return 0; -} /* * Find a free area with specified alignment in a specific range. @@ -434,24 +376,6 @@ e820_register_active_regions(int nid, unsigned long start_pfn, add_active_range(nid, ei_startpfn, ei_endpfn); } -/* - * Add a memory region to the kernel e820 map. - */ -void __init add_memory_region(unsigned long start, unsigned long size, int type) -{ - int x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} - /* * Find the hole size (in bytes) in the memory range. * @start: starting address of the memory range to scan @@ -473,266 +397,6 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) return end - start - (ram << PAGE_SHIFT); } -static void __init e820_print_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) - (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: - printk(KERN_CONT "(usable)\n"); - break; - case E820_RESERVED: - printk(KERN_CONT "(reserved)\n"); - break; - case E820_ACPI: - printk(KERN_CONT "(ACPI data)\n"); - break; - case E820_NVS: - printk(KERN_CONT "(ACPI NVS)\n"); - break; - default: - printk(KERN_CONT "type %u\n", e820.map[i].type); - break; - } - } -} - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) -{ - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following - (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i = 0; i < old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) - return -1; - - /* create pointers for initial change-point information (for sorting) */ - for (i = 0; i < 2 * old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i = 0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + - biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i = 1; i < chg_nr; i++) { - unsigned long long curaddr, lastaddr; - unsigned long long curpbaddr, lastpbaddr; - - curaddr = change_point[i]->addr; - lastaddr = change_point[i - 1]->addr; - curpbaddr = change_point[i]->pbios->addr; - lastpbaddr = change_point[i - 1]->pbios->addr; - - /* - * swap entries, when: - * - * curaddr > lastaddr or - * curaddr == lastaddr and curaddr == curpbaddr and - * lastaddr != lastpbaddr - */ - if (curaddr < lastaddr || - (curaddr == lastaddr && curaddr == curpbaddr && - lastaddr != lastpbaddr)) { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing = 1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries = 0; /* number of entries in the overlap table */ - new_bios_entry = 0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - - /* loop through change-points, determining affect on the new bios map */ - for (chgidx = 0; chgidx < chg_nr; chgidx++) { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == - change_point[chgidx]->pbios->addr) { - /* - * add map entry to overlap list (> 1 entry - * implies an overlap) - */ - overlap_list[overlap_entries++] = - change_point[chgidx]->pbios; - } else { - /* - * remove entry from list (order independent, - * so swap with last) - */ - for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == - change_point[chgidx]->pbios) - overlap_list[i] = - overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* - * if there are overlapping entries, decide which - * "type" to use (larger value takes precedence -- - * 1=usable, 2,3,4,4+=unusable) - */ - current_type = 0; - for (i = 0; i < overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* - * continue building up new bios map based on this - * information - */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* - * move forward only if the new size - * was non-zero - */ - if (new_bios[new_bios_entry].size != 0) - /* - * no more space left for new - * bios entries ? - */ - if (++new_bios_entry >= E820MAX) - break; - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = - change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr = change_point[chgidx]->addr; - } - last_type = current_type; - } - } - /* retain count for new bios entries */ - new_nr = new_bios_entry; - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - */ -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - u64 start = biosmap->addr; - u64 size = biosmap->size; - u64 end = start + size; - u32 type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - add_memory_region(start, size, type); - } while (biosmap++, --nr_map); - return 0; -} - static void early_panic(char *msg) { early_printk(msg); @@ -829,114 +493,6 @@ void __init finish_e820_parsing(void) } } -u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) -{ - int i; - u64 real_updated_size = 0; - - BUG_ON(old_type == new_type); - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 final_start, final_end; - if (ei->type != old_type) - continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { - ei->type = new_type; - real_updated_size += ei->size; - continue; - } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); - if (final_start >= final_end) - continue; - add_memory_region(final_start, final_end - final_start, - new_type); - real_updated_size += final_end - final_start; - } - return real_updated_size; -} - -void __init update_e820(void) -{ - u8 nr_map; - - nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) - return; - e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); - e820_print_map("modified"); -} - -unsigned long pci_mem_start = 0xaeedbabe; -EXPORT_SYMBOL(pci_mem_start); - -/* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. - * Hopefully the BIOS let enough space left. - */ -__init void e820_setup_gap(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long last; - int i; - int found = 0; - - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - found = 1; - } - } - if (start < last) - last = start; - } - - if (!found) { - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk(KERN_INFO - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); -} - int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) { int i; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index b54c79c91ef..5faeab69edd 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -875,7 +875,7 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif - e820_register_memory(); + e820_setup_gap(); e820_mark_nosave_regions(); #ifdef CONFIG_VT diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 7004251fc66..b5b519feba1 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -20,6 +20,20 @@ struct e820map { __u32 nr_map; struct e820entry map[E820MAX]; }; + +extern struct e820map e820; + +extern int e820_any_mapped(u64 start, u64 end, unsigned type); +extern int e820_all_mapped(u64 start, u64 end, unsigned type); +extern void add_memory_region(u64 start, u64 size, int type); +extern void e820_print_map(char *who); +extern int sanitize_e820_map(struct e820entry *biosmap, char *pnr_map); +extern int copy_e820_map(struct e820entry *biosmap, int nr_map); +extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type); +extern void update_e820(void); +extern void e820_setup_gap(void); + #endif /* __ASSEMBLY__ */ #define ISA_START_ADDRESS 0xa0000 diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index af0711b220d..9576b438fbd 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -21,19 +21,8 @@ extern void setup_memory_map(void); extern void finish_e820_parsing(void); -extern struct e820map e820; -extern void update_e820(void); - -extern int e820_all_mapped(unsigned long start, unsigned long end, - unsigned type); -extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern void propagate_e820_map(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); -extern void add_memory_region(unsigned long long start, - unsigned long long size, int type); -extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type); -extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, @@ -47,6 +36,5 @@ static inline void e820_mark_nosave_regions(void) } #endif - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 6ae3e280328..9fac77e0144 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -19,34 +19,22 @@ extern unsigned long find_e820_area(unsigned long start, unsigned long end, extern unsigned long find_e820_area_size(unsigned long start, unsigned long *sizep, unsigned long align); -extern void add_memory_region(unsigned long start, unsigned long size, - int type); -extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type); extern void setup_memory_region(void); extern void contig_e820_setup(void); extern unsigned long e820_end_of_ram(void); extern void e820_reserve_resources(void); extern void e820_mark_nosave_regions(void); -extern int e820_any_mapped(unsigned long start, unsigned long end, - unsigned type); -extern int e820_all_mapped(unsigned long start, unsigned long end, - unsigned type); extern int e820_any_non_reserved(unsigned long start, unsigned long end); extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); extern int is_memory_all_valid(unsigned long start, unsigned long end); extern unsigned long e820_hole_size(unsigned long start, unsigned long end); -extern void e820_setup_gap(void); extern void e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); extern void finish_e820_parsing(void); -extern struct e820map e820; -extern void update_e820(void); - extern void reserve_early(unsigned long start, unsigned long end, char *name); extern void free_early(unsigned long start, unsigned long end); extern void early_res_to_bootmem(unsigned long start, unsigned long end); -- cgit v1.2.3-70-g09d2 From 1798bc22b2790bf2a956588e6b17c36ef79ceff7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:41 +0400 Subject: x86: nmi_32/64.c - merge down nmi_32.c and nmi_64.c to nmi.c Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/nmi.c | 533 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/nmi_32.c | 506 -------------------------------------------- arch/x86/kernel/nmi_64.c | 477 ------------------------------------------ 4 files changed, 534 insertions(+), 984 deletions(-) create mode 100644 arch/x86/kernel/nmi.c delete mode 100644 arch/x86/kernel/nmi_32.c delete mode 100644 arch/x86/kernel/nmi_64.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..d5dd6668225 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o +obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 00000000000..69a839fc1eb --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,533 @@ +/* + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +int unknown_nmi_panic; +int nmi_watchdog_enabled; + +static cpumask_t backtrace_mask = CPU_MASK_NONE; + +/* nmi_active: + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled + */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +static int panic_on_timeout; + +unsigned int nmi_watchdog = NMI_DEFAULT; + +static unsigned int nmi_hz = HZ; +static DEFINE_PER_CPU(short, wd_enabled); +static int endflag __initdata = 0; + +static inline unsigned int get_nmi_count(int cpu) +{ +#ifdef CONFIG_X86_64 + return cpu_pda(cpu)->__nmi_count; +#else + return nmi_count(cpu); +#endif +} + +static inline int mce_in_progress(void) +{ +#if defined(CONFIX_X86_64) && defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ +#ifdef CONFIG_X86_64 + return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); +#else + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +#endif +} + +/* Run after command line and cpu_init init, but before all other checks */ +void nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; +#ifdef CONFIG_X86_64 + nmi_watchdog = NMI_NONE; +#else + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; +#endif +} + +#ifdef CONFIG_SMP +/* + * The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ +static __init void nmi_cpu_busy(void *data) +{ + local_irq_enable_in_hardirq(); + /* + * Intentionally don't use cpu_relax here. This is + * to make sure that the performance counter really ticks, + * even if there is a simulator or similar that catches the + * pause instruction. On a real HT machine this is fine because + * all other CPUs are busy with "useless" delay loops and don't + * care if they get somewhat less cycles. + */ + while (endflag == 0) + mb(); +} +#endif + +int __init check_nmi_watchdog(void) +{ + unsigned int *prev_nmi_count; + int cpu; + + if (nmi_watchdog == NMI_NONE || nmi_watchdog == NMI_DISABLED) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + + prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) + goto error; + + printk(KERN_INFO "Testing NMI watchdog ... "); + +#ifdef CONFIG_SMP + if (nmi_watchdog == NMI_LOCAL_APIC) + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); +#endif + + for_each_possible_cpu(cpu) + prev_nmi_count[cpu] = get_nmi_count(cpu); + local_irq_enable(); + mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ + + for_each_online_cpu(cpu) { + if (!per_cpu(wd_enabled, cpu)) + continue; + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + printk(KERN_WARNING "WARNING: CPU#%d: NMI " + "appears to be stuck (%d->%d)!\n", + cpu, + prev_nmi_count[cpu], + get_nmi_count(cpu)); + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); + } + } + endflag = 1; + if (!atomic_read(&nmi_active)) { + kfree(prev_nmi_count); + atomic_set(&nmi_active, -1); + goto error; + } + printk("OK.\n"); + + /* + * now that we know it works we can reduce NMI frequency to + * something more reasonable; makes a difference in some configs + */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); + + kfree(prev_nmi_count); + return 0; + +error: +#ifdef CONFIG_X86_32 + timer_ack = !cpu_has_tsc; +#endif + return -1; +} + +static int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + + get_option(&str, &nmi); + + if (nmi >= NMI_INVALID || nmi < NMI_NONE) + return 0; + + nmi_watchdog = nmi; + return 1; +} +__setup("nmi_watchdog=", setup_nmi_watchdog); + +/* + * Suspend/resume support + */ +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } + return 0; +} + +static struct sysdev_class nmi_sysclass = { + .name = "lapic_nmi", + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + /* + * should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if (atomic_read(&nmi_active) < 0) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} + +/* must come after the local APIC's device_initcall() */ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +void setup_apic_nmi_watchdog(void *unused) +{ + if (__get_cpu_var(wd_enabled)) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) + return; + + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + /* enable it before to avoid race with handler */ + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; + return; + } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + } +} + +void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if (nmi_watchdog != NMI_LOCAL_APIC && + nmi_watchdog != NMI_IO_APIC) + return; + if (__get_cpu_var(wd_enabled) == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; + atomic_dec(&nmi_active); +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up here too!] + */ + +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); + +void touch_nmi_watchdog(void) +{ + if (nmi_watchdog > 0) { + unsigned cpu; + + /* + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. + */ + for_each_present_cpu(cpu) { + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; + } + } + + /* + * Tickle the softlockup detector too: + */ + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +notrace __kprobes int +nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) +{ + /* + * Since current_thread_info()-> is always on the stack, and we + * always switch the stack NMI-atomically, it's safe to use + * smp_processor_id(). + */ + unsigned int sum; + int touched = 0; + int cpu = smp_processor_id(); + int rc = 0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } + + sum = get_timer_irqs(cpu); + + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + touched = 1; + + /* if the none of the timers isn't firing, this cpu isn't doing much */ + if (!touched && __get_cpu_var(last_irq_sum) == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + /* + * die_nmi will return ONLY if NOTIFY_STOP happens.. + */ + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); + } else { + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); + } + + /* see if the nmi watchdog went off */ + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* + * don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; + } + return rc; +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf, regs, 1); /* Always panic here */ + return 0; +} + +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { + printk(KERN_WARNING + "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else { + printk(KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ + +int do_nmi_callback(struct pt_regs *regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); + diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c deleted file mode 100644 index a94dbedf2b7..00000000000 --- a/arch/x86/kernel/nmi_32.c +++ /dev/null @@ -1,506 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mach_traps.h" - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -static cpumask_t backtrace_mask = CPU_MASK_NONE; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; - -static DEFINE_PER_CPU(short, wd_enabled); - -static int endflag __initdata = 0; - -static inline unsigned int get_nmi_count(int cpu) -{ - return nmi_count(cpu); -} - -static inline int mce_in_progress(void) -{ - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; -} - -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; -} - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (endflag == 0) - mb(); -} -#endif - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) - return 0; - - if (!atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - goto error; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - goto error; - } - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -error: - timer_ack = !cpu_has_tsc; - - return -1; -} - -static int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) - return 0; - - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - - -/* Suspend/resume support */ - -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up - * here too!] - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog > 0) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - /* - * Tickle the softlockup detector too: - */ - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - - /* - * Since current_thread_info()-> is always on the stack, and we - * always switch the stack NMI-atomically, it's safe to use - * smp_processor_id(). - */ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); - spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); - } - - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - touched = 1; - - /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { - printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else { - printk( KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void __trigger_all_cpu_backtrace(void) -{ - int i; - - backtrace_mask = cpu_online_map; - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) - break; - mdelay(1); - } -} - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c deleted file mode 100644 index c6802447262..00000000000 --- a/arch/x86/kernel/nmi_64.c +++ /dev/null @@ -1,477 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -static cpumask_t backtrace_mask = CPU_MASK_NONE; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; - -static DEFINE_PER_CPU(short, wd_enabled); - -static int endflag __initdata = 0; - -static inline unsigned int get_nmi_count(int cpu) -{ - return cpu_pda(cpu)->__nmi_count; -} - -static inline int mce_in_progress(void) -{ -#ifdef CONFIG_X86_MCE - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); -} - -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - nmi_watchdog = NMI_NONE; -} - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (endflag == 0) - mb(); -} -#endif - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) - return 0; - - if (!atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - return -1; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - return -1; - } - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -} - -static int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str,"panic",5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) - return 0; - - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - __get_cpu_var(wd_enabled) = 1; - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog > 0) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); - spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); - } - - if (mce_in_progress()) - touched = 1; - - /* if the apic timer isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { - printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else { - printk( KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void __trigger_all_cpu_backtrace(void) -{ - int i; - - backtrace_mask = cpu_online_map; - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) - break; - mdelay(1); - } -} - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); -- cgit v1.2.3-70-g09d2 From d44b9d17faf7bca165ce73a1acb936b65a3f0cc6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 3 Jun 2008 13:06:07 -0700 Subject: x86: move bugs_64.c to cpu/bugs_64.c It looks good to move bugs_64.c to cpu/bugs_64.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/bugs_64.c | 33 --------------------------------- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/bugs_64.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 34 deletions(-) delete mode 100644 arch/x86/kernel/bugs_64.c create mode 100644 arch/x86/kernel/cpu/bugs_64.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..4934fec38c4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,6 @@ obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o obj-y += bootflag.o e820_$(BITS).o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o -obj-$(CONFIG_X86_64) += bugs_64.o obj-y += tsc_$(BITS).o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c deleted file mode 100644 index 9a3ed0649d4..00000000000 --- a/arch/x86/kernel/bugs_64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include -#include -#include -#include -#include -#include -#include - -void __init check_bugs(void) -{ - identify_boot_cpu(); -#if !defined(CONFIG_SMP) - printk("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. - */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c77a1c50d94..65b1be5fe9c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -6,6 +6,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o feature_names.o obj-$(CONFIG_X86_32) += common.o bugs.o +obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_X86_32) += amd.o obj-$(CONFIG_X86_64) += amd_64.o obj-$(CONFIG_X86_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c new file mode 100644 index 00000000000..9a3ed0649d4 --- /dev/null +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -0,0 +1,33 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2000 SuSE + */ + +#include +#include +#include +#include +#include +#include +#include + +void __init check_bugs(void) +{ + identify_boot_cpu(); +#if !defined(CONFIG_SMP) + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +} -- cgit v1.2.3-70-g09d2 From 0c51a965ed3c44dd50497e74492a015680e49899 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Mon, 2 Jun 2008 14:26:23 +0800 Subject: x86: extract common part of head32.c and head64.c into head.c This patch extracts the common part of head32.c and head64.c into head.c. Signed-off-by: Huang Ying Cc: andi@firstfloor.org Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/Makefile | 1 + arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/head.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/head32.c | 50 ----------------------------------------- arch/x86/kernel/head64.c | 50 ----------------------------------------- include/asm-x86/bios_ebda.h | 2 ++ 6 files changed, 59 insertions(+), 101 deletions(-) create mode 100644 arch/x86/kernel/head.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3cff3c894cf..30b046bcccb 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -160,6 +160,7 @@ KBUILD_AFLAGS += $(mflags-y) head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o +head-y += arch/x86/kernel/head.o head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369a4e36f1..7e01ce39c83 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds +extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c new file mode 100644 index 00000000000..e0d0ce58979 --- /dev/null +++ b/arch/x86/kernel/head.c @@ -0,0 +1,55 @@ +#include +#include + +#include +#include + +#define BIOS_LOWMEM_KILOBYTES 0x413 + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + */ +void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... */ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + reserve_early(lowmem, 0x100000, "BIOS reserved"); +} diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 0b6cb9fba71..fa1d25dd83e 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -13,56 +13,6 @@ #include #include -#define BIOS_LOWMEM_KILOBYTES 0x413 - -/* - * The BIOS places the EBDA/XBDA at the top of conventional - * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. - */ -static void __init reserve_ebda_region(void) -{ - unsigned int lowmem, ebda_addr; - - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ - if (paravirt_enabled()) - return; - - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; - - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; - - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; - - /* reserve all memory between lowmem and the 1MB mark */ - reserve_early(lowmem, 0x100000, "BIOS reserved"); -} - void __init i386_start_kernel(void) { reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b8aa8..f1773c8ee21 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -51,56 +51,6 @@ static void __init copy_bootdata(char *real_mode_data) } } -#define BIOS_LOWMEM_KILOBYTES 0x413 - -/* - * The BIOS places the EBDA/XBDA at the top of conventional - * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. - */ -static void __init reserve_ebda_region(void) -{ - unsigned int lowmem, ebda_addr; - - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ - if (paravirt_enabled()) - return; - - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; - - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; - - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; - - /* reserve all memory between lowmem and the 1MB mark */ - reserve_early(lowmem, 0x100000, "BIOS reserved"); -} - static void __init reserve_setup_data(void) { struct setup_data *data; diff --git a/include/asm-x86/bios_ebda.h b/include/asm-x86/bios_ebda.h index b4a46b7be79..0033e50c13b 100644 --- a/include/asm-x86/bios_ebda.h +++ b/include/asm-x86/bios_ebda.h @@ -14,4 +14,6 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } +void reserve_ebda_region(void); + #endif /* _MACH_BIOS_EBDA_H */ -- cgit v1.2.3-70-g09d2 From ca7ed057ae25e5e60814f950995f22f051d2e449 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Jun 2008 21:27:42 +0200 Subject: x86, AMD IOMMU: add amd_iommu_init.c to Makefile This patch adds the source file amd_iommu_init.c to the kernel Makefile for the x86 architecture. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 77807d4769c..1e4e00aca18 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,6 +100,7 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o + obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o -- cgit v1.2.3-70-g09d2 From 000fca2dfcfbd2859fed1208c38c899ab4873049 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Jun 2008 21:27:54 +0200 Subject: x86, AMD IOMMU: add amd_iommu.c to Makefile This patch adds the new amd_iommu.c file to the architecture kernel Makefile for x86. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1e4e00aca18..b34e0bb18b6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,7 +100,7 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o - obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o + obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o -- cgit v1.2.3-70-g09d2 From 064d25f12014ae1d97c2882f9ab874995321f2b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 19:58:28 -0700 Subject: x86: merge setup_memory_map with e820 ... and kill e820_32/64.c and e820_32/64.h Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 72 +++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 29 -------------- arch/x86/kernel/e820_64.c | 92 ------------------------------------------- arch/x86/kernel/setup_64.c | 8 +--- arch/x86/mach-default/setup.c | 47 ---------------------- arch/x86/mm/init_64.c | 12 ++++++ include/asm-x86/e820.h | 9 +++-- include/asm-x86/e820_32.h | 24 ----------- include/asm-x86/e820_64.h | 26 ------------ include/asm-x86/setup.h | 5 --- 11 files changed, 92 insertions(+), 234 deletions(-) delete mode 100644 arch/x86/kernel/e820_32.c delete mode 100644 arch/x86/kernel/e820_64.c delete mode 100644 include/asm-x86/e820_32.h delete mode 100644 include/asm-x86/e820_64.h (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index dc3c636d113..bcc2b123dab 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o -obj-y += bootflag.o e820_$(BITS).o e820.o +obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 432c4917857..49477484a2f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1029,4 +1029,76 @@ void __init e820_reserve_resources(void) } } +char *__init __attribute__((weak)) machine_specific_memory_setup(void) +{ + char *who = "BIOS-e820"; + int new_nr; + /* + * Try to copy the BIOS-supplied E820-map. + * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + new_nr = boot_params.e820_entries; + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &new_nr); + boot_params.e820_entries = new_nr; + if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { +#ifdef CONFIG_X86_64 + early_panic("Cannot find a valid memory map"); +#else + unsigned long mem_size; + + /* compare results from other methods and take the greater */ + if (boot_params.alt_mem_k + < boot_params.screen_info.ext_mem_k) { + mem_size = boot_params.screen_info.ext_mem_k; + who = "BIOS-88"; + } else { + mem_size = boot_params.alt_mem_k; + who = "BIOS-e801"; + } + + e820.nr_map = 0; + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); +#endif + } + + /* In case someone cares... */ + return who; +} + +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __init __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + +void __init setup_memory_map(void) +{ + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + e820_print_map(memory_setup()); +} + +#ifdef CONFIG_X86_64 +int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) +{ + int i; + if (slot < 0 || slot >= e820.nr_map) + return -1; + for (i = slot; i < e820.nr_map; i++) { + if (e820.map[i].type != E820_RAM) + continue; + break; + } + if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) + return -1; + *addr = e820.map[i].addr; + *size = min_t(u64, e820.map[i].size + e820.map[i].addr, + max_pfn << PAGE_SHIFT) - *addr; + return i + 1; +} +#endif diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c deleted file mode 100644 index 1205ccf086d..00000000000 --- a/arch/x86/kernel/e820_32.c +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - -void __init setup_memory_map(void) -{ - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(memory_setup()); -} - diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c deleted file mode 100644 index cb03bff9fa2..00000000000 --- a/arch/x86/kernel/e820_64.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Handle the memory map. - * The functions here do the job until bootmem takes over. - * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach , December 2002. - * Venkatesh Pallipadi - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * PFN of last memory page. - */ -unsigned long end_pfn; - -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_pfn_mapped; - -static void early_panic(char *msg) -{ - early_printk(msg); - panic(msg); -} - -/* We're not void only for x86 32-bit compat */ -char *__init machine_specific_memory_setup(void) -{ - char *who = "BIOS-e820"; - int new_nr; - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) - early_panic("Cannot find a valid memory map"); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); - - /* In case someone cares... */ - return who; -} - -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) -{ - int i; - - if (slot < 0 || slot >= e820.nr_map) - return -1; - for (i = slot; i < e820.nr_map; i++) { - if (e820.map[i].type != E820_RAM) - continue; - break; - } - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) - return -1; - *addr = e820.map[i].addr; - *size = min_t(u64, e820.map[i].size + e820.map[i].addr, - max_pfn << PAGE_SHIFT) - *addr; - return i + 1; -} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index ec65696ca2c..3220c7b56eb 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -266,12 +266,6 @@ static inline void __init reserve_crashkernel(void) {} #endif -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -void __attribute__((weak)) __init memory_setup(void) -{ - machine_specific_memory_setup(); -} - #ifdef CONFIG_PCI_MMCONFIG extern void __cpuinit fam10h_check_enable_mmcfg(void); extern void __init check_enable_amd_mmconf_dmi(void); @@ -316,7 +310,7 @@ void __init setup_arch(char **cmdline_p) ARCH_SETUP - memory_setup(); + setup_memory_map(); copy_edd(); if (!boot_params.hdr.root_flags) diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 7ae692a7676..2f5e277686b 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -142,50 +142,3 @@ static int __init print_ipi_mode(void) late_initcall(print_ipi_mode); -/** - * machine_specific_memory_setup - Hook for machine specific memory setup. - * - * Description: - * This is included late in kernel/setup.c so that it can make - * use of all of the static functions. - **/ - -char * __init machine_specific_memory_setup(void) -{ - char *who; - int new_nr; - - - who = "BIOS-e820"; - - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { - mem_size = boot_params.screen_info.ext_mem_k; - who = "BIOS-88"; - } else { - mem_size = boot_params.alt_mem_k; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } - return who; -} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b8c2c1ef7ad..5266f3141d6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -47,6 +47,18 @@ #include #include +/* + * PFN of last memory page. + */ +unsigned long end_pfn; + +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_pfn_mapped; + static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index e860fe758e7..83144cb6c68 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -101,6 +101,9 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn, extern u64 e820_hole_size(u64 start, u64 end); extern void finish_e820_parsing(void); extern void e820_reserve_resources(void); +extern void setup_memory_map(void); +extern char *machine_specific_memory_setup(void); +extern char *memory_setup(void); #endif /* __ASSEMBLY__ */ @@ -111,10 +114,10 @@ extern void e820_reserve_resources(void); #define BIOS_END 0x00100000 #ifdef __KERNEL__ +#include + #ifdef CONFIG_X86_32 -# include "e820_32.h" -#else -# include "e820_64.h" +#define HIGH_MEMORY (1024*1024) #endif #endif /* __KERNEL__ */ diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h deleted file mode 100644 index 557b890549f..00000000000 --- a/include/asm-x86/e820_32.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * structures and definitions for the int 15, ax=e820 memory map - * scheme. - * - * In a nutshell, arch/i386/boot/setup.S populates a scratch table - * in the empty_zero_block that contains a list of usable address/size - * duples. In arch/i386/kernel/setup.c, this information is - * transferred into the e820map, and in arch/i386/mm/init.c, that - * new information is used to mark pages reserved or not. - * - */ -#ifndef __E820_HEADER -#define __E820_HEADER - -#include - -#define HIGH_MEMORY (1024*1024) - -#ifndef __ASSEMBLY__ - -extern void setup_memory_map(void); - -#endif/*!__ASSEMBLY__*/ -#endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h deleted file mode 100644 index 8d992109969..00000000000 --- a/include/asm-x86/e820_64.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * structures and definitions for the int 15, ax=e820 memory map - * scheme. - * - * In a nutshell, setup.S populates a scratch table in the - * empty_zero_block that contains a list of usable address/size - * duples. setup.c, this information is transferred into the e820map, - * and in init.c/numa.c, that new information is used to mark pages - * reserved or not. - */ -#ifndef __E820_HEADER -#define __E820_HEADER - -#include - -#ifndef __ASSEMBLY__ -extern void setup_memory_region(void); -extern void contig_e820_setup(void); -extern int e820_any_non_reserved(unsigned long start, unsigned long end); -extern int is_memory_any_valid(unsigned long start, unsigned long end); -extern int e820_all_non_reserved(unsigned long start, unsigned long end); -extern int is_memory_all_valid(unsigned long start, unsigned long end); - -#endif/*!__ASSEMBLY__*/ - -#endif/*__E820_HEADER*/ diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 9e163fc3e98..b676b0be798 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -8,7 +8,6 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); -char *machine_specific_memory_setup(void); #ifndef CONFIG_PARAVIRT #define paravirt_post_allocator_init() do {} while (0) #endif @@ -50,10 +49,6 @@ extern struct boot_params boot_params; */ #define LOWMEMSIZE() (0x9f000) -char * __init machine_specific_memory_setup(void); -char *memory_setup(void); - - void __init i386_start_kernel(void); extern unsigned long init_pg_tables_start; -- cgit v1.2.3-70-g09d2 From 1812924bb1823950c1dc95c478b71b037057356e Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Mon, 2 Jun 2008 08:56:14 -0500 Subject: x86, SGI UV: TLB shootdown using broadcast assist unit TLB shootdown for SGI UV. Depends on patch (in tip/x86/irq): x86-update-macros-used-by-uv-platform.patch Jack Steiner May 29 This patch provides the ability to flush TLB's in cpu's that are not on the local node. The hardware mechanism for distributing the flush messages is the UV's "broadcast assist unit". The hook to intercept TLB shootdown requests is a 2-line change to native_flush_tlb_others() (arch/x86/kernel/tlb_64.c). This code has been tested on a hardware simulator. The real hardware is not yet available. The shootdown statistics are provided through /proc/sgi_uv/ptc_statistics. The use of /sys was considered, but would have required the use of many /sys files. The debugfs was also considered, but these statistics should be available on an ongoing basis, not just for debugging. Issues to be fixed later: - The IRQ for the messaging interrupt is currently hardcoded as 200 (see UV_BAU_MESSAGE). It should be dynamically assigned in the future. - The use of appropriate udelay()'s is untested, as they are a problem in the simulator. Signed-off-by: Cliff Wickman Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/entry_64.S | 4 + arch/x86/kernel/tlb_64.c | 5 + arch/x86/kernel/tlb_uv.c | 736 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/uv/uv_bau.h | 331 ++++++++++++++++++++ 5 files changed, 1077 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/tlb_uv.c create mode 100644 include/asm-x86/uv/uv_bau.h (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 605995e11ea..4078e98f012 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_OLPC) += olpc.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o + obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df522a..6fd1987466e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -720,6 +720,10 @@ ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt END(apic_timer_interrupt) +ENTRY(uv_bau_message_intr1) + apicinterrupt 220,uv_bau_message_interrupt +END(uv_bau_message_intr1) + ENTRY(error_interrupt) apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt END(error_interrupt) diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index a1f07d79320..fc132113bda 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include /* @@ -162,6 +164,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, union smp_flush_state *f; cpumask_t cpumask = *cpumaskp; + if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va)) + return; + /* Caller has disabled preemption */ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; f = &per_cpu(flush_state, sender); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c new file mode 100644 index 00000000000..28e7c68d9d7 --- /dev/null +++ b/arch/x86/kernel/tlb_uv.c @@ -0,0 +1,736 @@ +/* + * SGI UltraViolet TLB flush routines. + * + * (c) 2008 Cliff Wickman , SGI. + * + * This code is released under the GNU General Public License version 2 or + * later. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct bau_control **uv_bau_table_bases; +static int uv_bau_retry_limit; +static int uv_nshift; /* position of pnode (which is nasid>>1) */ +static unsigned long uv_mmask; + +char *status_table[] = { + "IDLE", + "ACTIVE", + "DESTINATION TIMEOUT", + "SOURCE TIMEOUT" +}; + +DEFINE_PER_CPU(struct ptc_stats, ptcstats); +DEFINE_PER_CPU(struct bau_control, bau_control); + +/* + * Free a software acknowledge hardware resource by clearing its Pending + * bit. This will return a reply to the sender. + * If the message has timed out, a reply has already been sent by the + * hardware but the resource has not been released. In that case our + * clear of the Timeout bit (as well) will free the resource. No reply will + * be sent (the hardware will only do one reply per message). + */ +static void +uv_reply_to_message(int resource, + struct bau_payload_queue_entry *msg, + struct bau_msg_status *msp) +{ + int fw; + + fw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); + msg->replied_to = 1; + msg->sw_ack_vector = 0; + if (msp) + msp->seen_by.bits = 0; + uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, fw); + return; +} + +/* + * Do all the things a cpu should do for a TLB shootdown message. + * Other cpu's may come here at the same time for this message. + */ +static void +uv_bau_process_message(struct bau_payload_queue_entry *msg, + int msg_slot, int sw_ack_slot) +{ + int cpu; + unsigned long this_cpu_mask; + struct bau_msg_status *msp; + + msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; + cpu = uv_blade_processor_id(); + msg->number_of_cpus = + uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); + this_cpu_mask = (unsigned long)1 << cpu; + if (msp->seen_by.bits & this_cpu_mask) + return; + atomic_or_long(&msp->seen_by.bits, this_cpu_mask); + + if (msg->replied_to == 1) + return; + + if (msg->address == TLB_FLUSH_ALL) { + local_flush_tlb(); + __get_cpu_var(ptcstats).alltlb++; + } else { + __flush_tlb_one(msg->address); + __get_cpu_var(ptcstats).onetlb++; + } + + __get_cpu_var(ptcstats).requestee++; + + atomic_inc_short(&msg->acknowledge_count); + if (msg->number_of_cpus == msg->acknowledge_count) + uv_reply_to_message(sw_ack_slot, msg, msp); + return; +} + +/* + * Examine the payload queue on all the distribution nodes to see + * which messages have not been seen, and which cpu(s) have not seen them. + * + * Returns the number of cpu's that have not responded. + */ +static int +uv_examine_destinations(struct bau_target_nodemask *distribution) +{ + int sender; + int i; + int j; + int k; + int count = 0; + struct bau_control *bau_tablesp; + struct bau_payload_queue_entry *msg; + struct bau_msg_status *msp; + + sender = smp_processor_id(); + for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE); + i++) { + if (bau_node_isset(i, distribution)) { + bau_tablesp = uv_bau_table_bases[i]; + for (msg = bau_tablesp->va_queue_first, j = 0; + j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) { + if ((msg->sending_cpu == sender) && + (!msg->replied_to)) { + msp = bau_tablesp->msg_statuses + j; + printk(KERN_DEBUG + "blade %d: address:%#lx %d of %d, not cpu(s): ", + i, msg->address, + msg->acknowledge_count, + msg->number_of_cpus); + for (k = 0; k < msg->number_of_cpus; + k++) { + if (!((long)1 << k & msp-> + seen_by.bits)) { + count++; + printk("%d ", k); + } + } + printk("\n"); + } + } + } + } + return count; +} + +/** + * uv_flush_tlb_others - globally purge translation cache of a virtual + * address or all TLB's + * @cpumaskp: mask of all cpu's in which the address is to be removed + * @mm: mm_struct containing virtual address range + * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * + * This is the entry point for initiating any UV global TLB shootdown. + * + * Purges the translation caches of all specified processors of the given + * virtual address, or purges all TLB's on specified processors. + * + * The caller has derived the cpumaskp from the mm_struct and has subtracted + * the local cpu from the mask. This function is called only if there + * are bits set in the mask. (e.g. flush_tlb_page()) + * + * The cpumaskp is converted into a nodemask of the nodes containing + * the cpus. + */ +int +uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va) +{ + int i; + int blade; + int cpu; + int bit; + int right_shift; + int this_blade; + int exams = 0; + int tries = 0; + long source_timeouts = 0; + long destination_timeouts = 0; + unsigned long index; + unsigned long mmr_offset; + unsigned long descriptor_status; + struct bau_activation_descriptor *bau_desc; + ktime_t time1, time2; + + cpu = uv_blade_processor_id(); + this_blade = uv_numa_blade_id(); + bau_desc = __get_cpu_var(bau_control).descriptor_base; + bau_desc += (UV_ITEMS_PER_DESCRIPTOR * cpu); + + bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + + i = 0; + for_each_cpu_mask(bit, *cpumaskp) { + blade = uv_cpu_to_blade_id(bit); + if (blade > (UV_DISTRIBUTION_SIZE - 1)) + BUG(); + if (blade == this_blade) + continue; + bau_node_set(blade, &bau_desc->distribution); + /* leave the bits for the remote cpu's in the mask until + success; on failure we fall back to the IPI method */ + i++; + } + if (i == 0) + goto none_to_flush; + __get_cpu_var(ptcstats).requestor++; + __get_cpu_var(ptcstats).ntargeted += i; + + bau_desc->payload.address = va; + bau_desc->payload.sending_cpu = smp_processor_id(); + + if (cpu < UV_CPUS_PER_ACT_STATUS) { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + } else { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + right_shift = + ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); + } + time1 = ktime_get(); + +retry: + tries++; + index = ((unsigned long) + 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu; + uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + + while ((descriptor_status = (((unsigned long) + uv_read_local_mmr(mmr_offset) >> + right_shift) & UV_ACT_STATUS_MASK)) != + DESC_STATUS_IDLE) { + if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { + source_timeouts++; + if (source_timeouts > SOURCE_TIMEOUT_LIMIT) + source_timeouts = 0; + __get_cpu_var(ptcstats).s_retry++; + goto retry; + } + /* spin here looking for progress at the destinations */ + if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { + destination_timeouts++; + if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { + /* returns # of cpus not responding */ + if (uv_examine_destinations + (&bau_desc->distribution) == 0) { + __get_cpu_var(ptcstats).d_retry++; + goto retry; + } + exams++; + if (exams >= uv_bau_retry_limit) { + printk(KERN_DEBUG + "uv_flush_tlb_others"); + printk("giving up on cpu %d\n", + smp_processor_id()); + goto unsuccessful; + } + /* delays can hang up the simulator + udelay(1000); + */ + destination_timeouts = 0; + } + } + } + if (tries > 1) + __get_cpu_var(ptcstats).retriesok++; + /* on success, clear the remote cpu's from the mask so we don't + use the IPI method of shootdown on them */ + for_each_cpu_mask(bit, *cpumaskp) { + blade = uv_cpu_to_blade_id(bit); + if (blade == this_blade) + continue; + cpu_clear(bit, *cpumaskp); + } + +unsuccessful: + time2 = ktime_get(); + __get_cpu_var(ptcstats).sflush_ns += (time2.tv64 - time1.tv64); + +none_to_flush: + if (cpus_empty(*cpumaskp)) + return 1; + + /* Cause the caller to do an IPI-style TLB shootdown on + the cpu's still in the mask */ + __get_cpu_var(ptcstats).ptc_i++; + return 0; +} + +/* + * The BAU message interrupt comes here. (registered by set_intr_gate) + * See entry_64.S + * + * We received a broadcast assist message. + * + * Interrupts may have been disabled; this interrupt could represent + * the receipt of several messages. + * + * All cores/threads on this node get this interrupt. + * The last one to see it does the s/w ack. + * (the resource will not be freed until noninterruptable cpus see this + * interrupt; hardware will timeout the s/w ack and reply ERROR) + */ +void +uv_bau_message_interrupt(struct pt_regs *regs) +{ + struct bau_payload_queue_entry *pqp; + struct bau_payload_queue_entry *msg; + struct pt_regs *old_regs = set_irq_regs(regs); + ktime_t time1, time2; + int msg_slot; + int sw_ack_slot; + int fw; + int count = 0; + unsigned long local_pnode; + + ack_APIC_irq(); + exit_idle(); + irq_enter(); + + time1 = ktime_get(); + + local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); + + pqp = __get_cpu_var(bau_control).va_queue_first; + msg = __get_cpu_var(bau_control).bau_msg_head; + while (msg->sw_ack_vector) { + count++; + fw = msg->sw_ack_vector; + msg_slot = msg - pqp; + sw_ack_slot = ffs(fw) - 1; + + uv_bau_process_message(msg, msg_slot, sw_ack_slot); + + msg++; + if (msg > __get_cpu_var(bau_control).va_queue_last) + msg = __get_cpu_var(bau_control).va_queue_first; + __get_cpu_var(bau_control).bau_msg_head = msg; + } + if (!count) + __get_cpu_var(ptcstats).nomsg++; + else if (count > 1) + __get_cpu_var(ptcstats).multmsg++; + + time2 = ktime_get(); + __get_cpu_var(ptcstats).dflush_ns += (time2.tv64 - time1.tv64); + + irq_exit(); + set_irq_regs(old_regs); + return; +} + +static void +uv_enable_timeouts(void) +{ + int i; + int blade; + int last_blade; + int pnode; + int cur_cpu = 0; + unsigned long apicid; + + /* better if we had each_online_blade */ + last_blade = -1; + for_each_online_node(i) { + blade = uv_node_to_blade_id(i); + if (blade == last_blade) + continue; + last_blade = blade; + apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); + pnode = uv_blade_to_pnode(blade); + cur_cpu += uv_blade_nr_possible_cpus(i); + } + return; +} + +static void * +uv_ptc_seq_start(struct seq_file *file, loff_t *offset) +{ + if (*offset < num_possible_cpus()) + return offset; + return NULL; +} + +static void * +uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + if (*offset < num_possible_cpus()) + return offset; + return NULL; +} + +static void +uv_ptc_seq_stop(struct seq_file *file, void *data) +{ +} + +/* + * Display the statistics thru /proc + * data points to the cpu number + */ +static int +uv_ptc_seq_show(struct seq_file *file, void *data) +{ + struct ptc_stats *stat; + int cpu; + + cpu = *(loff_t *)data; + + if (!cpu) { + seq_printf(file, + "# cpu requestor requestee one all sretry dretry ptc_i "); + seq_printf(file, + "sw_ack sflush_us dflush_us sok dnomsg dmult starget\n"); + } + if (cpu < num_possible_cpus() && cpu_online(cpu)) { + stat = &per_cpu(ptcstats, cpu); + seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", + cpu, stat->requestor, + stat->requestee, stat->onetlb, stat->alltlb, + stat->s_retry, stat->d_retry, stat->ptc_i); + seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", + uv_read_global_mmr64(uv_blade_to_pnode + (uv_cpu_to_blade_id(cpu)), + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), + stat->sflush_ns / 1000, stat->dflush_ns / 1000, + stat->retriesok, stat->nomsg, + stat->multmsg, stat->ntargeted); + } + + return 0; +} + +/* + * 0: display meaning of the statistics + * >0: retry limit + */ +static ssize_t +uv_ptc_proc_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + long newmode; + char optstr[64]; + + if (copy_from_user(optstr, user, count)) + return -EFAULT; + optstr[count - 1] = '\0'; + if (strict_strtoul(optstr, 10, &newmode) < 0) { + printk(KERN_DEBUG "%s is invalid\n", optstr); + return -EINVAL; + } + + if (newmode == 0) { + printk(KERN_DEBUG "# cpu: cpu number\n"); + printk(KERN_DEBUG + "requestor: times this cpu was the flush requestor\n"); + printk(KERN_DEBUG + "requestee: times this cpu was requested to flush its TLBs\n"); + printk(KERN_DEBUG + "one: times requested to flush a single address\n"); + printk(KERN_DEBUG + "all: times requested to flush all TLB's\n"); + printk(KERN_DEBUG + "sretry: number of retries of source-side timeouts\n"); + printk(KERN_DEBUG + "dretry: number of retries of destination-side timeouts\n"); + printk(KERN_DEBUG + "ptc_i: times UV fell through to IPI-style flushes\n"); + printk(KERN_DEBUG + "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); + printk(KERN_DEBUG + "sflush_us: microseconds spent in uv_flush_tlb_others()\n"); + printk(KERN_DEBUG + "dflush_us: microseconds spent in handling flush requests\n"); + printk(KERN_DEBUG "sok: successes on retry\n"); + printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); + printk(KERN_DEBUG + "dmult: interrupts with multiple messages\n"); + printk(KERN_DEBUG "starget: nodes targeted\n"); + } else { + uv_bau_retry_limit = newmode; + printk(KERN_DEBUG "timeout retry limit:%d\n", + uv_bau_retry_limit); + } + + return count; +} + +static const struct seq_operations uv_ptc_seq_ops = { + .start = uv_ptc_seq_start, + .next = uv_ptc_seq_next, + .stop = uv_ptc_seq_stop, + .show = uv_ptc_seq_show +}; + +static int +uv_ptc_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &uv_ptc_seq_ops); +} + +static const struct file_operations proc_uv_ptc_operations = { + .open = uv_ptc_proc_open, + .read = seq_read, + .write = uv_ptc_proc_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *proc_uv_ptc; + +static int __init +uv_ptc_init(void) +{ + static struct proc_dir_entry *sgi_proc_dir; + + sgi_proc_dir = NULL; + + if (!is_uv_system()) + return 0; + + sgi_proc_dir = proc_mkdir("sgi_uv", NULL); + if (!sgi_proc_dir) + return -EINVAL; + + proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); + if (!proc_uv_ptc) { + printk(KERN_ERR "unable to create %s proc entry\n", + UV_PTC_BASENAME); + return -EINVAL; + } + proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; + return 0; +} + +static void __exit +uv_ptc_exit(void) +{ + remove_proc_entry(UV_PTC_BASENAME, NULL); +} + +module_init(uv_ptc_init); +module_exit(uv_ptc_exit); + +/* + * Initialization of BAU-related structures + */ +int __init +uv_bau_init(void) +{ + int i; + int j; + int blade; + int nblades; + int *ip; + int pnode; + int last_blade; + int cur_cpu = 0; + unsigned long pa; + unsigned long n; + unsigned long m; + unsigned long mmr_image; + unsigned long apicid; + char *cp; + struct bau_control *bau_tablesp; + struct bau_activation_descriptor *adp, *ad2; + struct bau_payload_queue_entry *pqp; + struct bau_msg_status *msp; + struct bau_control *bcp; + + if (!is_uv_system()) + return 0; + + uv_bau_retry_limit = 1; + + if ((sizeof(struct bau_local_cpumask) * BITSPERBYTE) < + MAX_CPUS_PER_NODE) { + printk(KERN_ERR + "uv_bau_init: bau_local_cpumask.bits too small\n"); + BUG(); + } + + uv_nshift = uv_hub_info->n_val; + uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1; + nblades = 0; + last_blade = -1; + for_each_online_node(i) { + blade = uv_node_to_blade_id(i); + if (blade == last_blade) + continue; + last_blade = blade; + nblades++; + } + + uv_bau_table_bases = (struct bau_control **) + kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); + if (!uv_bau_table_bases) + BUG(); + + /* better if we had each_online_blade */ + last_blade = -1; + for_each_online_node(i) { + blade = uv_node_to_blade_id(i); + if (blade == last_blade) + continue; + last_blade = blade; + + bau_tablesp = + kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, i); + if (!bau_tablesp) + BUG(); + + bau_tablesp->msg_statuses = + kmalloc_node(sizeof(struct bau_msg_status) * + DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, i); + if (!bau_tablesp->msg_statuses) + BUG(); + for (j = 0, msp = bau_tablesp->msg_statuses; + j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, msp++) { + bau_cpubits_clear(&msp->seen_by, (int) + uv_blade_nr_possible_cpus(blade)); + } + + bau_tablesp->watching = + kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES, + GFP_KERNEL, i); + if (!bau_tablesp->watching) + BUG(); + for (j = 0, ip = bau_tablesp->watching; + j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, ip++) { + *ip = 0; + } + + uv_bau_table_bases[i] = bau_tablesp; + + pnode = uv_blade_to_pnode(blade); + + if (sizeof(struct bau_activation_descriptor) != 64) + BUG(); + + adp = (struct bau_activation_descriptor *) + kmalloc_node(16384, GFP_KERNEL, i); + if (!adp) + BUG(); + if ((unsigned long)adp & 0xfff) + BUG(); + pa = __pa((unsigned long)adp); + n = pa >> uv_nshift; + m = pa & uv_mmask; + + mmr_image = uv_read_global_mmr64(pnode, + UVH_LB_BAU_SB_DESCRIPTOR_BASE); + if (mmr_image) + uv_write_global_mmr64(pnode, (unsigned long) + UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | + m)); + for (j = 0, ad2 = adp; j < UV_ACTIVATION_DESCRIPTOR_SIZE; + j++, ad2++) { + memset(ad2, 0, + sizeof(struct bau_activation_descriptor)); + ad2->header.sw_ack_flag = 1; + ad2->header.base_dest_nodeid = + uv_blade_to_pnode(uv_cpu_to_blade_id(0)); + ad2->header.command = UV_NET_ENDPOINT_INTD; + ad2->header.int_both = 1; + /* all others need to be set to zero: + fairness chaining multilevel count replied_to */ + } + + pqp = (struct bau_payload_queue_entry *) + kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) * + sizeof(struct bau_payload_queue_entry), + GFP_KERNEL, i); + if (!pqp) + BUG(); + if (sizeof(struct bau_payload_queue_entry) != 32) + BUG(); + if ((unsigned long)(&((struct bau_payload_queue_entry *)0)-> + sw_ack_vector) != 15) + BUG(); + + cp = (char *)pqp + 31; + pqp = (struct bau_payload_queue_entry *) + (((unsigned long)cp >> 5) << 5); + bau_tablesp->va_queue_first = pqp; + uv_write_global_mmr64(pnode, + UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, + ((unsigned long)pnode << + UV_PAYLOADQ_PNODE_SHIFT) | + uv_physnodeaddr(pqp)); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, + uv_physnodeaddr(pqp)); + bau_tablesp->va_queue_last = + pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, + (unsigned long) + uv_physnodeaddr(bau_tablesp-> + va_queue_last)); + memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * + DESTINATION_PAYLOAD_QUEUE_SIZE); + + /* this initialization can't be in firmware because the + messaging IRQ will be determined by the OS */ + apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); + pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); + if ((pa & 0xff) != UV_BAU_MESSAGE) { + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + ((apicid << 32) | + UV_BAU_MESSAGE)); + } + + for (j = cur_cpu; j < (cur_cpu + uv_blade_nr_possible_cpus(i)); + j++) { + bcp = (struct bau_control *)&per_cpu(bau_control, j); + bcp->bau_msg_head = bau_tablesp->va_queue_first; + bcp->va_queue_first = bau_tablesp->va_queue_first; + + bcp->va_queue_last = bau_tablesp->va_queue_last; + bcp->watching = bau_tablesp->watching; + bcp->msg_statuses = bau_tablesp->msg_statuses; + bcp->descriptor_base = adp; + } + cur_cpu += uv_blade_nr_possible_cpus(i); + } + + set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); + + uv_enable_timeouts(); + + return 0; +} + +__initcall(uv_bau_init); diff --git a/include/asm-x86/uv/uv_bau.h b/include/asm-x86/uv/uv_bau.h new file mode 100644 index 00000000000..f125f86c89a --- /dev/null +++ b/include/asm-x86/uv/uv_bau.h @@ -0,0 +1,331 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV Broadcast Assist Unit definitions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#ifndef __ASM_X86_UV_BAU__ +#define __ASM_X86_UV_BAU__ + +#include +#define BITSPERBYTE 8 + +/* Broadcast Assist Unit messaging structures */ + +/* + * Selective Broadcast activations are induced by software action + * specifying a particular 8-descriptor "set" via a 6-bit index written + * to an MMR. + * Thus there are 64 unique 512-byte sets of SB descriptors - one set for + * each 6-bit index value. These descriptor sets are mapped in sequence + * starting with set 0 located at the address specified in the + * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, + * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. + * + * We will use 31 sets, one for sending BAU messages from each of the 32 + * cpu's on the node. + * + * TLB shootdown will use the first of the 8 descriptors of each set. + * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). + */ + +#define UV_ITEMS_PER_DESCRIPTOR 8 +#define UV_CPUS_PER_ACT_STATUS 32 +#define UV_ACT_STATUS_MASK 0x3 +#define UV_ACT_STATUS_SIZE 2 +#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 +#define UV_DISTRIBUTION_SIZE 256 +#define UV_SW_ACK_NPENDING 8 +#define UV_BAU_MESSAGE 200 /* Messaging irq; see irq_64.h */ + /* and include/asm-x86/hw_irq_64.h */ + /* To be dynamically allocated in the future */ +#define UV_NET_ENDPOINT_INTD 0x38 +#define UV_DESC_BASE_PNODE_SHIFT 49 /* position of pnode (nasid>>1) in MMR */ +#define UV_PAYLOADQ_PNODE_SHIFT 49 + +#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" +#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) + +/* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */ +#define DESC_STATUS_IDLE 0 +#define DESC_STATUS_ACTIVE 1 +#define DESC_STATUS_DESTINATION_TIMEOUT 2 +#define DESC_STATUS_SOURCE_TIMEOUT 3 + +/* source side threshholds at which message retries print a warning */ +#define SOURCE_TIMEOUT_LIMIT 20 +#define DESTINATION_TIMEOUT_LIMIT 20 + +/* number of entries in the destination side payload queue */ +#define DESTINATION_PAYLOAD_QUEUE_SIZE 17 +/* number of destination side software ack resources */ +#define DESTINATION_NUM_RESOURCES 8 +#define MAX_CPUS_PER_NODE 32 + +/* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) */ +/* If the 'multilevel' flag in the header portion of the descriptor + * has been set to 0, then endpoint multi-unicast mode is selected. + * The distribution specification (32 bytes) is interpreted as a 256-bit + * distribution vector. Adjacent bits correspond to consecutive even numbered + * nodeIDs. The result of adding the index of a given bit to the 15-bit + * 'base_dest_nodeid' field of the header corresponds to the + * destination nodeID associated with that specified bit. */ +struct bau_target_nodemask { + unsigned long bits[BITS_TO_LONGS(256)]; +}; + +/* mask of cpu's on a node */ +/* (during initialization we need to check that unsigned long has + enough bits for max. cpu's per node) */ +struct bau_local_cpumask { + unsigned long bits; +}; + +/* + * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor) + * only 12 bytes (96 bits) of the payload area are usable. + * An additional 3 bytes (bits 27:4) of the header address are carried + * to the next bytes of the destination payload queue. + * And an additional 2 bytes of the header Suppl_A field are also + * carried to the destination payload queue. + * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte) + * of the destination payload queue, which is written by the hardware + * with the s/w ack resource bit vector. + * [ effective message contents (16 bytes (128 bits) maximum), not counting + * the s/w ack bit vector ] + */ + +/* The payload is software-defined for INTD transactions */ +struct bau_msg_payload { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ + /* 64 bits */ + unsigned short sending_cpu; /* filled in by sender */ + /* 16 bits */ + unsigned short acknowledge_count;/* filled in by destination */ + /* 16 bits */ + unsigned int reserved1:32; /* not usable */ +}; + + +/* Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) */ +/* see table 4.2.3.0.1 in broacast_assist spec. */ +struct bau_msg_header { + int dest_subnodeid:6; /* must be zero */ + /* bits 5:0 */ + int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */ + /* bits 20:6 */ + int command:8; /* message type */ + /* bits 28:21 */ + /* 0x38: SN3net EndPoint Message */ + int rsvd_1:3; /* must be zero */ + /* bits 31:29 */ + /* int will align on 32 bits */ + int rsvd_2:9; /* must be zero */ + /* bits 40:32 */ + /* Suppl_A is 56-41 */ + int payload_2a:8; /* becomes byte 16 of msg */ + /* bits 48:41 */ /* not currently using */ + int payload_2b:8; /* becomes byte 17 of msg */ + /* bits 56:49 */ /* not currently using */ + /* Address field (96:57) is never used as an + address (these are address bits 42:3) */ + int rsvd_3:1; /* must be zero */ + /* bit 57 */ + /* address bits 27:4 are payload */ + /* these 24 bits become bytes 12-14 of msg */ + int replied_to:1; /* sent as 0 by the source to byte 12 */ + /* bit 58 */ + + int payload_1a:5; /* not currently used */ + /* bits 63:59 */ + int payload_1b:8; /* not currently used */ + /* bits 71:64 */ + int payload_1c:8; /* not currently used */ + /* bits 79:72 */ + int payload_1d:2; /* not currently used */ + /* bits 81:80 */ + + int rsvd_4:7; /* must be zero */ + /* bits 88:82 */ + int sw_ack_flag:1; /* software acknowledge flag */ + /* bit 89 */ + /* INTD trasactions at destination are to + wait for software acknowledge */ + int rsvd_5:6; /* must be zero */ + /* bits 95:90 */ + int rsvd_6:5; /* must be zero */ + /* bits 100:96 */ + int int_both:1; /* if 1, interrupt both sockets on the blade */ + /* bit 101*/ + int fairness:3; /* usually zero */ + /* bits 104:102 */ + int multilevel:1; /* multi-level multicast format */ + /* bit 105 */ + /* 0 for TLB: endpoint multi-unicast messages */ + int chaining:1; /* next descriptor is part of this activation*/ + /* bit 106 */ + int rsvd_7:21; /* must be zero */ + /* bits 127:107 */ +}; + +/* The format of the message to send, plus all accompanying control */ +/* Should be 64 bytes */ +struct bau_activation_descriptor { + struct bau_target_nodemask distribution; + /* message template, consisting of header and payload: */ + struct bau_msg_header header; + struct bau_msg_payload payload; +}; +/* + * -payload-- ---------header------ + * bytes 0-11 bits 41-56 bits 58-81 + * A B (2) C (3) + * + * A/B/C are moved to: + * A C B + * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) + * ------------payload queue----------- + */ + +/* + * The payload queue on the destination side is an array of these. + * With BAU_MISC_CONTROL set for software acknowledge mode, the messages + * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 + * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) + * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from + * sw_ack_vector and payload_2) + * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software + * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload + * operation." + */ +struct bau_payload_queue_entry { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ + /* 64 bits, bytes 0-7 */ + + unsigned short sending_cpu; /* cpu that sent the message */ + /* 16 bits, bytes 8-9 */ + + unsigned short acknowledge_count; /* filled in by destination */ + /* 16 bits, bytes 10-11 */ + + unsigned short replied_to:1; /* sent as 0 by the source */ + /* 1 bit */ + unsigned short unused1:7; /* not currently using */ + /* 7 bits: byte 12) */ + + unsigned char unused2[2]; /* not currently using */ + /* bytes 13-14 */ + + unsigned char sw_ack_vector; /* filled in by the hardware */ + /* byte 15 (bits 127:120) */ + + unsigned char unused4[3]; /* not currently using bytes 17-19 */ + /* bytes 17-19 */ + + int number_of_cpus; /* filled in at destination */ + /* 32 bits, bytes 20-23 (aligned) */ + + unsigned char unused5[8]; /* not using */ + /* bytes 24-31 */ +}; + +/* one for every slot in the destination payload queue */ +struct bau_msg_status { + struct bau_local_cpumask seen_by; /* map of cpu's */ +}; + +/* one for every slot in the destination software ack resources */ +struct bau_sw_ack_status { + struct bau_payload_queue_entry *msg; /* associated message */ + int watcher; /* cpu monitoring, or -1 */ +}; + +/* one on every node and per-cpu; to locate the software tables */ +struct bau_control { + struct bau_activation_descriptor *descriptor_base; + struct bau_payload_queue_entry *bau_msg_head; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; + struct bau_msg_status *msg_statuses; + int *watching; /* pointer to array */ +}; + +/* + * This structure is allocated per_cpu for UV TLB shootdown statistics. + */ +struct ptc_stats { + unsigned long ptc_i; /* number of IPI-style flushes */ + unsigned long requestor; /* number of nodes this cpu sent to */ + unsigned long requestee; /* times cpu was remotely requested */ + unsigned long alltlb; /* times all tlb's on this cpu were flushed */ + unsigned long onetlb; /* times just one tlb on this cpu was flushed */ + unsigned long s_retry; /* retries on source side timeouts */ + unsigned long d_retry; /* retries on destination side timeouts */ + unsigned long sflush_ns;/* nanoseconds spent in uv_flush_tlb_others */ + unsigned long dflush_ns;/* nanoseconds spent destination side */ + unsigned long retriesok; /* successes on retries */ + unsigned long nomsg; /* interrupts with no message */ + unsigned long multmsg; /* interrupts with multiple messages */ + unsigned long ntargeted;/* nodes targeted */ +}; + +static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp) +{ + return constant_test_bit(node, &dstp->bits[0]); +} +static inline void bau_node_set(int node, struct bau_target_nodemask *dstp) +{ + __set_bit(node, &dstp->bits[0]); +} +static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits) +{ + bitmap_zero(&dstp->bits[0], nbits); +} + +static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) +{ + bitmap_zero(&dstp->bits, nbits); +} + +/* + * atomic increment of a short integer + * (rather than using the __sync_add_and_fetch() intrinsic) + * + * returns the new value of the variable + */ +static inline short int atomic_inc_short(short int *v) +{ + asm volatile("movw $1, %%cx\n" + "lock ; xaddw %%cx, %0\n" + : "+m" (*v) /* outputs */ + : : "%cx", "memory"); /* inputs : clobbereds */ + return *v; +} + +/* + * atomic OR of two long integers + * (rather than using the __sync_or_and_fetch() intrinsic) + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm volatile("movq %0, %%rax; lea %1, %%rdx\n" + "lock ; orq %%rax, %%rdx\n" + : "+m" (*v1) /* outputs */ + : "m" (v1), "m" (v2) /* inputs */ + : "memory"); /* clobbereds */ +} + +#define cpubit_isset(cpu, bau_local_cpumask) \ + test_bit((cpu), (bau_local_cpumask).bits) + +int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long); +void uv_bau_message_intr1(void); +void uv_bau_timeout_intr1(void); + +#endif /* __ASM_X86_UV_BAU__ */ -- cgit v1.2.3-70-g09d2 From a9c1182fbd349882fe912245d6e03cd30943be2d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Jun 2008 15:39:41 -0700 Subject: x86: seperate probe_roms into another file it is only needed for 32bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/probe_roms_32.c | 166 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 146 ----------------------------------- include/asm-x86/setup.h | 1 + 4 files changed, 168 insertions(+), 146 deletions(-) create mode 100644 arch/x86/kernel/probe_roms_32.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9a71be82792..9c9ce75e3ae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -19,6 +19,7 @@ obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o +obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c new file mode 100644 index 00000000000..675a48c404a --- /dev/null +++ b/arch/x86/kernel/probe_roms_32.c @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index e274ee6ff58..865838b1179 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -606,8 +606,6 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif -static void probe_roms(void); - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -843,147 +841,3 @@ void __init setup_arch(char **cmdline_p) #endif } -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -#define ROMSIGNATURE 0xaa55 - -static int __init romsignature(const unsigned char *rom) -{ - const unsigned short * const ptr = (const unsigned short *)rom; - unsigned short sig; - - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; -} - -static int __init romchecksum(const unsigned char *rom, unsigned long length) -{ - unsigned char sum, c; - - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) - sum += c; - return !length && !sum; -} - -static void __init probe_roms(void) -{ - const unsigned char *rom; - unsigned long start, length, upper; - unsigned char c; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index b434bdd82ba..cf87d6d3675 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -54,6 +54,7 @@ extern struct boot_params boot_params; #ifdef __i386__ void __init i386_start_kernel(void); +extern void probe_roms(void); extern unsigned long init_pg_tables_start; extern unsigned long init_pg_tables_end; -- cgit v1.2.3-70-g09d2 From 0f0124fa742da7c51e2e3c5ded7f5e5e06ddc195 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Jun 2008 16:25:37 -0700 Subject: x86: merge setup64.c into common_64.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/cpu/common_64.c | 277 +++++++++++++++++++++++++++++++++++++- arch/x86/kernel/setup64.c | 287 ---------------------------------------- 3 files changed, 277 insertions(+), 289 deletions(-) delete mode 100644 arch/x86/kernel/setup64.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9c9ce75e3ae..0a1987b4acc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 48ba7996158..9fb5b7caaa8 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -1,10 +1,17 @@ #include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include #include #include -#include #include #include #include @@ -19,6 +26,15 @@ #include #include #endif +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "cpu.h" @@ -404,3 +420,262 @@ static __init int setup_disablecpuid(char *arg) return 1; } __setup("clearcpuid=", setup_disablecpuid); + +#ifndef CONFIG_DEBUG_BOOT_PARAMS +struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif + +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; + +struct x8664_pda **_cpu_pda __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; + +unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +static int do_not_nx __cpuinitdata; + +/* noexec=on|off +Control non executable mappings for 64bit processes. + +on Enable(default) +off Disable +*/ +static int __init nonx_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", nonx_setup); + +int force_personality32; + +/* noexec32=on|off +Control non executable heap for 32bit processes. +To control the stack too use noexec=off + +on PROT_READ does not imply PROT_EXEC for 32bit processes (default) +off PROT_READ implies PROT_EXEC +*/ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +void pda_init(int cpu) +{ + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = (unsigned long)stack_thread_info() - + PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + } else { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", cpu); + + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } + + pda->irqstackptr += IRQSTACKSIZE-64; +} + +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] +__attribute__((section(".bss.page_aligned"))); + +extern asmlinkage void ignore_sysret(void); + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); +} + +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) + __supported_pte_mask &= ~_PAGE_NX; +} + +unsigned long kernel_eflags; + +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init. + */ +void __cpuinit cpu_init(void) +{ + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + pda_init(cpu); + else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(); + load_idt((const struct desc_ptr *)&idt_descr); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + + /* + * set up and load the per-CPU TSS + */ + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception stack %ld %d\n", + v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } +#endif + + fpu_init(); + + raw_local_save_flags(kernel_eflags); + + if (is_uv_system()) + uv_cpu_init(); +} diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c deleted file mode 100644 index 151d3155ddf..00000000000 --- a/arch/x86/kernel/setup64.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * X86-64 specific CPU setup. - * Copyright (C) 1995 Linus Torvalds - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. - * See setup.c for older changelog. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else -struct boot_params boot_params; -#endif - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); - -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; - -unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL_GPL(__supported_pte_mask); - -static int do_not_nx __cpuinitdata = 0; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32 = 0; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - -void pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); - - /* Setup up data that may be needed in __get_free_pages early */ - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); - - if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) - pda->nodenumber = cpu_to_node(cpu); - } - - pda->irqstackptr += IRQSTACKSIZE-64; -} - -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); - -extern asmlinkage void ignore_sysret(void); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to set CS/DS - * but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init (); -#endif - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) { - __supported_pte_mask &= ~_PAGE_NX; - } -} - -unsigned long kernel_eflags; - -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. - */ -void __cpuinit cpu_init (void) -{ - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) { - pda_init(cpu); - } else - estacks = boot_exception_stacks; - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk("Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - - switch_to_new_gdt(); - load_idt((const struct desc_ptr *)&idt_descr); - - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - - /* - * set up and load the per-CPU TSS - */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); - } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; - } - - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - load_sp0(t, ¤t->thread); - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else { -#endif - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB - /* If the kgdb is connected no debug regs should be altered. */ - } -#endif - - fpu_init(); - - raw_local_save_flags(kernel_eflags); - - if (is_uv_system()) - uv_cpu_init(); -} -- cgit v1.2.3-70-g09d2 From 378b39a4f91ab0846eb6e13d47ea812bc82b44d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Jun 2008 17:48:14 -0700 Subject: x86: rename setup.c to setup_percpu.c some functions need to be moved to setup_numa.c after we merge setup32/64.c, some funcs need to be moved back to setup.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/setup.c | 528 ----------------------------------------- arch/x86/kernel/setup_percpu.c | 528 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 529 insertions(+), 529 deletions(-) delete mode 100644 arch/x86/kernel/setup.c create mode 100644 arch/x86/kernel/setup_percpu.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0a1987b4acc..5e1537b6253 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c deleted file mode 100644 index 5c0c4bb5727..00000000000 --- a/arch/x86/kernel/setup.c +++ /dev/null @@ -1,528 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else -struct boot_params boot_params; -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -unsigned int num_processors; -unsigned disabled_cpus __cpuinitdata; -/* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; -unsigned int max_physical_apicid; -EXPORT_SYMBOL(boot_cpu_physical_apicid); - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map; -#endif - -/* map cpu index to physical APIC ID */ -DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -#define X86_64_NUMA 1 - -/* map cpu index to node index */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - -/* which logical CPUs are on which nodes */ -cpumask_t *node_to_cpumask_map; -EXPORT_SYMBOL(node_to_cpumask_map); - -/* setup node_to_cpumask_map */ -static void __init setup_node_to_cpumask_map(void); - -#else -static inline void setup_node_to_cpumask_map(void) { } -#endif - -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) -/* - * Copy data used in early init routines from the initial arrays to the - * per cpu data areas. These arrays then become expendable and the - * *_early_ptr's are zeroed indicating that the static arrays are gone. - */ -static void __init setup_per_cpu_maps(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - per_cpu(x86_cpu_to_apicid, cpu) = - early_per_cpu_map(x86_cpu_to_apicid, cpu); - per_cpu(x86_bios_cpu_apicid, cpu) = - early_per_cpu_map(x86_bios_cpu_apicid, cpu); -#ifdef X86_64_NUMA - per_cpu(x86_cpu_to_node_map, cpu) = - early_per_cpu_map(x86_cpu_to_node_map, cpu); -#endif - } - - /* indicate the early static arrays will soon be gone */ - early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; - early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; -#ifdef X86_64_NUMA - early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; -#endif -} - -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - -#ifdef CONFIG_X86_32 -/* - * Great future not-so-futuristic plan: make i386 and x86_64 do it - * the same way - */ -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(__per_cpu_offset); -static inline void setup_cpu_pda_map(void) { } - -#elif !defined(CONFIG_SMP) -static inline void setup_cpu_pda_map(void) { } - -#else /* CONFIG_SMP && CONFIG_X86_64 */ - -/* - * Allocate cpu_pda pointer table and array via alloc_bootmem. - */ -static void __init setup_cpu_pda_map(void) -{ - char *pda; - struct x8664_pda **new_cpu_pda; - unsigned long size; - int cpu; - - size = roundup(sizeof(struct x8664_pda), cache_line_size()); - - /* allocate cpu_pda array and pointer table */ - { - unsigned long tsize = nr_cpu_ids * sizeof(void *); - unsigned long asize = size * (nr_cpu_ids - 1); - - tsize = roundup(tsize, cache_line_size()); - new_cpu_pda = alloc_bootmem(tsize + asize); - pda = (char *)new_cpu_pda + tsize; - } - - /* initialize pointer table to static pda's */ - for_each_possible_cpu(cpu) { - if (cpu == 0) { - /* leave boot cpu pda in place */ - new_cpu_pda[0] = cpu_pda(0); - continue; - } - new_cpu_pda[cpu] = (struct x8664_pda *)pda; - new_cpu_pda[cpu]->in_bootmem = 1; - pda += size; - } - - /* point to new pointer table */ - _cpu_pda = new_cpu_pda; -} -#endif - -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ -void __init setup_per_cpu_areas(void) -{ - ssize_t size = PERCPU_ENOUGH_ROOM; - char *ptr; - int cpu; - - /* no processor from mptable or madt */ - if (!num_processors) - num_processors = 1; - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#else - nr_cpu_ids = num_processors; -#endif - - /* Setup cpu_pda map */ - setup_cpu_pda_map(); - - /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; - printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", - size); - - for_each_possible_cpu(cpu) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); -#else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - printk(KERN_INFO - "cpu %d has no node %d or node-local memory\n", - cpu, node); - } - else - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); -#endif - per_cpu_offset(cpu) = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - - } - - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", - NR_CPUS, nr_cpu_ids, nr_node_ids); - - /* Setup percpu data maps */ - setup_per_cpu_maps(); - - /* Setup node to cpumask map */ - setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); -} - -#endif - -void __init parse_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - if (boot_params.hdr.version < 0x0209) - return; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); - switch (data->type) { - case SETUP_E820_EXT: - parse_e820_ext(data, pa_data); - break; - default: - break; - } -#ifndef CONFIG_DEBUG_BOOT_PARAMS - free_early(pa_data, pa_data+sizeof(*data)+data->len); -#endif - pa_data = data->next; - early_iounmap(data, PAGE_SIZE); - } -} - -#ifdef X86_64_NUMA - -/* - * Allocate node_to_cpumask_map based on number of available nodes - * Requires node_possible_map to be valid. - * - * Note: node_to_cpumask() is not valid until after this is done. - */ -static void __init setup_node_to_cpumask_map(void) -{ - unsigned int node, num = 0; - cpumask_t *map; - - /* setup nr_node_ids if not done yet */ - if (nr_node_ids == MAX_NUMNODES) { - for_each_node_mask(node, node_possible_map) - num = node; - nr_node_ids = num + 1; - } - - /* allocate the map */ - map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - - Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); - - /* node_to_cpumask() will now work */ - node_to_cpumask_map = map; -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - if (cpu_pda(cpu) && node != NUMA_NO_NODE) - cpu_pda(cpu)->nodenumber = node; - - if (cpu_to_node_map) - cpu_to_node_map[cpu] = node; - - else if (per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_node_map, cpu) = node; - - else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); -} - -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -void __cpuinit numa_add_cpu(int cpu) -{ - cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); -} - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ - -/* - * --------- debug versions of the numa functions --------- - */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - int node = cpu_to_node(cpu); - cpumask_t *mask; - char buf[64]; - - if (node_to_cpumask_map == NULL) { - printk(KERN_ERR "node_to_cpumask_map NULL\n"); - dump_stack(); - return; - } - - mask = &node_to_cpumask_map[node]; - if (enable) - cpu_set(cpu, *mask); - else - cpu_clear(cpu, *mask); - - cpulist_scnprintf(buf, sizeof(buf), *mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); - } - -void __cpuinit numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, 1); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, 0); -} - -int cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!per_cpu_offset(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} - -/* - * Returns a pointer to the bitmask of CPUs on Node 'node'. - */ -cpumask_t *_node_to_cpumask_ptr(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", - node); - dump_stack(); - return &cpu_online_map; - } - BUG_ON(node >= nr_node_ids); - return &node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(_node_to_cpumask_ptr); - -/* - * Returns a bitmask of CPUs on Node 'node'. - */ -cpumask_t node_to_cpumask(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); - dump_stack(); - return cpu_online_map; - } - BUG_ON(node >= nr_node_ids); - return node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(node_to_cpumask); - -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ - -#endif /* X86_64_NUMA */ - - -/* - * --------- Crashkernel reservation ------------------------------ - */ - -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif - - return total << PAGE_SHIFT; -} - -#ifdef CONFIG_KEXEC -void __init reserve_crashkernel(void) -{ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; - int ret; - - total_mem = get_total_mem(); - - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret == 0 && crash_size > 0) { - if (crash_base <= 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "you have to specify a base address\n"); - return; - } - - if (reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE) < 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "memory is in use\n"); - return; - } - - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); - } -} -#else -void __init reserve_crashkernel(void) -{} -#endif -static struct resource standard_io_resources[] = { - { .name = "dma1", .start = 0x00, .end = 0x1f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic1", .start = 0x20, .end = 0x21, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer0", .start = 0x40, .end = 0x43, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer1", .start = 0x50, .end = 0x53, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x60, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x64, .end = 0x64, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma page reg", .start = 0x80, .end = 0x8f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic2", .start = 0xa0, .end = 0xa1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma2", .start = 0xc0, .end = 0xdf, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "fpu", .start = 0xf0, .end = 0xff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO } -}; - -void __init reserve_standard_io_resources(void) -{ - int i; - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - -} - -#ifdef CONFIG_PROC_VMCORE -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - - - diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c new file mode 100644 index 00000000000..5c0c4bb5727 --- /dev/null +++ b/arch/x86/kernel/setup_percpu.c @@ -0,0 +1,528 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_DEBUG_BOOT_PARAMS +struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +unsigned int num_processors; +unsigned disabled_cpus __cpuinitdata; +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +unsigned int max_physical_apicid; +EXPORT_SYMBOL(boot_cpu_physical_apicid); + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; +#endif + +/* map cpu index to physical APIC ID */ +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#define X86_64_NUMA 1 + +/* map cpu index to node index */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* which logical CPUs are on which nodes */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* setup node_to_cpumask_map */ +static void __init setup_node_to_cpumask_map(void); + +#else +static inline void setup_node_to_cpumask_map(void) { } +#endif + +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) +/* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. + */ +static void __init setup_per_cpu_maps(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); + per_cpu(x86_bios_cpu_apicid, cpu) = + early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#ifdef X86_64_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); +#endif + } + + /* indicate the early static arrays will soon be gone */ + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#ifdef X86_64_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +#endif +} + +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +cpumask_t *cpumask_of_cpu_map __read_mostly; +EXPORT_SYMBOL(cpumask_of_cpu_map); + +/* requires nr_cpu_ids to be initialized */ +static void __init setup_cpumask_of_cpu(void) +{ + int i; + + /* alloc_bootmem zeroes memory */ + cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); + for (i = 0; i < nr_cpu_ids; i++) + cpu_set(i, cpumask_of_cpu_map[i]); +} +#else +static inline void setup_cpumask_of_cpu(void) { } +#endif + +#ifdef CONFIG_X86_32 +/* + * Great future not-so-futuristic plan: make i386 and x86_64 do it + * the same way + */ +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. + */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} +#endif + +/* + * Great future plan: + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. + * Always point %gs to its beginning + */ +void __init setup_per_cpu_areas(void) +{ + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; + + /* no processor from mptable or madt */ + if (!num_processors) + num_processors = 1; + +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#else + nr_cpu_ids = num_processors; +#endif + + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + + /* Copy section for each CPU (we discard the original) */ + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", + size); + + for_each_possible_cpu(cpu) { +#ifndef CONFIG_NEED_MULTIPLE_NODES + ptr = alloc_bootmem_pages(size); +#else + int node = early_cpu_to_node(cpu); + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(size); + printk(KERN_INFO + "cpu %d has no node %d or node-local memory\n", + cpu, node); + } + else + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); +#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + + } + + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", + NR_CPUS, nr_cpu_ids, nr_node_ids); + + /* Setup percpu data maps */ + setup_per_cpu_maps(); + + /* Setup node to cpumask map */ + setup_node_to_cpumask_map(); + + /* Setup cpumask_of_cpu map */ + setup_cpumask_of_cpu(); +} + +#endif + +void __init parse_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, PAGE_SIZE); + switch (data->type) { + case SETUP_E820_EXT: + parse_e820_ext(data, pa_data); + break; + default: + break; + } +#ifndef CONFIG_DEBUG_BOOT_PARAMS + free_early(pa_data, pa_data+sizeof(*data)+data->len); +#endif + pa_data = data->next; + early_iounmap(data, PAGE_SIZE); + } +} + +#ifdef X86_64_NUMA + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + + Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + if (cpu_pda(cpu) && node != NUMA_NO_NODE) + cpu_pda(cpu)->nodenumber = node; + + if (cpu_to_node_map) + cpu_to_node_map[cpu] = node; + + else if (per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); +} + +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), *mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); + } + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +cpumask_t *_node_to_cpumask_ptr(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return &cpu_online_map; + } + BUG_ON(node >= nr_node_ids); + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(_node_to_cpumask_ptr); + +/* + * Returns a bitmask of CPUs on Node 'node'. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + BUG_ON(node >= nr_node_ids); + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ + +#endif /* X86_64_NUMA */ + + +/* + * --------- Crashkernel reservation ------------------------------ + */ + +static inline unsigned long long get_total_mem(void) +{ + unsigned long long total; + + total = max_low_pfn - min_low_pfn; +#ifdef CONFIG_HIGHMEM + total += highend_pfn - highstart_pfn; +#endif + + return total << PAGE_SHIFT; +} + +#ifdef CONFIG_KEXEC +void __init reserve_crashkernel(void) +{ + unsigned long long total_mem; + unsigned long long crash_size, crash_base; + int ret; + + total_mem = get_total_mem(); + + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + if (crash_base <= 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem_generic(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); + } +} +#else +void __init reserve_crashkernel(void) +{} +#endif +static struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x64, .end = 0x64, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +void __init reserve_standard_io_resources(void) +{ + int i; + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + +} + +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); +#endif + + + -- cgit v1.2.3-70-g09d2 From 55f262391a2365d657a00ed68edd1a51bca66af5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Jun 2008 17:54:23 -0700 Subject: x86: rename setup_32.c to setup.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and let 64 bit use that instead of setup_64.c [ mingo@elte.hu ] x86: build fix fix: arch/x86/kernel/setup.c: In function ‘setup_arch': arch/x86/kernel/setup.c:561: error: implicit declaration of function ‘efi_reserve_early' and: arch/x86/kernel/setup.c:766: error: implicit declaration of function 'init_cpu_to_node' and: arch/x86/kernel/setup.c:676: warning: operation on 'max_pfn_mapped' may be undefined Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/setup.c | 672 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 671 -------------------------------------------- 3 files changed, 673 insertions(+), 672 deletions(-) create mode 100644 arch/x86/kernel/setup.c delete mode 100644 arch/x86/kernel/setup_32.c (limited to 'arch/x86/kernel/Makefile') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e1537b6253..212804e7878 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup_percpu.o +obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c new file mode 100644 index 00000000000..222d7038f2f --- /dev/null +++ b/arch/x86/kernel/setup.c @@ -0,0 +1,672 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons , July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle , February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel , March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include