diff options
Diffstat (limited to 'arch/x86')
119 files changed, 2241 insertions, 1577 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9866b0b77d..7b383d8da7b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -40,7 +40,6 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB @@ -77,11 +76,13 @@ config X86 select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP - select HAVE_BPF_JIT if (X86_64 && NET) + select HAVE_BPF_JIT if X86_64 select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select DCACHE_WORD_ACCESS + select GENERIC_SMP_IDLE_THREAD + select HAVE_ARCH_SECCOMP_FILTER config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -160,9 +161,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD -config ARCH_HAS_CPU_IDLE_WAIT - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 41a7237606a..277418ff8b5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -134,6 +134,9 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +archscripts: + $(Q)$(MAKE) $(build)=arch/x86/tools relocs + ### # Syscall table generation @@ -146,7 +149,6 @@ archheaders: head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o head-y += arch/x86/kernel/head.o -head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fd55a2ff3ad..e398bb5d63b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -40,13 +40,12 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) +targets += vmlinux.bin.all vmlinux.relocs -targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs - +CMD_RELOCS = arch/x86/tools/relocs quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE + cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< +$(obj)/vmlinux.relocs: vmlinux FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a69245ba27e..0b3f2354f6a 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -67,6 +67,10 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) switch (from->si_code >> 16) { case __SI_FAULT >> 16: break; + case __SI_SYS >> 16: + put_user_ex(from->si_syscall, &to->si_syscall); + put_user_ex(from->si_arch, &to->si_arch); + break; case __SI_CHLD >> 16: if (ia32) { put_user_ex(from->si_utime, &to->si_utime); diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index aec2202a596..edca9c0a79c 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -287,11 +287,6 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, return ret; } -asmlinkage long sys32_alarm(unsigned int seconds) -{ - return alarm_setitimer(seconds); -} - asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) { @@ -300,11 +295,6 @@ asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, /* 32-bit timeval and related flotsam. */ -asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2) -{ - return sys_sysfs(option, arg1, arg2); -} - asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) { @@ -375,19 +365,6 @@ asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, } -asmlinkage long sys32_personality(unsigned long personality) -{ - int ret; - - if (personality(current->personality) == PER_LINUX32 && - personality == PER_LINUX) - personality = PER_LINUX32; - ret = sys_personality(personality); - if (ret == PER_LINUX32) - ret = PER_LINUX; - return ret; -} - asmlinkage long sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d8541017126..eaff4790ed9 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -138,6 +138,11 @@ static inline void native_apic_msr_write(u32 reg, u32 v) wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); } +static inline void native_apic_msr_eoi_write(u32 reg, u32 v) +{ + wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); +} + static inline u32 native_apic_msr_read(u32 reg) { u64 msr; @@ -351,6 +356,14 @@ struct apic { /* apic ops */ u32 (*read)(u32 reg); void (*write)(u32 reg, u32 v); + /* + * ->eoi_write() has the same signature as ->write(). + * + * Drivers can support both ->eoi_write() and ->write() by passing the same + * callback value. Kernel can override ->eoi_write() and fall back + * on write for EOI. + */ + void (*eoi_write)(u32 reg, u32 v); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); @@ -426,6 +439,11 @@ static inline void apic_write(u32 reg, u32 val) apic->write(reg, val); } +static inline void apic_eoi(void) +{ + apic->eoi_write(APIC_EOI, APIC_EOI_ACK); +} + static inline u64 apic_icr_read(void) { return apic->icr_read(); @@ -450,6 +468,7 @@ static inline u32 safe_apic_wait_icr_idle(void) static inline u32 apic_read(u32 reg) { return 0; } static inline void apic_write(u32 reg, u32 val) { } +static inline void apic_eoi(void) { } static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } @@ -463,9 +482,7 @@ static inline void ack_APIC_irq(void) * ack_APIC_irq() actually gets compiled as a single instruction * ... yummie. */ - - /* Docs say use 0 for future compatibility */ - apic_write(APIC_EOI, 0); + apic_eoi(); } static inline unsigned default_get_apic_id(unsigned long x) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 134bba00df0..c46bb99d5fb 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -37,7 +37,7 @@ #define APIC_ARBPRI_MASK 0xFFu #define APIC_PROCPRI 0xA0 #define APIC_EOI 0xB0 -#define APIC_EIO_ACK 0x0 +#define APIC_EOI_ACK 0x0 /* Docs say 0 for future compat. */ #define APIC_RRR 0xC0 #define APIC_LDR 0xD0 #define APIC_LDR_MASK (0xFFu << 24) diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 198119910da..b154de75c90 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -63,7 +63,7 @@ ATOMIC64_DECL(add_unless); /** * atomic64_cmpxchg - cmpxchg atomic64 variable - * @p: pointer to type atomic64_t + * @v: pointer to type atomic64_t * @o: expected value * @n: new value * @@ -98,7 +98,7 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n) /** * atomic64_set - set atomic64 variable * @v: pointer to type atomic64_t - * @n: value to assign + * @i: value to assign * * Atomically sets the value of @v to @n. */ @@ -200,7 +200,7 @@ static inline long long atomic64_sub(long long i, atomic64_t *v) * atomic64_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer to type atomic64_t - * + * * Atomically subtracts @i from @v and returns * true if the result is zero, or false for all * other cases. @@ -224,9 +224,9 @@ static inline void atomic64_inc(atomic64_t *v) /** * atomic64_dec - decrement atomic64 variable - * @ptr: pointer to type atomic64_t + * @v: pointer to type atomic64_t * - * Atomically decrements @ptr by 1. + * Atomically decrements @v by 1. */ static inline void atomic64_dec(atomic64_t *v) { diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 5e1a2eef3e7..b13fe63bdc5 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -19,7 +19,7 @@ #ifdef CONFIG_X86_64 #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index d6805798d6f..fedf32b73e6 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -229,7 +229,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ - sp = percpu_read(old_rsp) - 128; + sp = this_cpu_read(old_rsp) - 128; } return (void __user *)round_down(sp - len, 16); diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 4d447b732d8..9476c04ee63 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return percpu_read_stable(current_task); + return this_cpu_read_stable(current_task); } #define current get_current() diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e95822d683f..8bf1c06070d 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -6,6 +6,7 @@ #include <asm/mmu.h> #include <linux/smp.h> +#include <linux/percpu.h> static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) { diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 4fa88154e4d..75f4c6d6a33 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -290,14 +290,14 @@ static inline int __thread_has_fpu(struct task_struct *tsk) static inline void __thread_clear_has_fpu(struct task_struct *tsk) { tsk->thread.fpu.has_fpu = 0; - percpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_owner_task, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { tsk->thread.fpu.has_fpu = 1; - percpu_write(fpu_owner_task, tsk); + this_cpu_write(fpu_owner_task, tsk); } /* @@ -344,7 +344,7 @@ typedef struct { int preload; } fpu_switch_t; */ static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { - return new == percpu_read_stable(fpu_owner_task) && + return new == this_cpu_read_stable(fpu_owner_task) && cpu == new->thread.fpu.last_cpu; } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 268c783ab1c..18d9005d9e4 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); +extern int modifying_ftrace_code; static inline unsigned long ftrace_call_adjust(unsigned long addr) { @@ -50,6 +51,8 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; +int ftrace_int3_handler(struct pt_regs *regs); + #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 382f75d735f..d3895dbf4dd 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -35,14 +35,15 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT -#define inc_irq_stat(member) percpu_inc(irq_stat.member) +#define inc_irq_stat(member) this_cpu_inc(irq_stat.member) -#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) +#define local_softirq_pending() this_cpu_read(irq_stat.__softirq_pending) #define __ARCH_SET_SOFTIRQ_PENDING -#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x)) -#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x)) +#define set_softirq_pending(x) \ + this_cpu_write(irq_stat.__softirq_pending, (x)) +#define or_softirq_pending(x) this_cpu_or(irq_stat.__softirq_pending, (x)) extern void ack_bad_irq(unsigned int irq); diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index ee52760549f..b04cbdb138c 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -144,6 +144,12 @@ typedef struct compat_siginfo { int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ int _fd; } _sigpoll; + + struct { + unsigned int _call_addr; /* calling insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } _sigsys; } _sifields; } compat_siginfo_t; diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2c4943de515..73d8c5398ea 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -5,7 +5,7 @@ #include <asm/mpspec.h> #include <asm/apicdef.h> #include <asm/irq_vectors.h> - +#include <asm/x86_init.h> /* * Intel IO-APIC support for SMP and UP systems. * @@ -21,15 +21,6 @@ #define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) #define IO_APIC_REDIR_MASKED (1 << 16) -struct io_apic_ops { - void (*init) (void); - unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); -}; - -void __init set_io_apic_ops(const struct io_apic_ops *); - /* * The structure of the IO-APIC: */ @@ -156,7 +147,6 @@ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); void setup_IO_APIC_irq_extra(u32 gsi); -extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); @@ -185,12 +175,29 @@ extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); +extern void __init native_io_apic_init_mappings(void); +extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); +extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); +extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + return x86_io_apic_ops.read(apic, reg); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + x86_io_apic_ops.write(apic, reg, value); +} +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + x86_io_apic_ops.modify(apic, reg, value); +} #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; -static inline void ioapic_and_gsi_init(void) { } static inline void ioapic_insert_resources(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } @@ -212,6 +219,10 @@ static inline int restore_ioapic_entries(void) static inline void mp_save_irq(struct mpc_intsrc *m) { }; static inline void disable_ioapic_support(void) { } +#define native_io_apic_init_mappings NULL +#define native_io_apic_read NULL +#define native_io_apic_write NULL +#define native_io_apic_modify NULL #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h index 77843225b7e..d82250b1deb 100644 --- a/arch/x86/include/asm/irq_regs.h +++ b/arch/x86/include/asm/irq_regs.h @@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) { - return percpu_read(irq_regs); + return this_cpu_read(irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) @@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) struct pt_regs *old_regs; old_regs = get_irq_regs(); - percpu_write(irq_regs, new_regs); + this_cpu_write(irq_regs, new_regs); return old_regs; } diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 47d99934580..5fb9bbbd2f1 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -1,45 +1,101 @@ -#ifndef _ASM_X86_IRQ_REMAPPING_H -#define _ASM_X86_IRQ_REMAPPING_H +/* + * Copyright (C) 2012 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This header file contains the interface of the interrupt remapping code to + * the x86 interrupt management code. + */ -#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) +#ifndef __X86_IRQ_REMAPPING_H +#define __X86_IRQ_REMAPPING_H + +#include <asm/io_apic.h> #ifdef CONFIG_IRQ_REMAP -static void irq_remap_modify_chip_defaults(struct irq_chip *chip); -static inline void prepare_irte(struct irte *irte, int vector, - unsigned int dest) + +extern int irq_remapping_enabled; + +extern void setup_irq_remapping_ops(void); +extern int irq_remapping_supported(void); +extern int irq_remapping_prepare(void); +extern int irq_remapping_enable(void); +extern void irq_remapping_disable(void); +extern int irq_remapping_reenable(int); +extern int irq_remap_enable_fault_handling(void); +extern int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, + int vector, + struct io_apic_irq_attr *attr); +extern int set_remapped_irq_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force); +extern void free_remapped_irq(int irq); +extern void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id); +extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec); +extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle); +extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); + +#else /* CONFIG_IRQ_REMAP */ + +#define irq_remapping_enabled 0 + +static inline void setup_irq_remapping_ops(void) { } +static inline int irq_remapping_supported(void) { return 0; } +static inline int irq_remapping_prepare(void) { return -ENODEV; } +static inline int irq_remapping_enable(void) { return -ENODEV; } +static inline void irq_remapping_disable(void) { } +static inline int irq_remapping_reenable(int eim) { return -ENODEV; } +static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } +static inline int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, + int vector, + struct io_apic_irq_attr *attr) +{ + return -ENODEV; +} +static inline int set_remapped_irq_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force) { - memset(irte, 0, sizeof(*irte)); - - irte->present = 1; - irte->dst_mode = apic->irq_dest_mode; - /* - * Trigger mode in the IRTE will always be edge, and for IO-APIC, the - * actual level or edge trigger will be setup in the IO-APIC - * RTE. This will help simplify level triggered irq migration. - * For more details, see the comments (in io_apic.c) explainig IO-APIC - * irq migration in the presence of interrupt-remapping. - */ - irte->trigger_mode = 0; - irte->dlvry_mode = apic->irq_delivery_mode; - irte->vector = vector; - irte->dest_id = IRTE_DEST(dest); - irte->redir_hint = 1; + return 0; } -static inline bool irq_remapped(struct irq_cfg *cfg) +static inline void free_remapped_irq(int irq) { } +static inline void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) { - return cfg->irq_2_iommu.iommu != NULL; } -#else -static void prepare_irte(struct irte *irte, int vector, unsigned int dest) +static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) { + return -ENODEV; } -static inline bool irq_remapped(struct irq_cfg *cfg) +static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) { - return false; + return -ENODEV; } -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) +static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) { + return -ENODEV; } -#endif +#endif /* CONFIG_IRQ_REMAP */ -#endif /* _ASM_X86_IRQ_REMAPPING_H */ +#endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 734c3767cfa..183922e13de 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -170,6 +170,9 @@ static inline int kvm_para_available(void) unsigned int eax, ebx, ecx, edx; char signature[13]; + if (boot_cpu_data.cpuid_level < 0) + return 0; /* So we don't blow up on old processors */ + cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); memcpy(signature + 0, &ebx, 4); memcpy(signature + 4, &ecx, 4); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 69021528b43..cdbf3677610 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -25,8 +25,8 @@ void destroy_context(struct mm_struct *mm); static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -37,8 +37,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (likely(prev != next)) { #ifdef CONFIG_SMP - percpu_write(cpu_tlbstate.state, TLBSTATE_OK); - percpu_write(cpu_tlbstate.active_mm, next); + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); #endif cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -56,8 +56,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, } #ifdef CONFIG_SMP else { - percpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { /* We were in lazy tlb mode and leave_mm disabled diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ccb805966f6..957ec87385a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -134,6 +134,8 @@ #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1) #define MSR_AMD64_IBSOPCTL 0xc0011033 #define MSR_AMD64_IBSOPRIP 0xc0011034 #define MSR_AMD64_IBSOPDATA 0xc0011035 @@ -141,8 +143,11 @@ #define MSR_AMD64_IBSOPDATA3 0xc0011037 #define MSR_AMD64_IBSDCLINAD 0xc0011038 #define MSR_AMD64_IBSDCPHYSAD 0xc0011039 +#define MSR_AMD64_IBSOP_REG_COUNT 7 +#define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1) #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ /* Fam 15h MSRs */ #define MSR_F15H_PERF_CTL 0xc0010200 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index fd3f9f18cf3..0e3793b821e 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -27,6 +27,8 @@ void arch_trigger_all_cpu_backtrace(void); enum { NMI_LOCAL=0, NMI_UNKNOWN, + NMI_SERR, + NMI_IO_CHECK, NMI_MAX }; @@ -35,8 +37,24 @@ enum { typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *); -int register_nmi_handler(unsigned int, nmi_handler_t, unsigned long, - const char *); +struct nmiaction { + struct list_head list; + nmi_handler_t handler; + unsigned long flags; + const char *name; +}; + +#define register_nmi_handler(t, fn, fg, n) \ +({ \ + static struct nmiaction fn##_na = { \ + .handler = (fn), \ + .name = (n), \ + .flags = (fg), \ + }; \ + __register_nmi_handler((t), &fn##_na); \ +}) + +int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index ade619ff9e2..ef17af01347 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,8 +15,8 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) -#define THREAD_ORDER 1 -#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 7639dbf5d22..320f7bb95f7 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,8 +1,8 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_ORDER 1 -#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) #define EXCEPTION_STACK_ORDER 0 diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 7a11910a63c..d9b8e3f7f42 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -46,7 +46,7 @@ #ifdef CONFIG_SMP #define __percpu_prefix "%%"__stringify(__percpu_seg)":" -#define __my_cpu_offset percpu_read(this_cpu_off) +#define __my_cpu_offset this_cpu_read(this_cpu_off) /* * Compared to the generic __my_cpu_offset version, the following @@ -351,23 +351,15 @@ do { \ }) /* - * percpu_read() makes gcc load the percpu variable every time it is - * accessed while percpu_read_stable() allows the value to be cached. - * percpu_read_stable() is more efficient and can be used if its value + * this_cpu_read() makes gcc load the percpu variable every time it is + * accessed while this_cpu_read_stable() allows the value to be cached. + * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across cpus. The current users include * get_current() and get_thread_info() both of which are actually * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. */ -#define percpu_read(var) percpu_from_op("mov", var, "m" (var)) -#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) -#define percpu_write(var, val) percpu_to_op("mov", var, val) -#define percpu_add(var, val) percpu_add_op(var, val) -#define percpu_sub(var, val) percpu_add_op(var, -(val)) -#define percpu_and(var, val) percpu_to_op("and", var, val) -#define percpu_or(var, val) percpu_to_op("or", var, val) -#define percpu_xor(var, val) percpu_to_op("xor", var, val) -#define percpu_inc(var) percpu_unary_op("inc", var) +#define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) #define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -512,7 +504,11 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, { unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; - return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +#ifdef CONFIG_X86_64 + return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0; +#else + return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0; +#endif } static inline int x86_this_cpu_variable_test_bit(int nr, diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 2291895b183..588f52ea810 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -158,6 +158,7 @@ struct x86_pmu_capability { #define IBS_CAPS_OPCNT (1U<<4) #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) +#define IBS_CAPS_RIPINVALIDCHK (1U<<7) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -170,21 +171,28 @@ struct x86_pmu_capability { #define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) #define IBSCTL_LVT_OFFSET_MASK 0x0F -/* IbsFetchCtl bits/masks */ +/* ibs fetch bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT 0xFFFF0000ULL #define IBS_FETCH_MAX_CNT 0x0000FFFFULL -/* IbsOpCtl bits */ +/* ibs op bits/masks */ +/* lower 4 bits of the current count are ignored: */ +#define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_RIP_INVALID (1ULL<<38) +#ifdef CONFIG_X86_LOCAL_APIC extern u32 get_ibs_caps(void); +#else +static inline u32 get_ibs_caps(void) { return 0; } +#endif #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4fa7dcceb6c..ccbb1ea99cc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -974,8 +974,6 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ -void cpu_idle_wait(void); - extern unsigned long arch_align_stack(unsigned long sp); extern void free_init_pages(char *what, unsigned long begin, unsigned long end); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 0434c400287..f48394513c3 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -62,6 +62,8 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); /* Static state in head.S used to set up a CPU */ extern unsigned long stack_start; /* Initial stack pointer address */ +struct task_struct; + struct smp_ops { void (*smp_prepare_boot_cpu)(void); void (*smp_prepare_cpus)(unsigned max_cpus); @@ -70,7 +72,7 @@ struct smp_ops { void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); - int (*cpu_up)(unsigned cpu); + int (*cpu_up)(unsigned cpu, struct task_struct *tidle); int (*cpu_disable)(void); void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); @@ -113,9 +115,9 @@ static inline void smp_cpus_done(unsigned int max_cpus) smp_ops.smp_cpus_done(max_cpus); } -static inline int __cpu_up(unsigned int cpu) +static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - return smp_ops.cpu_up(cpu); + return smp_ops.cpu_up(cpu, tidle); } static inline int __cpu_disable(void) @@ -152,7 +154,7 @@ void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); -int native_cpu_up(unsigned int cpunum); +int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); void native_play_dead(void); @@ -162,6 +164,7 @@ int wbinvd_on_all_cpus(void); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); +void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) @@ -188,11 +191,11 @@ extern unsigned disabled_cpus __cpuinitdata; * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (percpu_read(cpu_number)) +#define raw_smp_processor_id() (this_cpu_read(cpu_number)) extern int safe_smp_processor_id(void); #elif defined(CONFIG_X86_64_SMP) -#define raw_smp_processor_id() (percpu_read(cpu_number)) +#define raw_smp_processor_id() (this_cpu_read(cpu_number)) #define stack_smp_processor_id() \ ({ \ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 76bfa2cf301..b315a33867f 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -20,10 +20,8 @@ #ifdef CONFIG_X86_32 # define LOCK_PTR_REG "a" -# define REG_PTR_MODE "k" #else # define LOCK_PTR_REG "D" -# define REG_PTR_MODE "q" #endif #if defined(CONFIG_X86_32) && \ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index b5d9533d2c3..6a998598f17 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -75,9 +75,9 @@ static __always_inline void boot_init_stack_canary(void) current->stack_canary = canary; #ifdef CONFIG_X86_64 - percpu_write(irq_stack_union.stack_canary, canary); + this_cpu_write(irq_stack_union.stack_canary, canary); #else - percpu_write(stack_canary.canary, canary); + this_cpu_write(stack_canary.canary, canary); #endif } diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h index e0b1d9bbcbc..7b3ddc34858 100644 --- a/arch/x86/include/asm/stat.h +++ b/arch/x86/include/asm/stat.h @@ -25,6 +25,12 @@ struct stat { unsigned long __unused5; }; +/* We don't need to memset the whole thing just to initialize the padding */ +#define INIT_STRUCT_STAT_PADDING(st) do { \ + st.__unused4 = 0; \ + st.__unused5 = 0; \ +} while (0) + #define STAT64_HAS_BROKEN_ST_INO 1 /* This matches struct stat64 in glibc2.1, hence the absolutely @@ -63,6 +69,12 @@ struct stat64 { unsigned long long st_ino; }; +/* We don't need to memset the whole thing just to initialize the padding */ +#define INIT_STRUCT_STAT64_PADDING(st) do { \ + memset(&st.__pad0, 0, sizeof(st.__pad0)); \ + memset(&st.__pad3, 0, sizeof(st.__pad3)); \ +} while (0) + #else /* __i386__ */ struct stat { @@ -87,6 +99,15 @@ struct stat { unsigned long st_ctime_nsec; long __unused[3]; }; + +/* We don't need to memset the whole thing just to initialize the padding */ +#define INIT_STRUCT_STAT_PADDING(st) do { \ + st.__pad0 = 0; \ + st.__unused[0] = 0; \ + st.__unused[1] = 0; \ + st.__unused[2] = 0; \ +} while (0) + #endif /* for 32bit emulation and 32 bit kernels */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 386b78686c4..1ace47b6259 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -13,9 +13,11 @@ #ifndef _ASM_X86_SYSCALL_H #define _ASM_X86_SYSCALL_H +#include <linux/audit.h> #include <linux/sched.h> #include <linux/err.h> #include <asm/asm-offsets.h> /* For NR_syscalls */ +#include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> extern const unsigned long sys_call_table[]; @@ -88,6 +90,12 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->bx + i, args, n * sizeof(args[0])); } +static inline int syscall_get_arch(struct task_struct *task, + struct pt_regs *regs) +{ + return AUDIT_ARCH_I386; +} + #else /* CONFIG_X86_64 */ static inline void syscall_get_arguments(struct task_struct *task, @@ -212,6 +220,25 @@ static inline void syscall_set_arguments(struct task_struct *task, } } +static inline int syscall_get_arch(struct task_struct *task, + struct pt_regs *regs) +{ +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entry and then + * remains set until we return to user mode. + * + * TIF_IA32 tasks should always have TS_COMPAT set at + * system call time. + * + * x32 tasks should be considered AUDIT_ARCH_X86_64. + */ + if (task_thread_info(task)->status & TS_COMPAT) + return AUDIT_ARCH_I386; +#endif + /* Both x32 and x86_64 are considered "64-bit". */ + return AUDIT_ARCH_X86_64; +} #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6df8ccd71..3c9aebc00d3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -155,24 +155,6 @@ struct thread_info { #define PREEMPT_ACTIVE 0x10000000 -/* thread information allocation */ -#ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) -#else -#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) -#endif - -#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR - -#define alloc_thread_info_node(tsk, node) \ -({ \ - struct page *page = alloc_pages_node(node, THREAD_FLAGS, \ - THREAD_ORDER); \ - struct thread_info *ret = page ? page_address(page) : NULL; \ - \ - ret; \ -}) - #ifdef CONFIG_X86_32 #define STACK_WARN (THREAD_SIZE/8) @@ -222,7 +204,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - ti = (void *)(percpu_read_stable(kernel_stack) + + ti = (void *)(this_cpu_read_stable(kernel_stack) + KERNEL_STACK_OFFSET - THREAD_SIZE); return ti; } @@ -282,8 +264,7 @@ static inline bool is_ia32_task(void) #ifndef __ASSEMBLY__ extern void arch_task_cache_init(void); -extern void free_thread_info(struct thread_info *ti); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -#define arch_task_cache_init arch_task_cache_init +extern void arch_release_task_struct(struct task_struct *tsk); #endif #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index c0e108e0807..1620d23f14d 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -156,8 +156,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); static inline void reset_lazy_tlbstate(void) { - percpu_write(cpu_tlbstate.state, 0); - percpu_write(cpu_tlbstate.active_mm, &init_mm); + this_cpu_write(cpu_tlbstate.state, 0); + this_cpu_write(cpu_tlbstate.active_mm, &init_mm); } #endif /* SMP */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index b9676ae37ad..095b21507b6 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) -#ifdef CONFIG_X86_32 -# define SD_CACHE_NICE_TRIES 1 -# define SD_IDLE_IDX 1 -#else -# define SD_CACHE_NICE_TRIES 2 -# define SD_IDLE_IDX 2 -#endif - -/* sched_domains SD_NODE_INIT for NUMA machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = SD_CACHE_NICE_TRIES, \ - .busy_idx = 3, \ - .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_PREFER_LOCAL \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_POWERSAVINGS_BALANCE \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 1*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ -} - extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 764b66a4cf8..c090af10ac7 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -188,11 +188,18 @@ struct x86_msi_ops { void (*restore_msi_irqs)(struct pci_dev *dev, int irq); }; +struct x86_io_apic_ops { + void (*init) (void); + unsigned int (*read) (unsigned int apic, unsigned int reg); + void (*write) (unsigned int apic, unsigned int reg, unsigned int value); + void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); +}; + extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; - +extern struct x86_io_apic_ops x86_io_apic_ops; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 532d2e090e6..56ebd1f9844 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds +extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a415b1f4436..7c439fe4941 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -593,7 +593,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void __cpuinitdata acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index edc24480469..39a222e094a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -35,6 +35,7 @@ #include <linux/smp.h> #include <linux/mm.h> +#include <asm/irq_remapping.h> #include <asm/perf_event.h> #include <asm/x86_init.h> #include <asm/pgalloc.h> @@ -1325,11 +1326,13 @@ void __cpuinit setup_local_APIC(void) acked); break; } - if (cpu_has_tsc) { - rdtscll(ntsc); - max_loops = (cpu_khz << 10) - (ntsc - tsc); - } else - max_loops--; + if (queued) { + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } } while (queued && max_loops > 0); WARN_ON(max_loops <= 0); @@ -1441,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void) * Now that local APIC setup is completed for BP, configure the fault * handling for interrupt remapping. */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); + if (irq_remapping_enabled) + irq_remap_enable_fault_handling(); } @@ -1517,7 +1520,7 @@ void enable_x2apic(void) int __init enable_IR(void) { #ifdef CONFIG_IRQ_REMAP - if (!intr_remapping_supported()) { + if (!irq_remapping_supported()) { pr_debug("intr-remapping not supported\n"); return -1; } @@ -1528,7 +1531,7 @@ int __init enable_IR(void) return -1; } - return enable_intr_remapping(); + return irq_remapping_enable(); #endif return -1; } @@ -1537,10 +1540,13 @@ void __init enable_IR_x2apic(void) { unsigned long flags; int ret, x2apic_enabled = 0; - int dmar_table_init_ret; + int hardware_init_ret; + + /* Make sure irq_remap_ops are initialized */ + setup_irq_remapping_ops(); - dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret && !x2apic_supported()) + hardware_init_ret = irq_remapping_prepare(); + if (hardware_init_ret && !x2apic_supported()) return; ret = save_ioapic_entries(); @@ -1556,7 +1562,7 @@ void __init enable_IR_x2apic(void) if (x2apic_preenabled && nox2apic) disable_x2apic(); - if (dmar_table_init_ret) + if (hardware_init_ret) ret = -1; else ret = enable_IR(); @@ -2176,8 +2182,8 @@ static int lapic_suspend(void) local_irq_save(flags); disable_local_APIC(); - if (intr_remapping_enabled) - disable_intr_remapping(); + if (irq_remapping_enabled) + irq_remapping_disable(); local_irq_restore(flags); return 0; @@ -2193,7 +2199,7 @@ static void lapic_resume(void) return; local_irq_save(flags); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { /* * IO-APIC and PIC have their own resume routines. * We just mask them here to make sure the interrupt @@ -2245,8 +2251,8 @@ static void lapic_resume(void) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (intr_remapping_enabled) - reenable_intr_remapping(x2apic_mode); + if (irq_remapping_enabled) + irq_remapping_reenable(x2apic_mode); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 359b6899a36..0e881c46e8c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -227,6 +227,7 @@ static struct apic apic_flat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -386,6 +387,7 @@ static struct apic apic_physflat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 634ae6cdd5c..a6e4c6e06c0 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -181,6 +181,7 @@ struct apic apic_noop = { .read = noop_apic_read, .write = noop_apic_write, + .eoi_write = noop_apic_write, .icr_read = noop_apic_icr_read, .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 23e75422e01..6ec6d5d297c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -295,6 +295,7 @@ static struct apic apic_numachip __refconst = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 0cdec7065af..31fbdbfbf96 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -248,6 +248,7 @@ static struct apic apic_bigsmp = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e42d1d3b913..db4ab1be3c7 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e88300d8e80..ffdc152e507 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -68,23 +68,21 @@ #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) -static void __init __ioapic_init_mappings(void); - -static unsigned int __io_apic_read (unsigned int apic, unsigned int reg); -static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); -static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); - -static struct io_apic_ops io_apic_ops = { - .init = __ioapic_init_mappings, - .read = __io_apic_read, - .write = __io_apic_write, - .modify = __io_apic_modify, -}; - -void __init set_io_apic_ops(const struct io_apic_ops *ops) +#ifdef CONFIG_IRQ_REMAP +static void irq_remap_modify_chip_defaults(struct irq_chip *chip); +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return cfg->irq_2_iommu.iommu != NULL; +} +#else +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return false; +} +static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) { - io_apic_ops = *ops; } +#endif /* * Is the SiS APIC rmw bug present ? @@ -313,21 +311,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg) irq_free_desc(at); } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - return io_apic_ops.read(apic, reg); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - io_apic_ops.write(apic, reg, value); -} - -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - io_apic_ops.modify(apic, reg, value); -} - struct io_apic { unsigned int index; @@ -349,14 +332,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) writel(vector, &io_apic->eoi); } -static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(reg, &io_apic->index); return readl(&io_apic->data); } -static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -370,7 +353,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va * * Older SiS APIC requires we rewrite the index register */ -static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -379,29 +362,6 @@ static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int v writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - int pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -1361,77 +1321,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, fasteoi ? "fasteoi" : "edge"); } - -static int setup_ir_ioapic_entry(int irq, - struct IR_IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - int index; - struct irte irte; - int ioapic_id = mpc_ioapic_id(attr->ioapic); - struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); - - if (!iommu) { - pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); - return -ENODEV; - } - - index = alloc_irte(iommu, irq, 1); - if (index < 0) { - pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); - return -ENOMEM; - } - - prepare_irte(&irte, vector, destination); - - /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, ioapic_id); - - modify_irte(irq, &irte); - - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " - "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " - "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " - "Avail:%X Vector:%02X Dest:%08X " - "SID:%04X SQ:%X SVT:%X)\n", - attr->ioapic, irte.present, irte.fpd, irte.dst_mode, - irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, - irte.avail, irte.vector, irte.dest_id, - irte.sid, irte.sq, irte.svt); - - memset(entry, 0, sizeof(*entry)); - - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = attr->ioapic_pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) { - if (intr_remapping_enabled) - return setup_ir_ioapic_entry(irq, - (struct IR_IO_APIC_route_entry *)entry, - destination, vector, attr); + if (irq_remapping_enabled) + return setup_ioapic_remapped_entry(irq, entry, destination, + vector, attr); memset(entry, 0, sizeof(*entry)); @@ -1588,7 +1484,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, { struct IO_APIC_route_entry entry; - if (intr_remapping_enabled) + if (irq_remapping_enabled) return; memset(&entry, 0, sizeof(entry)); @@ -1674,7 +1570,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" " Pol Stat Indx2 Zero Vect:\n"); } else { @@ -1683,7 +1579,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) } for (i = 0; i <= reg_01.bits.entries; i++) { - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { struct IO_APIC_route_entry entry; struct IR_IO_APIC_route_entry *ir_entry; @@ -2050,7 +1946,7 @@ void disable_IO_APIC(void) * IOAPIC RTE as well as interrupt-remapping table entry). * As this gets called during crash dump, keep this simple for now. */ - if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { + if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -2074,7 +1970,7 @@ void disable_IO_APIC(void) * Use virtual wire A mode when interrupt remapping is enabled. */ if (cpu_has_apic || apic_from_smp_config()) - disconnect_bsp_APIC(!intr_remapping_enabled && + disconnect_bsp_APIC(!irq_remapping_enabled && ioapic_i8259.pin != -1); } @@ -2390,71 +2286,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -#ifdef CONFIG_IRQ_REMAP - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For both level and edge triggered, irq migration is a simple atomic - * update(of vector and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we eliminate the io-apic RTE modification (with the - * updated vector information), by using a virtual vector (io-apic pin number). - * Real vector that is used for interrupting cpu will be coming from - * the interrupt-remapping table entry. - * - * As the migration is a simple atomic update of IRTE, the same mechanism - * is used to migrate MSI irq's in the presence of interrupt-remapping. - */ -static int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest, irq = data->irq; - struct irte irte; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - if (get_irte(irq, &irte)) - return -EBUSY; - - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; - - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Atomically updates the IRTE with the new destination, vector - * and flushes the interrupt entry cache. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - cpumask_copy(data->affinity, mask); - return 0; -} - -#else -static inline int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - return 0; -} -#endif - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2552,6 +2383,29 @@ static void ack_apic_edge(struct irq_data *data) atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} + static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ @@ -2699,7 +2553,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_eoi = ir_ack_apic_level; #ifdef CONFIG_SMP - chip->irq_set_affinity = ir_ioapic_set_affinity; + chip->irq_set_affinity = set_remapped_irq_affinity; #endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2912,7 +2766,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; @@ -2945,7 +2799,7 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); @@ -3169,7 +3023,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); if (irq_remapped(cfg)) - free_irte(irq); + free_remapped_irq(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); @@ -3198,54 +3052,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); if (irq_remapped(cfg)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - prepare_irte(&irte, cfg->vector, dest); - - /* Set source-id of interrupt request */ - if (pdev) - set_msi_sid(&irte, pdev); - else - set_hpet_sid(&irte, hpet_id); - - modify_irte(irq, &irte); + compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); + return err; + } + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else { - if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(dest); - else - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } return err; } @@ -3288,33 +3122,6 @@ static struct irq_chip msi_chip = { .irq_retrigger = ioapic_retrigger_irq, }; -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} - static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { struct irq_chip *chip = &msi_chip; @@ -3345,7 +3152,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; struct msi_desc *msidesc; - struct intel_iommu *iommu = NULL; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) @@ -3359,7 +3165,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq == 0) return -1; irq_want = irq + 1; - if (!intr_remapping_enabled) + if (!irq_remapping_enabled) goto no_ir; if (!sub_handle) { @@ -3367,23 +3173,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) * allocate the consecutive block of IRTE's * for 'nvec' */ - index = msi_alloc_irte(dev, irq, nvec); + index = msi_alloc_remapped_irq(dev, irq, nvec); if (index < 0) { ret = index; goto error; } } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; + ret = msi_setup_remapped_irq(dev, irq, index, + sub_handle); + if (ret < 0) goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); } no_ir: ret = setup_msi_irq(dev, msidesc, irq); @@ -3501,15 +3300,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) struct msi_msg msg; int ret; - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_hpet_to_ir(id); - int index; - - if (!iommu) - return -1; - - index = alloc_irte(iommu, irq, 1); - if (index < 0) + if (irq_remapping_enabled) { + if (!setup_hpet_msi_remapped(irq, id)) return -1; } @@ -3888,8 +3680,8 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - if (intr_remapping_enabled) - ir_ioapic_set_affinity(idata, mask, false); + if (irq_remapping_enabled) + set_remapped_irq_affinity(idata, mask, false); else ioapic_set_affinity(idata, mask, false); } @@ -3931,12 +3723,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_and_gsi_init(void) -{ - io_apic_ops.init(); -} - -static void __init __ioapic_init_mappings(void) +void __init native_io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 00d2422ca7c..f00a68cca37 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index ff2c1b9aac4..1b291da09e6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -142,6 +142,7 @@ static struct apic apic_default = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index fea000b27f0..659897c0075 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -546,6 +546,7 @@ static struct apic apic_summit = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 48f3103b3c9..ff35cff0e1a 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 991e315f422..c17e982db27 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -172,6 +172,7 @@ static struct apic apic_x2apic_phys = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 87bfa69e216..c6d03f7a440 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 459e78cbf61..07b0c0db466 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2401,7 +2401,7 @@ static void __exit apm_exit(void) * (pm_idle), Wait for all processors to update cached/local * copies of pm_idle before proceeding. */ - cpu_idle_wait(); + kick_all_cpus_sync(); } if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) && (apm_info.connection_version > 0x0100)) { diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 5da1269e8dd..e2dbcb7dabd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -27,21 +27,29 @@ static int num_scan_areas; static __init int set_corruption_check(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - memory_corruption_check = simple_strtol(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 0 : -EINVAL; + memory_corruption_check = val; + return 0; } early_param("memory_corruption_check", set_corruption_check); static __init int set_corruption_check_period(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - corruption_check_period = simple_strtoul(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 0 : -EINVAL; + corruption_check_period = val; + return 0; } early_param("memory_corruption_check_period", set_corruption_check_period); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cf79302198a..82f29e70d05 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1185,7 +1185,7 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(numa_node) == 0 && + if (cpu != 0 && this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index b8f3653dddb..9a7c90d80bc 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid >> index_msb; + l2_id = c->apicid & ~((1 << index_msb) - 1); break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order( num_threads_sharing); - l3_id = c->apicid >> index_msb; + l3_id = c->apicid & ~((1 << index_msb) - 1); break; default: break; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 5502b289341..36565373af8 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -23,7 +23,7 @@ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) * * Arrays used to match for this should also be declared using - * MODULE_DEVICE_TABLE(x86_cpu, ...) + * MODULE_DEVICE_TABLE(x86cpu, ...) * * This always matches against the boot cpu, assuming models and features are * consistent over all CPUs. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087..297edb1b1fb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -583,7 +583,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - percpu_inc(mce_poll_count); + this_cpu_inc(mce_poll_count); mce_gather_info(&m, NULL); @@ -945,9 +945,10 @@ struct mce_info { atomic_t inuse; struct task_struct *t; __u64 paddr; + int restartable; } mce_info[MCE_INFO_MAX]; -static void mce_save_info(__u64 addr) +static void mce_save_info(__u64 addr, int c) { struct mce_info *mi; @@ -955,6 +956,7 @@ static void mce_save_info(__u64 addr) if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { mi->t = current; mi->paddr = addr; + mi->restartable = c; return; } } @@ -1015,7 +1017,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - percpu_inc(mce_exception_count); + this_cpu_inc(mce_exception_count); if (!banks) goto out; @@ -1130,7 +1132,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst == MCE_AR_SEVERITY) { /* schedule action before return to userland */ - mce_save_info(m.addr); + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); set_thread_flag(TIF_MCE_NOTIFY); } else if (kill_it) { force_sig(SIGBUS, current); @@ -1179,7 +1181,13 @@ void mce_notify_process(void) pr_err("Uncorrected hardware memory error in user-access at %llx", mi->paddr); - if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) { + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || + mi->restartable == 0) { pr_err("Memory error not recovered"); force_sig(SIGBUS, current); } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bb8e03407e1..e049d6da018 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; - - /* mark not used */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); @@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 95e7fe1c5f0..65652265fff 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event) static int amd_pmu_hw_config(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); + int ret; + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + ret = x86_pmu_hw_config(event); if (ret) return ret; @@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * when we come here */ for (i = 0; i < x86_pmu.num_counters; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); + if (cmpxchg(nb->owners + i, event, NULL) == event) break; - } } } diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3b8a2d30d14..da9bcdcd985 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -9,6 +9,7 @@ #include <linux/perf_event.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/ptrace.h> #include <asm/apic.h> @@ -16,36 +17,591 @@ static u32 ibs_caps; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) -static struct pmu perf_ibs; +#include <linux/kprobes.h> +#include <linux/hardirq.h> + +#include <asm/nmi.h> + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + struct cpu_perf_ibs __percpu *pcpu; + u64 (*get_count)(u64 config); +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} static int perf_ibs_init(struct perf_event *event) { - if (perf_ibs.type != event->attr.type) + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + int ret; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) return -ENOENT; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + return 0; } +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int overflow; + + /* ignore lower 4 bits in min count: */ + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 *config) +{ + u64 count = perf_ibs->get_count(*config); + + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. + */ + while (!perf_event_try_update(event, count, 64)) { + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); + } +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. + */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + static int perf_ibs_add(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + return 0; } static void perf_ibs_del(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); } -static struct pmu perf_ibs = { - .event_init= perf_ibs_init, - .add= perf_ibs_add, - .del= perf_ibs_del, +static void perf_ibs_read(struct perf_event *event) { } + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + + .get_count = get_ibs_fetch_count, }; +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + + .get_count = get_ibs_op_count, +}; + +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size, check_rip, offset_max, throttle = 0; + unsigned int msr; + u64 *buf, *config, period; + + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incomming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; + } + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + config = &ibs_data.regs[0]; + perf_ibs_event_update(perf_ibs, event, config); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { + instruction_pointer_set(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } + + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.size = sizeof(u32) + ibs_data.size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + throttle = perf_event_overflow(event, &data, ®s); +out: + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); + + return 1; +} + +static int __kprobes +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + static __init int perf_event_ibs_init(void) { if (!ibs_caps) return -ENODEV; /* ibs not supported by the cpu */ - perf_pmu_register(&perf_ibs, "ibs", -1); + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ibs_caps & IBS_CAPS_OPCNT) + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 26b3e2fef10..166546ec6ae 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 status; int handled; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1082,7 +1080,7 @@ again: if (!intel_pmu_save_and_restart(event)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f64df19e7d..5a3edc27f6e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs.ip = 0; /* @@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, if (!intel_pmu_save_and_restart(event)) return; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); /* * We use the interrupt regs as a base because the PEBS record diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index a2dfacfd710..47124a73dd7 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) handled += overflow; /* event overflow for sure */ - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) continue; + + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 40989da4bb2..571246d81ed 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err) static int __init kstack_setup(char *s) { + ssize_t ret; + unsigned long val; + if (!s) return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + kstack_depth_to_print = val; return 0; } early_param("kstack", kstack_setup); static int __init code_bytes_setup(char *s) { - code_bytes = simple_strtoul(s, NULL, 0); + ssize_t ret; + unsigned long val; + + if (!s) + return -EINVAL; + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + + code_bytes = val; if (code_bytes > 8192) code_bytes = 8192; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f272f..32ff36596ab 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,40 +24,21 @@ #include <trace/syscall.h> #include <asm/cacheflush.h> +#include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> -#include <asm/nmi.h> - #ifdef CONFIG_DYNAMIC_FTRACE -/* - * modifying_code is set to notify NMIs that they need to use - * memory barriers when entering or exiting. But we don't want - * to burden NMIs with unnecessary memory barriers when code - * modification is not being done (which is most of the time). - * - * A mutex is already held when ftrace_arch_code_modify_prepare - * and post_process are called. No locks need to be taken here. - * - * Stop machine will make sure currently running NMIs are done - * and new NMIs will see the updated variable before we need - * to worry about NMIs doing memory barriers. - */ -static int modifying_code __read_mostly; -static DEFINE_PER_CPU(int, save_modifying_code); - int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); set_all_modules_text_rw(); - modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { - modifying_code = 0; set_all_modules_text_ro(); set_kernel_text_ro(); return 0; @@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -/* - * Modifying code must take extra care. On an SMP machine, if - * the code being modified is also being executed on another CPU - * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. - * But this does not stop NMIs from happening. We still need - * to protect against that. We separate out the modification of - * the code to take care of this. - * - * Two buffers are added: An IP buffer and a "code" buffer. - * - * 1) Put the instruction pointer into the IP buffer - * and the new code into the "code" buffer. - * 2) Wait for any running NMIs to finish and set a flag that says - * we are modifying code, it is done in an atomic operation. - * 3) Write the code - * 4) clear the flag. - * 5) Wait for any running NMIs to finish. - * - * If an NMI is executed, the first thing it does is to call - * "ftrace_nmi_enter". This will check if the flag is set to write - * and if it is, it will write what is in the IP and "code" buffers. - * - * The trick is, it does not matter if everyone is writing the same - * content to the code location. Also, if a CPU is executing code - * it is OK to write to that code location if the contents being written - * are the same as what exists. - */ - -#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ -static atomic_t nmi_running = ATOMIC_INIT(0); -static int mod_code_status; /* holds return value of text write */ -static void *mod_code_ip; /* holds the IP to write to */ -static const void *mod_code_newcode; /* holds the text to write to the IP */ - -static unsigned nmi_wait_count; -static atomic_t nmi_update_count = ATOMIC_INIT(0); - -int ftrace_arch_read_dyn_info(char *buf, int size) -{ - int r; - - r = snprintf(buf, size, "%u %u", - nmi_wait_count, - atomic_read(&nmi_update_count)); - return r; -} - -static void clear_mod_flag(void) -{ - int old = atomic_read(&nmi_running); - - for (;;) { - int new = old & ~MOD_CODE_WRITE_FLAG; - - if (old == new) - break; - - old = atomic_cmpxchg(&nmi_running, old, new); - } -} - -static void ftrace_mod_code(void) -{ - /* - * Yes, more than one CPU process can be writing to mod_code_status. - * (and the code itself) - * But if one were to fail, then they all should, and if one were - * to succeed, then they all should. - */ - mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, - MCOUNT_INSN_SIZE); - - /* if we fail, then kill any new writers */ - if (mod_code_status) - clear_mod_flag(); -} - -void ftrace_nmi_enter(void) -{ - __this_cpu_write(save_modifying_code, modifying_code); - - if (!__this_cpu_read(save_modifying_code)) - return; - - if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { - smp_rmb(); - ftrace_mod_code(); - atomic_inc(&nmi_update_count); - } - /* Must have previous changes seen before executions */ - smp_mb(); -} - -void ftrace_nmi_exit(void) -{ - if (!__this_cpu_read(save_modifying_code)) - return; - - /* Finish all executions before clearing nmi_running */ - smp_mb(); - atomic_dec(&nmi_running); -} - -static void wait_for_nmi_and_set_mod_flag(void) -{ - if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) - return; - - do { - cpu_relax(); - } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); - - nmi_wait_count++; -} - -static void wait_for_nmi(void) -{ - if (!atomic_read(&nmi_running)) - return; - - do { - cpu_relax(); - } while (atomic_read(&nmi_running)); - - nmi_wait_count++; -} - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa(ip)); - mod_code_ip = (void *)ip; - mod_code_newcode = new_code; - - /* The buffers need to be visible before we let NMIs write them */ - smp_mb(); - - wait_for_nmi_and_set_mod_flag(); - - /* Make sure all running NMIs have finished before we write the code */ - smp_mb(); - - ftrace_mod_code(); - - /* Make sure the write happens before clearing the bit */ - smp_mb(); - - clear_mod_flag(); - wait_for_nmi(); - - return mod_code_status; + return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } static const unsigned char *ftrace_nop_replace(void) @@ -334,6 +168,336 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } +int modifying_ftrace_code __read_mostly; + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. + * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. + */ +int ftrace_int3_handler(struct pt_regs *regs) +{ + if (WARN_ON_ONCE(!regs)) + return 0; + + if (!ftrace_location(regs->ip - 1)) + return 0; + + regs->ip += MCOUNT_INSN_SIZE - 1; + + return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + + return probe_kernel_write((void *)ip, val, size); +} + +static int add_break(unsigned long ip, const char *old) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + if (ftrace_write(ip, &brk, 1)) + return -EPERM; + + return 0; +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + + return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ + unsigned const char *old; + + old = ftrace_nop_replace(); + + return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_brk_on_nop(rec); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_brk_on_call(rec, ftrace_addr); + } + return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ + unsigned char ins[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + const unsigned char *nop; + unsigned long ftrace_addr; + unsigned long ip = rec->ip; + + /* If we fail the read, just give up */ + if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* If this does not have a breakpoint, we are done */ + if (ins[0] != brk) + return -1; + + nop = ftrace_nop_replace(); + + /* + * If the last 4 bytes of the instruction do not match + * a nop, then we assume that this is a call to ftrace_addr. + */ + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { + /* + * For extra paranoidism, we check if the breakpoint is on + * a call that would actually jump to the ftrace_addr. + * If not, don't touch the breakpoint, we make just create + * a disaster. + */ + ftrace_addr = (unsigned long)FTRACE_ADDR; + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) + return -EINVAL; + } + + return probe_kernel_write((void *)ip, &nop[0], 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ + /* skip breakpoint */ + ip++; + new++; + if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) + return -EPERM; + return 0; +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_update_nop(rec); + } + + return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + + return 0; +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + return 0; +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_update_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return finish_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return finish_update_nop(rec); + } + + return 0; +} + +static void do_sync_core(void *data) +{ + sync_core(); +} + +static void run_sync(void) +{ + int enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disbled (on bootup). */ + if (enable_irqs) + local_irq_enable(); + on_each_cpu(do_sync_core, NULL, 1); + if (enable_irqs) + local_irq_disable(); +} + +void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *report = "adding breakpoints"; + int count = 0; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_breakpoints(rec, enable); + if (ret) + goto remove_breakpoints; + count++; + } + + run_sync(); + + report = "updating code"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + report = "removing breakpoints"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = finish_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + return; + + remove_breakpoints: + ftrace_bug(ret, rec ? rec->ip : 0); + printk(KERN_WARNING "Failed on %s (%d):\n", report, count); + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + remove_breakpoint(rec); + } +} + +void arch_ftrace_update_code(int command) +{ + modifying_ftrace_code++; + + ftrace_modify_all_code(command); + + modifying_ftrace_code--; +} + int __init ftrace_dyn_arch_init(void *data) { /* The return code is retured via data */ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2d6e6498c17..f250431fb50 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -88,7 +88,7 @@ void kernel_fpu_begin(void) __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ } else { - percpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_owner_task, NULL); clts(); } } diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c deleted file mode 100644 index 43e9ccf4494..00000000000 --- a/arch/x86/kernel/init_task.c +++ /dev/null @@ -1,42 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -/* - * Initial thread structure. - * - * We need to make sure that this is THREAD_SIZE aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); -EXPORT_SYMBOL(init_task); - -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data..cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 58b7f27cb3e..344faf8d0d6 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu) return; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; @@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b8ba6e4a27e..e554e5ad2fe 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -79,7 +79,6 @@ struct kvm_task_sleep_node { u32 token; int cpu; bool halted; - struct mm_struct *mm; }; static struct kvm_task_sleep_head { @@ -126,9 +125,7 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.mm = current->active_mm; n.halted = idle || preempt_count() > 1; - atomic_inc(&n.mm->mm_count); init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); @@ -161,9 +158,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (!n->mm) - return; - mmdrop(n->mm); if (n->halted) smp_send_reschedule(n->cpu); else if (waitqueue_active(&n->wq)) @@ -207,7 +201,7 @@ again: * async PF was not yet handled. * Add dummy entry for the token. */ - n = kmalloc(sizeof(*n), GFP_ATOMIC); + n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { /* * Allocation failed! Busy wait while other cpu @@ -219,7 +213,6 @@ again: } n->token = token; n->cpu = smp_processor_id(); - n->mm = NULL; init_waitqueue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index c9bda6d6035..fbdfc691718 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev, { unsigned long val; int cpu = dev->id; - int ret = 0; - char *end; + ssize_t ret = 0; - val = simple_strtoul(buf, &end, 0); - if (end == buf) - return -EINVAL; + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (val == 1) { get_online_cpus(); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 3ca42d0e43a..0327e2b3c40 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -147,12 +147,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - pr_err("CPU%d not a capable Intel processor\n", cpu_num); - return -1; - } - csig->sig = cpuid_eax(0x00000001); if ((c->x86_model >= 5) || (c->x86 > 6)) { @@ -463,6 +457,14 @@ static struct microcode_ops microcode_intel_ops = { struct microcode_ops * __init init_intel_microcode(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + return µcode_intel_ops; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 03c13454496..bffdfd48c1f 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -31,14 +31,6 @@ #include <asm/nmi.h> #include <asm/x86_init.h> -#define NMI_MAX_NAMELEN 16 -struct nmiaction { - struct list_head list; - nmi_handler_t handler; - unsigned int flags; - char *name; -}; - struct nmi_desc { spinlock_t lock; struct list_head head; @@ -54,6 +46,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] = .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .head = LIST_HEAD_INIT(nmi_desc[2].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .head = LIST_HEAD_INIT(nmi_desc[3].head), + }, }; @@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) -static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -107,11 +107,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, return handled; } -static int __setup_nmi(unsigned int type, struct nmiaction *action) +int __register_nmi_handler(unsigned int type, struct nmiaction *action) { struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; + if (!action->handler) + return -EINVAL; + spin_lock_irqsave(&desc->lock, flags); /* @@ -120,6 +123,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) * to manage expectations */ WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); /* * some handlers need to be executed first otherwise a fake @@ -133,8 +138,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) spin_unlock_irqrestore(&desc->lock, flags); return 0; } +EXPORT_SYMBOL(__register_nmi_handler); -static struct nmiaction *__free_nmi(unsigned int type, const char *name) +void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *n; @@ -157,61 +163,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name) spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); - return (n); } - -int register_nmi_handler(unsigned int type, nmi_handler_t handler, - unsigned long nmiflags, const char *devname) -{ - struct nmiaction *action; - int retval = -ENOMEM; - - if (!handler) - return -EINVAL; - - action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL); - if (!action) - goto fail_action; - - action->handler = handler; - action->flags = nmiflags; - action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL); - if (!action->name) - goto fail_action_name; - - retval = __setup_nmi(type, action); - - if (retval) - goto fail_setup_nmi; - - return retval; - -fail_setup_nmi: - kfree(action->name); -fail_action_name: - kfree(action); -fail_action: - - return retval; -} -EXPORT_SYMBOL_GPL(register_nmi_handler); - -void unregister_nmi_handler(unsigned int type, const char *name) -{ - struct nmiaction *a; - - a = __free_nmi(type, name); - if (a) { - kfree(a->name); - kfree(a); - } -} - EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static notrace __kprobes void +static __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_SERR, regs, false)) + return; + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -236,11 +197,15 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_IO_CHECK, regs, false)) + return; + pr_emerg( "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -263,7 +228,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { int handled; @@ -305,7 +270,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 2c39dcd510f..ff369862508 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -13,6 +13,7 @@ #include <linux/cpumask.h> #include <linux/delay.h> #include <linux/init.h> +#include <linux/percpu.h> #include <asm/apic.h> #include <asm/nmi.h> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ab137605e69..9ce885996fd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - percpu_write(paravirt_lazy_mode, mode); + this_cpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != mode); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); - percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); + this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev) { BUG_ON(preemptible()); - if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } @@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return percpu_read(paravirt_lazy_mode); + return this_cpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index d0b2fb9ccbb..b72838bae64 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1480,8 +1480,9 @@ cleanup: static int __init calgary_parse_options(char *p) { unsigned int bridge; + unsigned long val; size_t len; - char* endp; + ssize_t ret; while (*p) { if (!strncmp(p, "64k", 3)) @@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p) ++p; if (*p == '\0') break; - bridge = simple_strtoul(p, &endp, 0); - if (p == endp) + ret = kstrtoul(p, 0, &val); + if (ret) break; + bridge = val; if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB %#x\n", bridge); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 856d5bcae5b..f2e8c0f951d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -27,6 +27,15 @@ #include <asm/debugreg.h> #include <asm/nmi.h> +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data..cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; + #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -67,10 +76,9 @@ void free_thread_xstate(struct task_struct *tsk) fpu_free(&tsk->thread.fpu); } -void free_thread_info(struct thread_info *ti) +void arch_release_task_struct(struct task_struct *tsk) { - free_thread_xstate(ti->task); - free_pages((unsigned long)ti, THREAD_ORDER); + free_thread_xstate(tsk); } void arch_task_cache_init(void) @@ -371,7 +379,7 @@ static inline void play_dead(void) #ifdef CONFIG_X86_64 void enter_idle(void) { - percpu_write(is_idle, 1); + this_cpu_write(is_idle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } @@ -510,26 +518,6 @@ void stop_this_cpu(void *dummy) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. - */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { @@ -588,9 +576,17 @@ int mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + /* Use mwait if idle=mwait boot option is given */ if (boot_option_idle_override == IDLE_FORCE_MWAIT) return 1; + /* + * Any idle= boot option other than idle=mwait means that we must not + * use mwait. Eg: idle=halt or idle=poll or idle=nomwait + */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return 0; + if (c->cpuid_level < MWAIT_INFO) return 0; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ae6847303e2..01d8d40ccaf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -302,7 +302,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(next_p, fpu); - percpu_write(current_task, next_p); + this_cpu_write(current_task, next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 43d8b48b23e..28e810255a0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -237,7 +237,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - percpu_write(old_rsp, new_sp); + this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; @@ -359,11 +359,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = percpu_read(old_rsp); - percpu_write(old_rsp, next->usersp); - percpu_write(current_task, next_p); + prev->usersp = this_cpu_read(old_rsp); + this_cpu_write(old_rsp, next->usersp); + this_cpu_write(current_task, next_p); - percpu_write(kernel_stack, + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 685845cf16e..13b1990c7c5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs) regs->flags |= X86_EFLAGS_TF; /* do the secure computing check first */ - secure_computing(regs->orig_ax); + if (secure_computing(regs->orig_ax)) { + /* seccomp failures shouldn't expose any additional code. */ + ret = -1L; + goto out; + } if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) ret = -1L; @@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs) regs->dx, regs->r10); #endif +out: return ret ?: regs->orig_ax; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a290156205..9b4204e0666 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -393,10 +393,9 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); - printk(KERN_ERR "initrd too large to handle, " - "disabling initrd\n"); - return; + panic("initrd too large to handle, " + "disabling initrd (%lld needed, %lld available)\n", + ramdisk_size, end_of_lowmem>>1); } printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, @@ -1012,7 +1011,8 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - ioapic_and_gsi_init(); + if (x86_io_apic_ops.init) + x86_io_apic_ops.init(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 71f4727da37..5a98aa27218 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void) #endif rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_PAGE) { - const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + size_t atom_size; + /* + * On 64bit, use PMD_SIZE for atom_size so that embedded + * percpu areas are aligned to PMD. This, in the future, + * can also allow using PMD mappings in vmalloc area. Use + * PAGE_SIZE on 32bit as vmalloc space is highly contended + * and large vmalloc area allocs can easily fail. + */ +#ifdef CONFIG_X86_64 + atom_size = PMD_SIZE; +#else + atom_size = PAGE_SIZE; +#endif rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6e1e406038c..433529e29be 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -76,20 +76,8 @@ /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; -/* Store all idle threads, this can be reused instead of creating -* a new thread. Also avoids complicated thread destroy functionality -* for idle threads. -*/ #ifdef CONFIG_HOTPLUG_CPU /* - * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is - * removed after init for !CONFIG_HOTPLUG_CPU. - */ -static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); -#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) -#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) - -/* * We need this for trampoline_base protection from concurrent accesses when * off- and onlining cores wildly. */ @@ -97,20 +85,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); void cpu_hotplug_driver_lock(void) { - mutex_lock(&x86_cpu_hotplug_driver_mutex); + mutex_lock(&x86_cpu_hotplug_driver_mutex); } void cpu_hotplug_driver_unlock(void) { - mutex_unlock(&x86_cpu_hotplug_driver_mutex); + mutex_unlock(&x86_cpu_hotplug_driver_mutex); } ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } -#else -static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; -#define get_idle_for_cpu(x) (idle_thread_array[(x)]) -#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif /* Number of siblings per CPU package */ @@ -315,59 +299,90 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +static bool __cpuinit +topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " + "[node: %d != %d]. Ignoring dependency.\n", + cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); } +#define link_mask(_m, c1, c2) \ +do { \ + cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ + cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ +} while (0) + +static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && + c->compute_unit_id == o->compute_unit_id) + return topology_sane(c, o, "smt"); + + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + return topology_sane(c, o, "smt"); + } + + return false; +} + +static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) + return topology_sane(c, o, "llc"); + + return false; +} + +static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return topology_sane(c, o, "mc"); + + return false; +} void __cpuinit set_cpu_sibling_map(int cpu) { - int i; + bool has_mc = boot_cpu_data.x86_max_cores > 1; + bool has_smt = smp_num_siblings > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cpuinfo_x86 *o; + int i; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); - if (smp_num_siblings > 1) { - for_each_cpu(i, cpu_sibling_setup_mask) { - struct cpuinfo_x86 *o = &cpu_data(i); - - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { - if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && - c->compute_unit_id == o->compute_unit_id) - link_thread_siblings(cpu, i); - } else if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - link_thread_siblings(cpu, i); - } - } - } else { + if (!has_smt && !has_mc) { cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); - } - - cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); - - if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { - cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); c->booted_cores = 1; return; } for_each_cpu(i, cpu_sibling_setup_mask) { - if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); - cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); - } - if (c->phys_proc_id == cpu_data(i).phys_proc_id) { - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); + o = &cpu_data(i); + + if ((i == cpu) || (has_smt && match_smt(c, o))) + link_mask(sibling, cpu, i); + + if ((i == cpu) || (has_mc && match_llc(c, o))) + link_mask(llc_shared, cpu, i); + + if ((i == cpu) || (has_mc && match_mc(c, o))) { + link_mask(core, cpu, i); + /* * Does this new cpu bringup a new core? */ @@ -398,8 +413,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) * For perf, we return last level cache shared map. * And for power savings, we return cpu_core_map */ - if ((sched_mc_power_savings || sched_smt_power_savings) && - !(cpu_has(c, X86_FEATURE_AMD_DCM))) + if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else return cpu_llc_shared_mask(cpu); @@ -618,22 +632,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -struct create_idle { - struct work_struct work; - struct task_struct *idle; - struct completion done; - int cpu; -}; - -static void __cpuinit do_fork_idle(struct work_struct *work) -{ - struct create_idle *c_idle = - container_of(work, struct create_idle, work); - - c_idle->idle = fork_idle(c_idle->cpu); - complete(&c_idle->done); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void __cpuinit announce_cpu(int cpu, int apicid) { @@ -660,58 +658,31 @@ static void __cpuinit announce_cpu(int cpu, int apicid) * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int __cpuinit do_boot_cpu(int apicid, int cpu) +static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { unsigned long boot_error = 0; unsigned long start_ip; int timeout; - struct create_idle c_idle = { - .cpu = cpu, - .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), - }; - - INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); - c_idle.idle = get_idle_for_cpu(cpu); - - /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. - */ - if (c_idle.idle) { - c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); - init_idle(c_idle.idle, cpu); - goto do_rest; - } - - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); + idle->thread.sp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + task_stack_page(idle))) - 1); + per_cpu(current_task, cpu) = idle; - if (IS_ERR(c_idle.idle)) { - printk("failed fork for CPU %d\n", cpu); - destroy_work_on_stack(&c_idle.work); - return PTR_ERR(c_idle.idle); - } - - set_idle_for_cpu(cpu, c_idle.idle); -do_rest: - per_cpu(current_task, cpu) = c_idle.idle; #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); #else - clear_tsk_thread_flag(c_idle.idle, TIF_FORK); + clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(c_idle.idle) - + (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = c_idle.idle->thread.sp; + stack_start = idle->thread.sp; /* start_ip had better be page-aligned! */ start_ip = trampoline_address(); @@ -813,12 +784,10 @@ do_rest: */ smpboot_restore_warm_reset_vector(); } - - destroy_work_on_stack(&c_idle.work); return boot_error; } -int __cpuinit native_cpu_up(unsigned int cpu) +int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); unsigned long flags; @@ -851,7 +820,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - err = do_boot_cpu(apicid, cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff9281f1602..92d5756d85f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,6 +50,7 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <linux/atomic.h> +#include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> @@ -303,8 +304,13 @@ gp_in_kernel: } /* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_DYNAMIC_FTRACE + /* ftrace must be first, everything else may cause a recursive crash */ + if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + return; +#endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9cf71d0b2d3..35c5e543f55 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -18,6 +18,7 @@ #include <asm/e820.h> #include <asm/time.h> #include <asm/irq.h> +#include <asm/io_apic.h> #include <asm/pat.h> #include <asm/tsc.h> #include <asm/iommu.h> @@ -119,3 +120,10 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; + +struct x86_io_apic_ops x86_io_apic_ops = { + .init = native_io_apic_init_mappings, + .read = native_io_apic_read, + .write = native_io_apic_write, + .modify = native_io_apic_modify, +}; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91a5e989abc..185a2b823a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6581,6 +6581,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, &fault); } vcpu->arch.apf.halted = false; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index d6ae30bbd7b..2e4e4b02c37 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -44,13 +44,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) } EXPORT_SYMBOL_GPL(copy_from_user_nmi); -static inline unsigned long count_bytes(unsigned long mask) -{ - mask = (mask - 1) & ~mask; - mask >>= 7; - return count_masked_bytes(mask); -} - /* * Do a strncpy, return length of string without final '\0'. * 'count' is the user-supplied count (return 'count' if we @@ -69,16 +62,19 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long max = count; while (max >= sizeof(unsigned long)) { - unsigned long c; + unsigned long c, mask; /* Fall back to byte-at-a-time if we get a page fault */ if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) break; - /* This can write a few bytes past the NUL character, but that's ok */ + mask = has_zero(c); + if (mask) { + mask = (mask - 1) & ~mask; + mask >>= 7; + *(unsigned long *)(dst+res) = c & mask; + return res + count_masked_bytes(mask); + } *(unsigned long *)(dst+res) = c; - c = has_zero(c); - if (c) - return res + count_bytes(c); res += sizeof(unsigned long); max -= sizeof(unsigned long); } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 53489ff6bf8..871dd886817 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } else { unsigned long n; - n = simple_strtoul(emu_cmdline, NULL, 0); + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); } + if (*emu_cmdline == ':') + emu_cmdline++; if (ret < 0) goto no_emu; @@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) int physj = emu_nid_to_phys[j]; int dist; - if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + if (get_option(&emu_cmdline, &dist) == 2) + ; + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) dist = physi == physj ? LOCAL_DISTANCE : REMOTE_DISTANCE; else diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index d6c0418c3e4..3804471db10 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -61,10 +61,10 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); */ void leave_mm(int cpu) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); cpumask_clear_cpu(cpu, - mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); + mm_cpumask(this_cpu_read(cpu_tlbstate.active_mm))); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -152,8 +152,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) { + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -322,7 +322,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) static void do_flush_tlb_all(void *info) { __flush_tlb_all(); - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ed2835e148b..fc09c2754e0 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -9,11 +9,11 @@ struct pci_root_info { struct acpi_device *bridge; - char *name; + char name[16]; unsigned int res_num; struct resource *res; - struct list_head *resources; int busnum; + struct pci_sysdata sd; }; static bool pci_use_crs = true; @@ -245,13 +245,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static bool resource_contains(struct resource *res, resource_size_t point) -{ - if (res->start <= point && point <= res->end) - return true; - return false; -} - static void coalesce_windows(struct pci_root_info *info, unsigned long type) { int i, j; @@ -272,10 +265,7 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type) * our resources no longer match the ACPI _CRS, but * the kernel resource tree doesn't allow overlaps. */ - if (resource_contains(res1, res2->start) || - resource_contains(res1, res2->end) || - resource_contains(res2, res1->start) || - resource_contains(res2, res1->end)) { + if (resource_overlaps(res1, res2)) { res1->start = min(res1->start, res2->start); res1->end = max(res1->end, res2->end); dev_info(&info->bridge->dev, @@ -287,7 +277,8 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type) } } -static void add_resources(struct pci_root_info *info) +static void add_resources(struct pci_root_info *info, + struct list_head *resources) { int i; struct resource *res, *root, *conflict; @@ -311,53 +302,74 @@ static void add_resources(struct pci_root_info *info) "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); else - pci_add_resource(info->resources, res); + pci_add_resource(resources, res); } } +static void free_pci_root_info_res(struct pci_root_info *info) +{ + kfree(info->res); + info->res = NULL; + info->res_num = 0; +} + +static void __release_pci_root_info(struct pci_root_info *info) +{ + int i; + struct resource *res; + + for (i = 0; i < info->res_num; i++) { + res = &info->res[i]; + + if (!res->parent) + continue; + + if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) + continue; + + release_resource(res); + } + + free_pci_root_info_res(info); + + kfree(info); +} +static void release_pci_root_info(struct pci_host_bridge *bridge) +{ + struct pci_root_info *info = bridge->release_data; + + __release_pci_root_info(info); +} + static void -get_current_resources(struct acpi_device *device, int busnum, - int domain, struct list_head *resources) +probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, + int busnum, int domain) { - struct pci_root_info info; size_t size; - info.bridge = device; - info.res_num = 0; - info.resources = resources; + info->bridge = device; + info->res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, - &info); - if (!info.res_num) + info); + if (!info->res_num) return; - size = sizeof(*info.res) * info.res_num; - info.res = kmalloc(size, GFP_KERNEL); - if (!info.res) + size = sizeof(*info->res) * info->res_num; + info->res_num = 0; + info->res = kmalloc(size, GFP_KERNEL); + if (!info->res) return; - info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); - if (!info.name) - goto name_alloc_fail; + sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); - info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, - &info); - - if (pci_use_crs) { - add_resources(&info); - - return; - } - - kfree(info.name); - -name_alloc_fail: - kfree(info.res); + info); } struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) { struct acpi_device *device = root->device; + struct pci_root_info *info = NULL; int domain = root->segment; int busnum = root->secondary.start; LIST_HEAD(resources); @@ -389,17 +401,14 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) if (node != -1 && !node_online(node)) node = -1; - /* Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. - */ - sd = kzalloc(sizeof(*sd), GFP_KERNEL); - if (!sd) { + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { printk(KERN_WARNING "pci_bus %04x:%02x: " "ignored (out of memory)\n", domain, busnum); return NULL; } + sd = &info->sd; sd->domain = domain; sd->node = node; /* @@ -413,22 +422,32 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) * be replaced by sd. */ memcpy(bus->sysdata, sd, sizeof(*sd)); - kfree(sd); + kfree(info); } else { - get_current_resources(device, busnum, domain, &resources); + probe_pci_root_info(info, device, busnum, domain); /* * _CRS with no apertures is normal, so only fall back to * defaults or native bridge info if we're ignoring _CRS. */ - if (!pci_use_crs) + if (pci_use_crs) + add_resources(info, &resources); + else { + free_pci_root_info_res(info); x86_pci_root_bus_resources(busnum, &resources); + } + bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); - if (bus) + if (bus) { bus->subordinate = pci_scan_child_bus(bus); - else + pci_set_host_bridge_release( + to_pci_host_bridge(bus->bridge), + release_pci_root_info, info); + } else { pci_free_resource_list(&resources); + __release_pci_root_info(info); + } } /* After the PCI-E bus has been walked and all devices discovered, @@ -445,9 +464,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) } } - if (!bus) - kfree(sd); - if (bus && node != -1) { #ifdef CONFIG_ACPI_NUMA if (pxm >= 0) diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 0567df3890e..5aed49bff05 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -32,6 +32,27 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = { #define RANGE_NUM 16 +static struct pci_root_info __init *find_pci_root_info(int node, int link) +{ + struct pci_root_info *info; + + /* find the position */ + list_for_each_entry(info, &pci_root_infos, list) + if (info->node == node && info->link == link) + return info; + + return NULL; +} + +static void __init set_mp_bus_range_to_node(int min_bus, int max_bus, int node) +{ +#ifdef CONFIG_NUMA + int j; + + for (j = min_bus; j <= max_bus; j++) + set_mp_bus_to_node(j, node); +#endif +} /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus @@ -41,7 +62,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = { static int __init early_fill_mp_bus_info(void) { int i; - int j; unsigned bus; unsigned slot; int node; @@ -50,7 +70,6 @@ static int __init early_fill_mp_bus_info(void) int def_link; struct pci_root_info *info; u32 reg; - struct resource *res; u64 start; u64 end; struct range range[RANGE_NUM]; @@ -86,7 +105,6 @@ static int __init early_fill_mp_bus_info(void) if (!found) return 0; - pci_root_num = 0; for (i = 0; i < 4; i++) { int min_bus; int max_bus; @@ -99,19 +117,11 @@ static int __init early_fill_mp_bus_info(void) min_bus = (reg >> 16) & 0xff; max_bus = (reg >> 24) & 0xff; node = (reg >> 4) & 0x07; -#ifdef CONFIG_NUMA - for (j = min_bus; j <= max_bus; j++) - set_mp_bus_to_node(j, node); -#endif + set_mp_bus_range_to_node(min_bus, max_bus, node); link = (reg >> 8) & 0x03; - info = &pci_root_info[pci_root_num]; - info->bus_min = min_bus; - info->bus_max = max_bus; - info->node = node; - info->link = link; + info = alloc_pci_root_info(min_bus, max_bus, node, link); sprintf(info->name, "PCI Bus #%02x", min_bus); - pci_root_num++; } /* get the default node and link for left over res */ @@ -134,16 +144,10 @@ static int __init early_fill_mp_bus_info(void) link = (reg >> 4) & 0x03; end = (reg & 0xfff000) | 0xfff; - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) + info = find_pci_root_info(node, link); + if (!info) continue; /* not found */ - info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", node, link, start, end); @@ -155,13 +159,8 @@ static int __init early_fill_mp_bus_info(void) } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; + info = find_pci_root_info(def_node, def_link); + if (info) { for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; @@ -214,16 +213,10 @@ static int __init early_fill_mp_bus_info(void) end <<= 8; end |= 0xffff; - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) - continue; /* not found */ + info = find_pci_root_info(node, link); - info = &pci_root_info[j]; + if (!info) + continue; printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", node, link, start, end); @@ -291,14 +284,8 @@ static int __init early_fill_mp_bus_info(void) * add left over mmio range to def node/link ? * that is tricky, just record range in from start_min to 4G */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; - + info = find_pci_root_info(def_node, def_link); + if (info) { for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; @@ -309,20 +296,16 @@ static int __init early_fill_mp_bus_info(void) } } - for (i = 0; i < pci_root_num; i++) { - int res_num; + list_for_each_entry(info, &pci_root_infos, list) { int busnum; + struct pci_root_res *root_res; - info = &pci_root_info[i]; - res_num = info->res_num; busnum = info->bus_min; printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", info->bus_min, info->bus_max, info->node, info->link); - for (j = 0; j < res_num; j++) { - res = &info->res[j]; - printk(KERN_DEBUG "bus: %02x index %x %pR\n", - busnum, j, res); - } + list_for_each_entry(root_res, &info->resources, list) + printk(KERN_DEBUG "bus: %02x %pR\n", + busnum, &root_res->res); } return 0; diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index f3a7c569a40..614392ced7d 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -22,19 +22,15 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) { struct pci_root_info *info; + struct pci_root_res *root_res; struct resource res; u16 word1, word2; u8 fbus, lbus; - int i; - - info = &pci_root_info[pci_root_num]; - pci_root_num++; /* read the PCI bus numbers */ fbus = read_pci_config_byte(bus, slot, func, 0x44); lbus = read_pci_config_byte(bus, slot, func, 0x45); - info->bus_min = fbus; - info->bus_max = lbus; + info = alloc_pci_root_info(fbus, lbus, 0, 0); /* * Add the legacy IDE ports on bus 0 @@ -86,8 +82,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) res.flags = IORESOURCE_BUS; printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res); - for (i = 0; i < info->res_num; i++) - printk(KERN_INFO "host bridge window %pR\n", &info->res[i]); + list_for_each_entry(root_res, &info->resources, list) + printk(KERN_INFO "host bridge window %pR\n", &root_res->res); } static int __init broadcom_postcore_init(void) diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index fd3f65510e9..306579f7d0f 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -4,35 +4,38 @@ #include "bus_numa.h" -int pci_root_num; -struct pci_root_info pci_root_info[PCI_ROOT_NR]; +LIST_HEAD(pci_root_infos); -void x86_pci_root_bus_resources(int bus, struct list_head *resources) +static struct pci_root_info *x86_find_pci_root_info(int bus) { - int i; - int j; struct pci_root_info *info; - if (!pci_root_num) - goto default_resources; + if (list_empty(&pci_root_infos)) + return NULL; - for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == bus) - break; - } + list_for_each_entry(info, &pci_root_infos, list) + if (info->bus_min == bus) + return info; + + return NULL; +} - if (i == pci_root_num) +void x86_pci_root_bus_resources(int bus, struct list_head *resources) +{ + struct pci_root_info *info = x86_find_pci_root_info(bus); + struct pci_root_res *root_res; + + if (!info) goto default_resources; printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", bus); - info = &pci_root_info[i]; - for (j = 0; j < info->res_num; j++) { + list_for_each_entry(root_res, &info->resources, list) { struct resource *res; struct resource *root; - res = &info->res[j]; + res = &root_res->res; pci_add_resource(resources, res); if (res->flags & IORESOURCE_IO) root = &ioport_resource; @@ -53,11 +56,32 @@ default_resources: pci_add_resource(resources, &iomem_resource); } +struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max, + int node, int link) +{ + struct pci_root_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return info; + + INIT_LIST_HEAD(&info->resources); + info->bus_min = bus_min; + info->bus_max = bus_max; + info->node = node; + info->link = link; + + list_add_tail(&info->list, &pci_root_infos); + + return info; +} + void __devinit update_res(struct pci_root_info *info, resource_size_t start, resource_size_t end, unsigned long flags, int merge) { - int i; struct resource *res; + struct pci_root_res *root_res; if (start > end) return; @@ -69,11 +93,11 @@ void __devinit update_res(struct pci_root_info *info, resource_size_t start, goto addit; /* try to merge it with old one */ - for (i = 0; i < info->res_num; i++) { + list_for_each_entry(root_res, &info->resources, list) { resource_size_t final_start, final_end; resource_size_t common_start, common_end; - res = &info->res[i]; + res = &root_res->res; if (res->flags != flags) continue; @@ -93,14 +117,15 @@ void __devinit update_res(struct pci_root_info *info, resource_size_t start, addit: /* need to add that */ - if (info->res_num >= RES_NUM) + root_res = kzalloc(sizeof(*root_res), GFP_KERNEL); + if (!root_res) return; - res = &info->res[info->res_num]; + res = &root_res->res; res->name = info->name; res->flags = flags; res->start = start; res->end = end; - res->child = NULL; - info->res_num++; + + list_add_tail(&root_res->list, &info->resources); } diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index 804a4b40c31..226a466b2b2 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -4,22 +4,24 @@ * sub bus (transparent) will use entres from 3 to store extra from * root, so need to make sure we have enough slot there. */ -#define RES_NUM 16 +struct pci_root_res { + struct list_head list; + struct resource res; +}; + struct pci_root_info { + struct list_head list; char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; + struct list_head resources; int bus_min; int bus_max; int node; int link; }; -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -extern int pci_root_num; -extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; - +extern struct list_head pci_root_infos; +struct pci_root_info *alloc_pci_root_info(int bus_min, int bus_max, + int node, int link); extern void update_res(struct pci_root_info *info, resource_size_t start, resource_size_t end, unsigned long flags, int merge); #endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 323481e06ef..0ad990a20d4 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -11,6 +11,7 @@ #include <linux/dmi.h> #include <linux/slab.h> +#include <asm-generic/pci-bridge.h> #include <asm/acpi.h> #include <asm/segment.h> #include <asm/io.h> @@ -229,6 +230,14 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d) } #endif +static int __devinit set_scan_all(const struct dmi_system_id *d) +{ + printk(KERN_INFO "PCI: %s detected, enabling pci=pcie_scan_all\n", + d->ident); + pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS); + return 0; +} + static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { #ifdef __i386__ /* @@ -420,6 +429,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), }, }, + { + .callback = set_scan_all, + .ident = "Stratus/NEC ftServer", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ftServer"), + }, + }, {} }; @@ -430,9 +446,7 @@ void __init dmi_check_pciprobe(void) struct pci_bus * __devinit pcibios_scan_root(int busnum) { - LIST_HEAD(resources); struct pci_bus *bus = NULL; - struct pci_sysdata *sd; while ((bus = pci_find_next_bus(bus)) != NULL) { if (bus->number == busnum) { @@ -441,28 +455,10 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) } } - /* Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. - */ - sd = kzalloc(sizeof(*sd), GFP_KERNEL); - if (!sd) { - printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); - return NULL; - } - - sd->node = get_mp_bus_to_node(busnum); - - printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - x86_pci_root_bus_resources(busnum, &resources); - bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); - if (!bus) { - pci_free_resource_list(&resources); - kfree(sd); - } - - return bus; + return pci_scan_bus_on_node(busnum, &pci_root_ops, + get_mp_bus_to_node(busnum)); } + void __init pcibios_set_cache_line_size(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -656,6 +652,7 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, } sd->node = node; x86_pci_root_bus_resources(busno, &resources); + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busno); bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources); if (!bus) { pci_free_resource_list(&resources); diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 831971e731f..dd8ca6f7223 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -57,7 +57,7 @@ static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev) { struct pcibios_fwaddrmap *map; - WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock)); + WARN_ON_SMP(!spin_is_locked(&pcibios_fwaddrmap_lock)); list_for_each_entry(map, &pcibios_fwaddrmappings, list) if (map->dev == dev) diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index c7abf13a213..94d8a39332e 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -445,7 +445,7 @@ static void ack_cobalt_irq(struct irq_data *data) spin_lock_irqsave(&cobalt_lock, flags); disable_cobalt_irq(data); - apic_write(APIC_EOI, APIC_EIO_ACK); + apic_write(APIC_EOI, APIC_EOI_ACK); spin_unlock_irqrestore(&cobalt_lock, flags); } diff --git a/arch/x86/tools/.gitignore b/arch/x86/tools/.gitignore new file mode 100644 index 00000000000..be0ed065249 --- /dev/null +++ b/arch/x86/tools/.gitignore @@ -0,0 +1 @@ +relocs diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index d511aa97533..733057b435b 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -36,3 +36,7 @@ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include +hostprogs-y += relocs +relocs: $(obj)/relocs diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/tools/relocs.c index fb7117a4ade..b43cfcd9bf4 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/tools/relocs.c @@ -18,6 +18,8 @@ static void die(char *fmt, ...); static Elf32_Ehdr ehdr; static unsigned long reloc_count, reloc_idx; static unsigned long *relocs; +static unsigned long reloc16_count, reloc16_idx; +static unsigned long *relocs16; struct section { Elf32_Shdr shdr; @@ -28,52 +30,86 @@ struct section { }; static struct section *secs; +enum symtype { + S_ABS, + S_REL, + S_SEG, + S_LIN, + S_NSYMTYPES +}; + +static const char * const sym_regex_kernel[S_NSYMTYPES] = { /* * Following symbols have been audited. There values are constant and do * not change if bzImage is loaded at a different physical address than * the address for which it has been compiled. Don't warn user about * absolute relocations present w.r.t these symbols. */ -static const char abs_sym_regex[] = + [S_ABS] = "^(xen_irq_disable_direct_reloc$|" "xen_save_fl_direct_reloc$|" "VDSO|" - "__crc_)"; -static regex_t abs_sym_regex_c; -static int is_abs_reloc(const char *sym_name) -{ - return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0); -} + "__crc_)", /* * These symbols are known to be relative, even if the linker marks them * as absolute (typically defined outside any section in the linker script.) */ -static const char rel_sym_regex[] = - "^_end$"; -static regex_t rel_sym_regex_c; -static int is_rel_reloc(const char *sym_name) + [S_REL] = + "^(__init_(begin|end)|" + "__x86_cpu_dev_(start|end)|" + "(__parainstructions|__alt_instructions)(|_end)|" + "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" + "_end)$" +}; + + +static const char * const sym_regex_realmode[S_NSYMTYPES] = { +/* + * These are 16-bit segment symbols when compiling 16-bit code. + */ + [S_SEG] = + "^real_mode_seg$", + +/* + * These are offsets belonging to segments, as opposed to linear addresses, + * when compiling 16-bit code. + */ + [S_LIN] = + "^pa_", +}; + +static const char * const *sym_regex; + +static regex_t sym_regex_c[S_NSYMTYPES]; +static int is_reloc(enum symtype type, const char *sym_name) { - return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0); + return sym_regex[type] && + !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); } -static void regex_init(void) +static void regex_init(int use_real_mode) { char errbuf[128]; int err; - - err = regcomp(&abs_sym_regex_c, abs_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); - } + int i; + + if (use_real_mode) + sym_regex = sym_regex_realmode; + else + sym_regex = sym_regex_kernel; - err = regcomp(&rel_sym_regex_c, rel_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); + for (i = 0; i < S_NSYMTYPES; i++) { + if (!sym_regex[i]) + continue; + + err = regcomp(&sym_regex_c[i], sym_regex[i], + REG_EXTENDED|REG_NOSUB); + + if (err) { + regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf); + die("%s", errbuf); + } } } @@ -154,6 +190,10 @@ static const char *rel_type(unsigned type) REL_TYPE(R_386_RELATIVE), REL_TYPE(R_386_GOTOFF), REL_TYPE(R_386_GOTPC), + REL_TYPE(R_386_8), + REL_TYPE(R_386_PC8), + REL_TYPE(R_386_16), + REL_TYPE(R_386_PC16), #undef REL_TYPE }; const char *name = "unknown type rel type name"; @@ -189,7 +229,7 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) name = sym_strtab + sym->st_name; } else { - name = sec_name(secs[sym->st_shndx].shdr.sh_name); + name = sec_name(sym->st_shndx); } return name; } @@ -472,7 +512,7 @@ static void print_absolute_relocs(void) * Before warning check if this absolute symbol * relocation is harmless. */ - if (is_abs_reloc(name) || is_rel_reloc(name)) + if (is_reloc(S_ABS, name) || is_reloc(S_REL, name)) continue; if (!printed) { @@ -496,7 +536,8 @@ static void print_absolute_relocs(void) printf("\n"); } -static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) +static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), + int use_real_mode) { int i; /* Walk through the relocations */ @@ -521,30 +562,67 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) Elf32_Rel *rel; Elf32_Sym *sym; unsigned r_type; + const char *symname; + int shn_abs; + rel = &sec->reltab[j]; sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); - /* Don't visit relocations to absolute symbols */ - if (sym->st_shndx == SHN_ABS && - !is_rel_reloc(sym_name(sym_strtab, sym))) { - continue; - } + + shn_abs = sym->st_shndx == SHN_ABS; + switch (r_type) { case R_386_NONE: case R_386_PC32: + case R_386_PC16: + case R_386_PC8: /* * NONE can be ignored and and PC relative * relocations don't need to be adjusted. */ break; + + case R_386_16: + symname = sym_name(sym_strtab, sym); + if (!use_real_mode) + goto bad; + if (shn_abs) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_SEG, symname)) + goto bad; + } else { + if (is_reloc(S_LIN, symname)) + goto bad; + else + break; + } + visit(rel, sym); + break; + case R_386_32: - /* Visit relocations that need to be adjusted */ + symname = sym_name(sym_strtab, sym); + if (shn_abs) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_REL, symname)) + goto bad; + } else { + if (use_real_mode && + !is_reloc(S_LIN, symname)) + break; + } visit(rel, sym); break; default: die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type); break; + bad: + symname = sym_name(sym_strtab, sym); + die("Invalid %s %s relocation: %s\n", + shn_abs ? "absolute" : "relative", + rel_type(r_type), symname); } } } @@ -552,13 +630,19 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) { - reloc_count += 1; + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + reloc16_count++; + else + reloc_count++; } static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) { /* Remember the address that needs to be adjusted. */ - relocs[reloc_idx++] = rel->r_offset; + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + relocs16[reloc16_idx++] = rel->r_offset; + else + relocs[reloc_idx++] = rel->r_offset; } static int cmp_relocs(const void *va, const void *vb) @@ -568,23 +652,41 @@ static int cmp_relocs(const void *va, const void *vb) return (*a == *b)? 0 : (*a > *b)? 1 : -1; } -static void emit_relocs(int as_text) +static int write32(unsigned int v, FILE *f) +{ + unsigned char buf[4]; + + put_unaligned_le32(v, buf); + return fwrite(buf, 1, 4, f) == 4 ? 0 : -1; +} + +static void emit_relocs(int as_text, int use_real_mode) { int i; /* Count how many relocations I have and allocate space for them. */ reloc_count = 0; - walk_relocs(count_reloc); + walk_relocs(count_reloc, use_real_mode); relocs = malloc(reloc_count * sizeof(relocs[0])); if (!relocs) { die("malloc of %d entries for relocs failed\n", reloc_count); } + + relocs16 = malloc(reloc16_count * sizeof(relocs[0])); + if (!relocs16) { + die("malloc of %d entries for relocs16 failed\n", + reloc16_count); + } /* Collect up the relocations */ reloc_idx = 0; - walk_relocs(collect_reloc); + walk_relocs(collect_reloc, use_real_mode); + + if (reloc16_count && !use_real_mode) + die("Segment relocations found but --realmode not specified\n"); /* Order the relocations for more efficient processing */ qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); + qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs); /* Print the relocations */ if (as_text) { @@ -593,58 +695,83 @@ static void emit_relocs(int as_text) */ printf(".section \".data.reloc\",\"a\"\n"); printf(".balign 4\n"); - for (i = 0; i < reloc_count; i++) { - printf("\t .long 0x%08lx\n", relocs[i]); + if (use_real_mode) { + printf("\t.long %lu\n", reloc16_count); + for (i = 0; i < reloc16_count; i++) + printf("\t.long 0x%08lx\n", relocs16[i]); + printf("\t.long %lu\n", reloc_count); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } else { + /* Print a stop */ + printf("\t.long 0x%08lx\n", (unsigned long)0); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } } + printf("\n"); } else { - unsigned char buf[4]; - /* Print a stop */ - fwrite("\0\0\0\0", 4, 1, stdout); - /* Now print each relocation */ - for (i = 0; i < reloc_count; i++) { - put_unaligned_le32(relocs[i], buf); - fwrite(buf, 4, 1, stdout); + if (use_real_mode) { + write32(reloc16_count, stdout); + for (i = 0; i < reloc16_count; i++) + write32(relocs16[i], stdout); + write32(reloc_count, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) + write32(relocs[i], stdout); + } else { + /* Print a stop */ + write32(0, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) { + write32(relocs[i], stdout); + } } } } static void usage(void) { - die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n"); + die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n"); } int main(int argc, char **argv) { int show_absolute_syms, show_absolute_relocs; - int as_text; + int as_text, use_real_mode; const char *fname; FILE *fp; int i; - regex_init(); - show_absolute_syms = 0; show_absolute_relocs = 0; as_text = 0; + use_real_mode = 0; fname = NULL; for (i = 1; i < argc; i++) { char *arg = argv[i]; if (*arg == '-') { - if (strcmp(argv[1], "--abs-syms") == 0) { + if (strcmp(arg, "--abs-syms") == 0) { show_absolute_syms = 1; continue; } - - if (strcmp(argv[1], "--abs-relocs") == 0) { + if (strcmp(arg, "--abs-relocs") == 0) { show_absolute_relocs = 1; continue; } - else if (strcmp(argv[1], "--text") == 0) { + if (strcmp(arg, "--text") == 0) { as_text = 1; continue; } + if (strcmp(arg, "--realmode") == 0) { + use_real_mode = 1; + continue; + } } else if (!fname) { fname = arg; @@ -655,6 +782,7 @@ int main(int argc, char **argv) if (!fname) { usage(); } + regex_init(use_real_mode); fp = fopen(fname, "r"); if (!fp) { die("Cannot open %s: %s\n", @@ -673,6 +801,6 @@ int main(int argc, char **argv) print_absolute_relocs(); return 0; } - emit_relocs(as_text); + emit_relocs(as_text, use_real_mode); return 0; } diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index f3b0633b69a..0e07adc8cbe 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -34,25 +34,25 @@ #define ELF_ARCH EM_386 #define ELF_PLAT_INIT(regs, load_addr) do { \ - PT_REGS_EBX(regs) = 0; \ - PT_REGS_ECX(regs) = 0; \ - PT_REGS_EDX(regs) = 0; \ - PT_REGS_ESI(regs) = 0; \ - PT_REGS_EDI(regs) = 0; \ - PT_REGS_EBP(regs) = 0; \ - PT_REGS_EAX(regs) = 0; \ + PT_REGS_BX(regs) = 0; \ + PT_REGS_CX(regs) = 0; \ + PT_REGS_DX(regs) = 0; \ + PT_REGS_SI(regs) = 0; \ + PT_REGS_DI(regs) = 0; \ + PT_REGS_BP(regs) = 0; \ + PT_REGS_AX(regs) = 0; \ } while (0) /* Shamelessly stolen from include/asm-i386/elf.h */ #define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ - pr_reg[0] = PT_REGS_EBX(regs); \ - pr_reg[1] = PT_REGS_ECX(regs); \ - pr_reg[2] = PT_REGS_EDX(regs); \ - pr_reg[3] = PT_REGS_ESI(regs); \ - pr_reg[4] = PT_REGS_EDI(regs); \ - pr_reg[5] = PT_REGS_EBP(regs); \ - pr_reg[6] = PT_REGS_EAX(regs); \ + pr_reg[0] = PT_REGS_BX(regs); \ + pr_reg[1] = PT_REGS_CX(regs); \ + pr_reg[2] = PT_REGS_DX(regs); \ + pr_reg[3] = PT_REGS_SI(regs); \ + pr_reg[4] = PT_REGS_DI(regs); \ + pr_reg[5] = PT_REGS_BP(regs); \ + pr_reg[6] = PT_REGS_AX(regs); \ pr_reg[7] = PT_REGS_DS(regs); \ pr_reg[8] = PT_REGS_ES(regs); \ /* fake once used fs and gs selectors? */ \ @@ -130,13 +130,13 @@ do { \ #define ELF_ARCH EM_X86_64 #define ELF_PLAT_INIT(regs, load_addr) do { \ - PT_REGS_RBX(regs) = 0; \ - PT_REGS_RCX(regs) = 0; \ - PT_REGS_RDX(regs) = 0; \ - PT_REGS_RSI(regs) = 0; \ - PT_REGS_RDI(regs) = 0; \ - PT_REGS_RBP(regs) = 0; \ - PT_REGS_RAX(regs) = 0; \ + PT_REGS_BX(regs) = 0; \ + PT_REGS_CX(regs) = 0; \ + PT_REGS_DX(regs) = 0; \ + PT_REGS_SI(regs) = 0; \ + PT_REGS_DI(regs) = 0; \ + PT_REGS_BP(regs) = 0; \ + PT_REGS_AX(regs) = 0; \ PT_REGS_R8(regs) = 0; \ PT_REGS_R9(regs) = 0; \ PT_REGS_R10(regs) = 0; \ diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index c8aca8c501b..950dfb7b841 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -1,5 +1,39 @@ +#ifndef __UM_X86_PTRACE_H +#define __UM_X86_PTRACE_H + #ifdef CONFIG_X86_32 # include "ptrace_32.h" #else # include "ptrace_64.h" #endif + +#define PT_REGS_AX(r) UPT_AX(&(r)->regs) +#define PT_REGS_BX(r) UPT_BX(&(r)->regs) +#define PT_REGS_CX(r) UPT_CX(&(r)->regs) +#define PT_REGS_DX(r) UPT_DX(&(r)->regs) + +#define PT_REGS_SI(r) UPT_SI(&(r)->regs) +#define PT_REGS_DI(r) UPT_DI(&(r)->regs) +#define PT_REGS_BP(r) UPT_BP(&(r)->regs) +#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) + +#define PT_REGS_CS(r) UPT_CS(&(r)->regs) +#define PT_REGS_SS(r) UPT_SS(&(r)->regs) +#define PT_REGS_DS(r) UPT_DS(&(r)->regs) +#define PT_REGS_ES(r) UPT_ES(&(r)->regs) + +#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_AX(r) +#define PT_REGS_SYSCALL_RET(r) PT_REGS_AX(r) + +#define PT_FIX_EXEC_STACK(sp) do ; while(0) + +#define profile_pc(regs) PT_REGS_IP(regs) + +#define UPT_RESTART_SYSCALL(r) (UPT_IP(r) -= 2) +#define UPT_SET_SYSCALL_RETURN(r, res) (UPT_AX(r) = (res)) + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_AX(regs); +} +#endif /* __UM_X86_PTRACE_H */ diff --git a/arch/x86/um/asm/ptrace_32.h b/arch/x86/um/asm/ptrace_32.h index 5d2a5911253..2cf225351b6 100644 --- a/arch/x86/um/asm/ptrace_32.h +++ b/arch/x86/um/asm/ptrace_32.h @@ -11,29 +11,6 @@ #include "linux/compiler.h" #include "asm/ptrace-generic.h" -#define PT_REGS_EAX(r) UPT_EAX(&(r)->regs) -#define PT_REGS_EBX(r) UPT_EBX(&(r)->regs) -#define PT_REGS_ECX(r) UPT_ECX(&(r)->regs) -#define PT_REGS_EDX(r) UPT_EDX(&(r)->regs) -#define PT_REGS_ESI(r) UPT_ESI(&(r)->regs) -#define PT_REGS_EDI(r) UPT_EDI(&(r)->regs) -#define PT_REGS_EBP(r) UPT_EBP(&(r)->regs) - -#define PT_REGS_CS(r) UPT_CS(&(r)->regs) -#define PT_REGS_SS(r) UPT_SS(&(r)->regs) -#define PT_REGS_DS(r) UPT_DS(&(r)->regs) -#define PT_REGS_ES(r) UPT_ES(&(r)->regs) -#define PT_REGS_FS(r) UPT_FS(&(r)->regs) -#define PT_REGS_GS(r) UPT_GS(&(r)->regs) - -#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) - -#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_EAX(r) -#define PT_REGS_SYSCALL_RET(r) PT_REGS_EAX(r) -#define PT_FIX_EXEC_STACK(sp) do ; while(0) - -#define profile_pc(regs) PT_REGS_IP(regs) - #define user_mode(r) UPT_IS_USER(&(r)->regs) /* diff --git a/arch/x86/um/asm/ptrace_64.h b/arch/x86/um/asm/ptrace_64.h index 706a0d80545..ea7bff39432 100644 --- a/arch/x86/um/asm/ptrace_64.h +++ b/arch/x86/um/asm/ptrace_64.h @@ -15,13 +15,6 @@ #define HOST_AUDIT_ARCH AUDIT_ARCH_X86_64 -#define PT_REGS_RBX(r) UPT_RBX(&(r)->regs) -#define PT_REGS_RCX(r) UPT_RCX(&(r)->regs) -#define PT_REGS_RDX(r) UPT_RDX(&(r)->regs) -#define PT_REGS_RSI(r) UPT_RSI(&(r)->regs) -#define PT_REGS_RDI(r) UPT_RDI(&(r)->regs) -#define PT_REGS_RBP(r) UPT_RBP(&(r)->regs) -#define PT_REGS_RAX(r) UPT_RAX(&(r)->regs) #define PT_REGS_R8(r) UPT_R8(&(r)->regs) #define PT_REGS_R9(r) UPT_R9(&(r)->regs) #define PT_REGS_R10(r) UPT_R10(&(r)->regs) @@ -31,27 +24,8 @@ #define PT_REGS_R14(r) UPT_R14(&(r)->regs) #define PT_REGS_R15(r) UPT_R15(&(r)->regs) -#define PT_REGS_FS(r) UPT_FS(&(r)->regs) -#define PT_REGS_GS(r) UPT_GS(&(r)->regs) -#define PT_REGS_DS(r) UPT_DS(&(r)->regs) -#define PT_REGS_ES(r) UPT_ES(&(r)->regs) -#define PT_REGS_SS(r) UPT_SS(&(r)->regs) -#define PT_REGS_CS(r) UPT_CS(&(r)->regs) - -#define PT_REGS_ORIG_RAX(r) UPT_ORIG_RAX(&(r)->regs) -#define PT_REGS_RIP(r) UPT_IP(&(r)->regs) -#define PT_REGS_SP(r) UPT_SP(&(r)->regs) - -#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) - /* XXX */ #define user_mode(r) UPT_IS_USER(&(r)->regs) -#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_RAX(r) -#define PT_REGS_SYSCALL_RET(r) PT_REGS_RAX(r) - -#define PT_FIX_EXEC_STACK(sp) do ; while(0) - -#define profile_pc(regs) PT_REGS_IP(regs) struct user_desc; diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 2bbe1ec2d96..6ce2d76eb90 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -1,15 +1,74 @@ #ifndef __SYSDEP_X86_PTRACE_H #define __SYSDEP_X86_PTRACE_H +#include <generated/user_constants.h> +#include "sysdep/faultinfo.h" + +#define MAX_REG_OFFSET (UM_FRAME_SIZE) +#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long)) + +#define REGS_IP(r) ((r)[HOST_IP]) +#define REGS_SP(r) ((r)[HOST_SP]) +#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) +#define REGS_AX(r) ((r)[HOST_AX]) +#define REGS_BX(r) ((r)[HOST_BX]) +#define REGS_CX(r) ((r)[HOST_CX]) +#define REGS_DX(r) ((r)[HOST_DX]) +#define REGS_SI(r) ((r)[HOST_SI]) +#define REGS_DI(r) ((r)[HOST_DI]) +#define REGS_BP(r) ((r)[HOST_BP]) +#define REGS_CS(r) ((r)[HOST_CS]) +#define REGS_SS(r) ((r)[HOST_SS]) +#define REGS_DS(r) ((r)[HOST_DS]) +#define REGS_ES(r) ((r)[HOST_ES]) + +#define UPT_IP(r) REGS_IP((r)->gp) +#define UPT_SP(r) REGS_SP((r)->gp) +#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) +#define UPT_AX(r) REGS_AX((r)->gp) +#define UPT_BX(r) REGS_BX((r)->gp) +#define UPT_CX(r) REGS_CX((r)->gp) +#define UPT_DX(r) REGS_DX((r)->gp) +#define UPT_SI(r) REGS_SI((r)->gp) +#define UPT_DI(r) REGS_DI((r)->gp) +#define UPT_BP(r) REGS_BP((r)->gp) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_SS(r) REGS_SS((r)->gp) +#define UPT_DS(r) REGS_DS((r)->gp) +#define UPT_ES(r) REGS_ES((r)->gp) + #ifdef __i386__ #include "ptrace_32.h" #else #include "ptrace_64.h" #endif -static inline long regs_return_value(struct uml_pt_regs *regs) -{ - return UPT_SYSCALL_RET(regs); -} +struct syscall_args { + unsigned long args[6]; +}; + +#define SYSCALL_ARGS(r) ((struct syscall_args) \ + { .args = { UPT_SYSCALL_ARG1(r), \ + UPT_SYSCALL_ARG2(r), \ + UPT_SYSCALL_ARG3(r), \ + UPT_SYSCALL_ARG4(r), \ + UPT_SYSCALL_ARG5(r), \ + UPT_SYSCALL_ARG6(r) } } ) + +struct uml_pt_regs { + unsigned long gp[MAX_REG_NR]; + unsigned long fp[MAX_FP_NR]; + struct faultinfo faultinfo; + long syscall; + int is_user; +}; + +#define EMPTY_UML_PT_REGS { } + +#define UPT_SYSCALL_NR(r) ((r)->syscall) +#define UPT_FAULTINFO(r) (&(r)->faultinfo) +#define UPT_IS_USER(r) ((r)->is_user) + +extern int user_context(unsigned long sp); #endif /* __SYSDEP_X86_PTRACE_H */ diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h index befd1df32ed..b94a108de1d 100644 --- a/arch/x86/um/shared/sysdep/ptrace_32.h +++ b/arch/x86/um/shared/sysdep/ptrace_32.h @@ -6,11 +6,7 @@ #ifndef __SYSDEP_I386_PTRACE_H #define __SYSDEP_I386_PTRACE_H -#include <generated/user_constants.h> -#include "sysdep/faultinfo.h" - -#define MAX_REG_NR (UM_FRAME_SIZE / sizeof(unsigned long)) -#define MAX_REG_OFFSET (UM_FRAME_SIZE) +#define MAX_FP_NR HOST_FPX_SIZE static inline void update_debugregs(int seq) {} @@ -24,90 +20,16 @@ void set_using_sysemu(int value); int get_using_sysemu(void); extern int sysemu_supported; -#define REGS_IP(r) ((r)[HOST_IP]) -#define REGS_SP(r) ((r)[HOST_SP]) -#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) -#define REGS_EAX(r) ((r)[HOST_AX]) -#define REGS_EBX(r) ((r)[HOST_BX]) -#define REGS_ECX(r) ((r)[HOST_CX]) -#define REGS_EDX(r) ((r)[HOST_DX]) -#define REGS_ESI(r) ((r)[HOST_SI]) -#define REGS_EDI(r) ((r)[HOST_DI]) -#define REGS_EBP(r) ((r)[HOST_BP]) -#define REGS_CS(r) ((r)[HOST_CS]) -#define REGS_SS(r) ((r)[HOST_SS]) -#define REGS_DS(r) ((r)[HOST_DS]) -#define REGS_ES(r) ((r)[HOST_ES]) -#define REGS_FS(r) ((r)[HOST_FS]) -#define REGS_GS(r) ((r)[HOST_GS]) - -#define REGS_SET_SYSCALL_RETURN(r, res) REGS_EAX(r) = (res) - -#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) -#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) - #ifndef PTRACE_SYSEMU_SINGLESTEP #define PTRACE_SYSEMU_SINGLESTEP 32 #endif -struct uml_pt_regs { - unsigned long gp[MAX_REG_NR]; - unsigned long fp[HOST_FPX_SIZE]; - struct faultinfo faultinfo; - long syscall; - int is_user; -}; - -#define EMPTY_UML_PT_REGS { } - -#define UPT_IP(r) REGS_IP((r)->gp) -#define UPT_SP(r) REGS_SP((r)->gp) -#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) -#define UPT_EAX(r) REGS_EAX((r)->gp) -#define UPT_EBX(r) REGS_EBX((r)->gp) -#define UPT_ECX(r) REGS_ECX((r)->gp) -#define UPT_EDX(r) REGS_EDX((r)->gp) -#define UPT_ESI(r) REGS_ESI((r)->gp) -#define UPT_EDI(r) REGS_EDI((r)->gp) -#define UPT_EBP(r) REGS_EBP((r)->gp) -#define UPT_ORIG_EAX(r) ((r)->syscall) -#define UPT_CS(r) REGS_CS((r)->gp) -#define UPT_SS(r) REGS_SS((r)->gp) -#define UPT_DS(r) REGS_DS((r)->gp) -#define UPT_ES(r) REGS_ES((r)->gp) -#define UPT_FS(r) REGS_FS((r)->gp) -#define UPT_GS(r) REGS_GS((r)->gp) - -#define UPT_SYSCALL_ARG1(r) UPT_EBX(r) -#define UPT_SYSCALL_ARG2(r) UPT_ECX(r) -#define UPT_SYSCALL_ARG3(r) UPT_EDX(r) -#define UPT_SYSCALL_ARG4(r) UPT_ESI(r) -#define UPT_SYSCALL_ARG5(r) UPT_EDI(r) -#define UPT_SYSCALL_ARG6(r) UPT_EBP(r) - -extern int user_context(unsigned long sp); - -#define UPT_IS_USER(r) ((r)->is_user) - -struct syscall_args { - unsigned long args[6]; -}; - -#define SYSCALL_ARGS(r) ((struct syscall_args) \ - { .args = { UPT_SYSCALL_ARG1(r), \ - UPT_SYSCALL_ARG2(r), \ - UPT_SYSCALL_ARG3(r), \ - UPT_SYSCALL_ARG4(r), \ - UPT_SYSCALL_ARG5(r), \ - UPT_SYSCALL_ARG6(r) } } ) - -#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) - -#define UPT_ORIG_SYSCALL(r) UPT_EAX(r) -#define UPT_SYSCALL_NR(r) UPT_ORIG_EAX(r) -#define UPT_SYSCALL_RET(r) UPT_EAX(r) - -#define UPT_FAULTINFO(r) (&(r)->faultinfo) +#define UPT_SYSCALL_ARG1(r) UPT_BX(r) +#define UPT_SYSCALL_ARG2(r) UPT_CX(r) +#define UPT_SYSCALL_ARG3(r) UPT_DX(r) +#define UPT_SYSCALL_ARG4(r) UPT_SI(r) +#define UPT_SYSCALL_ARG5(r) UPT_DI(r) +#define UPT_SYSCALL_ARG6(r) UPT_BP(r) extern void arch_init_registers(int pid); diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h index 031edc53ac5..919789f1071 100644 --- a/arch/x86/um/shared/sysdep/ptrace_64.h +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -8,22 +8,8 @@ #ifndef __SYSDEP_X86_64_PTRACE_H #define __SYSDEP_X86_64_PTRACE_H -#include <generated/user_constants.h> -#include "sysdep/faultinfo.h" +#define MAX_FP_NR HOST_FP_SIZE -#define MAX_REG_OFFSET (UM_FRAME_SIZE) -#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long)) - -#define REGS_IP(r) ((r)[HOST_IP]) -#define REGS_SP(r) ((r)[HOST_SP]) - -#define REGS_RBX(r) ((r)[HOST_BX]) -#define REGS_RCX(r) ((r)[HOST_CX]) -#define REGS_RDX(r) ((r)[HOST_DX]) -#define REGS_RSI(r) ((r)[HOST_SI]) -#define REGS_RDI(r) ((r)[HOST_DI]) -#define REGS_RBP(r) ((r)[HOST_BP]) -#define REGS_RAX(r) ((r)[HOST_AX]) #define REGS_R8(r) ((r)[HOST_R8]) #define REGS_R9(r) ((r)[HOST_R9]) #define REGS_R10(r) ((r)[HOST_R10]) @@ -32,9 +18,6 @@ #define REGS_R13(r) ((r)[HOST_R13]) #define REGS_R14(r) ((r)[HOST_R14]) #define REGS_R15(r) ((r)[HOST_R15]) -#define REGS_CS(r) ((r)[HOST_CS]) -#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) -#define REGS_SS(r) ((r)[HOST_SS]) #define HOST_FS_BASE 21 #define HOST_GS_BASE 22 @@ -58,45 +41,6 @@ #define GS (HOST_GS * sizeof(long)) #endif -#define REGS_FS_BASE(r) ((r)[HOST_FS_BASE]) -#define REGS_GS_BASE(r) ((r)[HOST_GS_BASE]) -#define REGS_DS(r) ((r)[HOST_DS]) -#define REGS_ES(r) ((r)[HOST_ES]) -#define REGS_FS(r) ((r)[HOST_FS]) -#define REGS_GS(r) ((r)[HOST_GS]) - -#define REGS_ORIG_RAX(r) ((r)[HOST_ORIG_AX]) - -#define REGS_SET_SYSCALL_RETURN(r, res) REGS_RAX(r) = (res) - -#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) -#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) - -#define REGS_FAULT_ADDR(r) ((r)->fault_addr) - -#define REGS_FAULT_WRITE(r) FAULT_WRITE((r)->fault_type) - -#define REGS_TRAP(r) ((r)->trap_type) - -#define REGS_ERR(r) ((r)->fault_type) - -struct uml_pt_regs { - unsigned long gp[MAX_REG_NR]; - unsigned long fp[HOST_FP_SIZE]; - struct faultinfo faultinfo; - long syscall; - int is_user; -}; - -#define EMPTY_UML_PT_REGS { } - -#define UPT_RBX(r) REGS_RBX((r)->gp) -#define UPT_RCX(r) REGS_RCX((r)->gp) -#define UPT_RDX(r) REGS_RDX((r)->gp) -#define UPT_RSI(r) REGS_RSI((r)->gp) -#define UPT_RDI(r) REGS_RDI((r)->gp) -#define UPT_RBP(r) REGS_RBP((r)->gp) -#define UPT_RAX(r) REGS_RAX((r)->gp) #define UPT_R8(r) REGS_R8((r)->gp) #define UPT_R9(r) REGS_R9((r)->gp) #define UPT_R10(r) REGS_R10((r)->gp) @@ -105,51 +49,14 @@ struct uml_pt_regs { #define UPT_R13(r) REGS_R13((r)->gp) #define UPT_R14(r) REGS_R14((r)->gp) #define UPT_R15(r) REGS_R15((r)->gp) -#define UPT_CS(r) REGS_CS((r)->gp) -#define UPT_FS_BASE(r) REGS_FS_BASE((r)->gp) -#define UPT_FS(r) REGS_FS((r)->gp) -#define UPT_GS_BASE(r) REGS_GS_BASE((r)->gp) -#define UPT_GS(r) REGS_GS((r)->gp) -#define UPT_DS(r) REGS_DS((r)->gp) -#define UPT_ES(r) REGS_ES((r)->gp) -#define UPT_CS(r) REGS_CS((r)->gp) -#define UPT_SS(r) REGS_SS((r)->gp) -#define UPT_ORIG_RAX(r) REGS_ORIG_RAX((r)->gp) - -#define UPT_IP(r) REGS_IP((r)->gp) -#define UPT_SP(r) REGS_SP((r)->gp) - -#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) -#define UPT_SYSCALL_NR(r) ((r)->syscall) -#define UPT_SYSCALL_RET(r) UPT_RAX(r) - -extern int user_context(unsigned long sp); -#define UPT_IS_USER(r) ((r)->is_user) - -#define UPT_SYSCALL_ARG1(r) UPT_RDI(r) -#define UPT_SYSCALL_ARG2(r) UPT_RSI(r) -#define UPT_SYSCALL_ARG3(r) UPT_RDX(r) +#define UPT_SYSCALL_ARG1(r) UPT_DI(r) +#define UPT_SYSCALL_ARG2(r) UPT_SI(r) +#define UPT_SYSCALL_ARG3(r) UPT_DX(r) #define UPT_SYSCALL_ARG4(r) UPT_R10(r) #define UPT_SYSCALL_ARG5(r) UPT_R8(r) #define UPT_SYSCALL_ARG6(r) UPT_R9(r) -struct syscall_args { - unsigned long args[6]; -}; - -#define SYSCALL_ARGS(r) ((struct syscall_args) \ - { .args = { UPT_SYSCALL_ARG1(r), \ - UPT_SYSCALL_ARG2(r), \ - UPT_SYSCALL_ARG3(r), \ - UPT_SYSCALL_ARG4(r), \ - UPT_SYSCALL_ARG5(r), \ - UPT_SYSCALL_ARG6(r) } } ) - -#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) - -#define UPT_FAULTINFO(r) (&(r)->faultinfo) - static inline void arch_init_registers(int pid) { } diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 4883b954601..bb0fb03b9f8 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -156,6 +156,9 @@ static int copy_sc_from_user(struct pt_regs *regs, struct sigcontext sc; int err, pid; + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + err = copy_from_user(&sc, from, sizeof(sc)); if (err) return err; @@ -410,9 +413,9 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; - PT_REGS_EAX(regs) = (unsigned long) sig; - PT_REGS_EDX(regs) = (unsigned long) 0; - PT_REGS_ECX(regs) = (unsigned long) 0; + PT_REGS_AX(regs) = (unsigned long) sig; + PT_REGS_DX(regs) = (unsigned long) 0; + PT_REGS_CX(regs) = (unsigned long) 0; if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); @@ -460,9 +463,9 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; - PT_REGS_EAX(regs) = (unsigned long) sig; - PT_REGS_EDX(regs) = (unsigned long) &frame->info; - PT_REGS_ECX(regs) = (unsigned long) &frame->uc; + PT_REGS_AX(regs) = (unsigned long) sig; + PT_REGS_DX(regs) = (unsigned long) &frame->info; + PT_REGS_CX(regs) = (unsigned long) &frame->uc; if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); @@ -541,8 +544,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, set->sig[0]); err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); + err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); } else err |= __copy_to_user(&frame->uc.uc_sigmask, set, @@ -570,17 +573,17 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, } PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_RDI(regs) = sig; + PT_REGS_DI(regs) = sig; /* In case the signal handler was declared without prototypes */ - PT_REGS_RAX(regs) = 0; + PT_REGS_AX(regs) = 0; /* * This also works for non SA_SIGINFO handlers because they expect the * next argument after the signal number on the stack. */ - PT_REGS_RSI(regs) = (unsigned long) &frame->info; - PT_REGS_RDX(regs) = (unsigned long) &frame->uc; - PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_SI(regs) = (unsigned long) &frame->info; + PT_REGS_DX(regs) = (unsigned long) &frame->uc; + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; out: return err; } diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 9924776f426..170bd926a69 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -31,7 +31,6 @@ #define stub_fork sys_fork #define stub_vfork sys_vfork #define stub_execve sys_execve -#define stub_rt_sigsuspend sys_rt_sigsuspend #define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c index 70ca357393b..b853e8600b9 100644 --- a/arch/x86/um/syscalls_32.c +++ b/arch/x86/um/syscalls_32.c @@ -44,10 +44,10 @@ long sys_sigaction(int sig, const struct old_sigaction __user *act, old_sigset_t mask; if (!access_ok(VERIFY_READ, act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) return -EFAULT; - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); siginitset(&new_ka.sa.sa_mask, mask); } @@ -56,10 +56,10 @@ long sys_sigaction(int sig, const struct old_sigaction __user *act, if (!ret && oact) { if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); } return ret; diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c index 171b3e9dc86..2d5cc51e9be 100644 --- a/arch/x86/um/sysrq_32.c +++ b/arch/x86/um/sysrq_32.c @@ -23,12 +23,10 @@ void show_regs(struct pt_regs *regs) printk(" EFLAGS: %08lx\n %s\n", PT_REGS_EFLAGS(regs), print_tainted()); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - PT_REGS_EAX(regs), PT_REGS_EBX(regs), - PT_REGS_ECX(regs), - PT_REGS_EDX(regs)); + PT_REGS_AX(regs), PT_REGS_BX(regs), + PT_REGS_CX(regs), PT_REGS_DX(regs)); printk("ESI: %08lx EDI: %08lx EBP: %08lx", - PT_REGS_ESI(regs), PT_REGS_EDI(regs), - PT_REGS_EBP(regs)); + PT_REGS_SI(regs), PT_REGS_DI(regs), PT_REGS_BP(regs)); printk(" DS: %04lx ES: %04lx\n", 0xffff & PT_REGS_DS(regs), 0xffff & PT_REGS_ES(regs)); diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index e8913436d7d..08258f17996 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -19,15 +19,15 @@ void __show_regs(struct pt_regs *regs) printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release); printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, - PT_REGS_RIP(regs)); + PT_REGS_IP(regs)); printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_SP(regs), PT_REGS_EFLAGS(regs)); printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", - PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs)); + PT_REGS_AX(regs), PT_REGS_BX(regs), PT_REGS_CX(regs)); printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", - PT_REGS_RDX(regs), PT_REGS_RSI(regs), PT_REGS_RDI(regs)); + PT_REGS_DX(regs), PT_REGS_SI(regs), PT_REGS_DI(regs)); printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", - PT_REGS_RBP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs)); + PT_REGS_BP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs)); printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs)); printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index c6c7131e563..baba84f8ecb 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -219,7 +219,7 @@ int arch_copy_tls(struct task_struct *new) int idx, ret = -EFAULT; if (copy_from_user(&info, - (void __user *) UPT_ESI(&new->thread.regs.regs), + (void __user *) UPT_SI(&new->thread.regs.regs), sizeof(info))) goto out; diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index add2c2d729c..96ab2c09cb6 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -20,5 +20,5 @@ obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += vga.o +obj-$(CONFIG_XEN_DOM0) += apic.o vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c new file mode 100644 index 00000000000..ec57bd3818a --- /dev/null +++ b/arch/x86/xen/apic.c @@ -0,0 +1,33 @@ +#include <linux/init.h> + +#include <asm/x86_init.h> +#include <asm/apic.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/interface/physdev.h> + +unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +{ + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mpc_ioapic_addr(apic); + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (!ret) + return apic_op.value; + + /* fallback to return an emulated IO_APIC values */ + if (reg == 0x1) + return 0x00170020; + else if (reg == 0x0) + return apic << 24; + + return 0xfd; +} + +void __init xen_init_apic(void) +{ + x86_io_apic_ops.read = xen_io_apic_read; +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a8f8844b8d3..c0f5facdb10 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -63,6 +63,7 @@ #include <asm/stackprotector.h> #include <asm/hypervisor.h> #include <asm/mwait.h> +#include <asm/pci_x86.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -809,9 +810,40 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC +static unsigned long xen_set_apic_id(unsigned int x) +{ + WARN_ON(1); + return x; +} +static unsigned int xen_get_apic_id(unsigned long x) +{ + return ((x)>>24) & 0xFFu; +} static u32 xen_apic_read(u32 reg) { - return 0; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.pcpu_info.xen_cpuid = 0, + }; + int ret = 0; + + /* Shouldn't need this as APIC is turned off for PV, and we only + * get called on the bootup processor. But just in case. */ + if (!xen_initial_domain() || smp_processor_id()) + return 0; + + if (reg == APIC_LVR) + return 0x10; + + if (reg != APIC_ID) + return 0; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return 0; + + return op.u.pcpu_info.apic_id << 24; } static void xen_apic_write(u32 reg, u32 val) @@ -849,6 +881,8 @@ static void set_xen_basic_apic_ops(void) apic->icr_write = xen_apic_icr_write; apic->wait_icr_idle = xen_apic_wait_icr_idle; apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; + apic->set_apic_id = xen_set_apic_id; + apic->get_apic_id = xen_get_apic_id; } #endif @@ -1362,11 +1396,15 @@ asmlinkage void __init xen_start_kernel(void) xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; + xen_init_apic(); + /* Make sure ACS will be enabled */ pci_request_acs(); } - - +#ifdef CONFIG_PCI + /* PCI BIOS service won't work from a PV guest. */ + pci_probe &= ~PCI_PROBE_BIOS; +#endif xen_raw_console_write("about to get started...\n"); xen_setup_runstate_info(0); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b8e279479a6..3506cd4f9a4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -353,8 +353,13 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long pfn = mfn_to_pfn(mfn); + pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + if (unlikely(pfn == ~0)) + val = flags & ~_PAGE_PRESENT; + else + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; } return val; @@ -1859,7 +1864,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, #endif /* CONFIG_X86_64 */ static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; -static unsigned char fake_ioapic_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { @@ -1900,7 +1904,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) * We just don't map the IO APIC - all access is via * hypercalls. Keep the address in the pte for reference. */ - pte = pfn_pte(PFN_DOWN(__pa(fake_ioapic_mapping)), PAGE_KERNEL); + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); break; #endif @@ -2065,7 +2069,6 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); - memset(fake_ioapic_mapping, 0xfd, PAGE_SIZE); } /* Protected by xen_reservation_lock. */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 0503c0c493a..3700945ed0d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -265,18 +265,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) set_cpu_possible(cpu, false); } - for_each_possible_cpu (cpu) { - struct task_struct *idle; - - if (cpu == 0) - continue; - - idle = fork_idle(cpu); - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - + for_each_possible_cpu(cpu) set_cpu_present(cpu, true); - } } static int __cpuinit @@ -346,9 +336,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int __cpuinit xen_cpu_up(unsigned int cpu) +static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle) { - struct task_struct *idle = idle_task(cpu); int rc; per_cpu(current_task, cpu) = idle; @@ -562,10 +551,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) xen_init_lock_cpu(0); } -static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) +static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc; - rc = native_cpu_up(cpu); + rc = native_cpu_up(cpu, tidle); WARN_ON (xen_smp_intr_init(cpu)); return rc; } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b095739ccd4..45c0c0667bd 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -92,11 +92,15 @@ struct dom0_vga_console_info; #ifdef CONFIG_XEN_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +void __init xen_init_apic(void); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) { } +static inline void __init xen_init_apic(void) +{ +} #endif /* Declare an asm function, along with symbols needed to make it |