From 9b483417527f2e47985856867c5716df013227c7 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] i386: fix buggy MTRR address checks Fix checks that failed to realize that values are 4-kB-unit-sized (note the format strings in this same diff context which *do* realize the unit size, via appended "000"!). Also fix an incorrect below-1MB area check (as gathered from Jan Beulich's unapplied patch at http://www.ussg.iu.edu/hypermail/linux/kernel/0411.1/1378.html ) Update mtrr_add_page() docu to make 4-kB-sized calculation more obvious. Given several further items mentioned in Jan's patch mail, all in all MTRR code seems surprisingly buggy, for a surprisingly long period of time (many years). Further work/investigation would be useful. TBD Note that my patch is pretty much UNTESTED, since I can only verify that it TBD successfully boots my machine, but I cannot test against actual buggy TBD hardware which would require these (formerly broken) checks. Long -mm TBD simmering would make sense, especially since these now-working checks might TBD turn out to have adverse effects on unaffected hardware. 
Signed-off-by: Andreas Mohr Signed-off-by: Andi Kleen Acked-by: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/mtrr/generic.c | 4 ++-- arch/i386/kernel/cpu/mtrr/main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index 0b61eed8bbd..ee8dc675395 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -366,7 +366,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } - if (!(base + size < 0x70000000 || base > 0x7003FFFF) && + if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); @@ -374,7 +374,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i } } - if (base + size < 0x100) { + if (base < 0x100) { printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", base, size); return -EINVAL; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index fff90bda473..2b8b0b361cc 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -263,8 +263,8 @@ static void set_mtrr(unsigned int reg, unsigned long base, /** * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (4 KB) - * @size: Physical size of region in pages (4 KB) + * @base: Physical base address of region in pages (in units of 4 kB!) 
+ * @size: Physical size of region in pages (4 kB) * @type: Type of MTRR desired * @increment: If this is true do usage counting on the region * -- cgit v1.2.3-70-g09d2 From b615ebdac97c648a2ae7d23c5a0bbb3972adf928 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] x86: shorten lines in unwinder to be <= 80 characters Andrew complained about > 80 character lines in the new unwinder. Fix that. Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 16 ++++++++++------ arch/x86_64/kernel/traps.c | 21 +++++++++++++-------- 2 files changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index fe9c5e8e7e6..fe81d89c50d 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -173,6 +173,8 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data) return n; } +#define MSG(msg) ops->warning(data, msg) + void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, struct stacktrace_ops *ops, void *data) @@ -191,29 +193,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (unwind_init_frame_info(&info, task, regs) == 0) unw_ret = dump_trace_unwind(&info, &oad); } else if (task == current) - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); + unw_ret = unwind_init_running(&info, dump_trace_unwind, + &oad); else { if (unwind_init_blocked(&info, task) == 0) unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, + "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if (UNW_SP(&info) >= PAGE_OFFSET) { - ops->warning(data, "Leftover inexact backtrace:\n"); + MSG("Leftover inexact backtrace:\n"); stack = (void *)UNW_SP(&info); if (!stack) return; ebp = UNW_FP(&info); } else - ops->warning(data, "Full inexact backtrace again:\n"); + 
MSG("Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - ops->warning(data, "Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:\n"); } else - ops->warning(data, "Inexact backtrace:\n"); + MSG("Inexact backtrace:\n"); } if (!stack) { unsigned long dummy; diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 0d65b22f229..d3f43c958ca 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -235,6 +235,8 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) return n; } +#define MSG(txt) ops->warning(data, txt) + /* * x86-64 can have upto three kernel stacks: * process stack @@ -248,11 +250,12 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) return p > t && p < t + THREAD_SIZE - 3; } -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, + unsigned long *stack, struct stacktrace_ops *ops, void *data) { const unsigned cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; @@ -268,28 +271,30 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (unwind_init_frame_info(&info, tsk, regs) == 0) unw_ret = dump_trace_unwind(&info, &oad); } else if (tsk == current) - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); + unw_ret = unwind_init_running(&info, dump_trace_unwind, + &oad); else { if (unwind_init_blocked(&info, tsk) == 0) unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, + "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if ((long)UNW_SP(&info) < 0) { - ops->warning(data, 
"Leftover inexact backtrace:\n"); + MSG("Leftover inexact backtrace:"); stack = (unsigned long *)UNW_SP(&info); if (!stack) return; } else - ops->warning(data, "Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - ops->warning(data, "Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:\n"); } else - ops->warning(data, "Inexact backtrace:\n"); + MSG("Inexact backtrace:\n"); } if (!stack) { unsigned long dummy; -- cgit v1.2.3-70-g09d2 From dd315df1767cf56bd4fb8d730fdff4a3d7e15d84 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] x86: Compress stack unwinder output The unwinder has some extra newlines, which eat up loads of screen space when it spews. (See https://bugzilla.redhat.com/bugzilla/attachment.cgi?id=137900 for a nasty example). warning_symbol-> and warning-> already printk a newline, so don't add one in the strings passed to them. [AK: redone for new code] Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 10 +++++----- arch/x86_64/kernel/traps.c | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index fe81d89c50d..396041a46e3 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -202,22 +202,22 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { ops->warning_symbol(data, - "DWARF2 unwinder stuck at %s\n", + "DWARF2 unwinder stuck at %s", UNW_PC(&info)); if (UNW_SP(&info) >= PAGE_OFFSET) { - MSG("Leftover inexact backtrace:\n"); + MSG("Leftover inexact backtrace:"); stack = (void *)UNW_SP(&info); if (!stack) return; ebp = UNW_FP(&info); } else - MSG("Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else if (call_trace >= 1) return; else - MSG("Full inexact 
backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else - MSG("Inexact backtrace:\n"); + MSG("Inexact backtrace:"); } if (!stack) { unsigned long dummy; diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index d3f43c958ca..eedd4e759c3 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -280,7 +280,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { ops->warning_symbol(data, - "DWARF2 unwinder stuck at %s\n", + "DWARF2 unwinder stuck at %s", UNW_PC(&info)); if ((long)UNW_SP(&info) < 0) { MSG("Leftover inexact backtrace:"); @@ -288,13 +288,13 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (!stack) return; } else - MSG("Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else if (call_trace >= 1) return; else - MSG("Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else - MSG("Inexact backtrace:\n"); + MSG("Inexact backtrace:"); } if (!stack) { unsigned long dummy; -- cgit v1.2.3-70-g09d2 From a63954b5cad5765e52870bb649992bf636f32a6b Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] i386: remove pointless printk from i386 oops output This just got removed on x86-64, do the same on 32bit. It always annoyed me when this ate a line of oops output pushing interesting stuff off the screen. 
Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 396041a46e3..48ebfab661b 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -777,7 +777,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg) printk(" on CPU%d, eip %08lx, registers:\n", smp_processor_id(), regs->eip); show_registers(regs); - printk(KERN_EMERG "console shuts up ...\n"); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); -- cgit v1.2.3-70-g09d2 From 42ed458aa51337357d7632c64aed4528f923e829 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: i386 add X86_FEATURE_PEBS and detection Here is a patch (used by perfmon2) to detect the presence of the Precise Event Based Sampling (PEBS) feature for i386. The patch also adds the cpu_has_pebs macro. - adds X86_FEATURE_PEBS - adds cpu_has_pebs to test for X86_FEATURE_PEBS Signed-off-by: stephane eranian Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/intel.c | 8 +++++++- include/asm-i386/cpufeature.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 94a95aa5227..798c2f617e8 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -195,8 +195,14 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); -} + if (cpu_has_ds) { + unsigned int l1; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<12))) + set_bit(X86_FEATURE_PEBS, c->x86_capability); + } +} static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) { diff --git 
a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index 69ce35049a0..231672558c1 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -73,6 +73,7 @@ #define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ @@ -135,6 +136,7 @@ #define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM) #define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) #define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) +#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) #endif /* __ASM_I386_CPUFEATURE_H */ -- cgit v1.2.3-70-g09d2 From e5e3a0428968dcc1f9318ce1c941a918e99f8b84 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: remove default_ldt, and simplify ldt-setting. This patch removes the default_ldt[] array, as it has been unused since iBCS stopped being supported. This means it is now possible to actually set an empty LDT segment. In order to deal with this, the set_ldt_desc/load_LDT pair has been replaced with a single set_ldt() operation which is responsible for both setting up the LDT descriptor in the GDT, and reloading the LDT register. If there are no LDT entries, the LDT register is loaded with a NULL descriptor. 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Acked-by: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/ldt.c | 4 +--- arch/i386/kernel/traps.c | 3 --- include/asm-i386/desc.h | 50 ++++++++++++++++-------------------------- include/asm-i386/mmu_context.h | 4 ++-- 4 files changed, 22 insertions(+), 39 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c index 445211eb2d5..b410e5fb034 100644 --- a/arch/i386/kernel/ldt.c +++ b/arch/i386/kernel/ldt.c @@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount) { int err; unsigned long size; - void *address; err = 0; - address = &default_ldt[0]; size = 5*sizeof(struct desc_struct); if (size > bytecount) size = bytecount; err = size; - if (copy_to_user(ptr, address, size)) + if (clear_user(ptr, size)) err = -EFAULT; return err; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 48ebfab661b..56655ea8d98 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -61,9 +61,6 @@ int panic_on_unrecovered_nmi; asmlinkage int system_call(void); -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; - /* Do we ignore FPU interrupts ? */ char ignore_fpu_irq = 0; diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 5874ef119ff..a0398f780ca 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -33,11 +33,6 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; } -/* - * This is the ldt that every process will get unless we need - * something other than this. 
- */ -extern struct desc_struct default_ldt[]; extern struct desc_struct idt_table[]; extern void set_intr_gate(unsigned int irq, void * addr); @@ -65,7 +60,6 @@ static inline void pack_gate(__u32 *a, __u32 *b, #define DESCTYPE_S 0x10 /* !system */ #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) -#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) @@ -115,13 +109,20 @@ static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const vo write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); } -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries) +static inline void set_ldt(void *addr, unsigned int entries) { - __u32 a, b; - pack_descriptor(&a, &b, (unsigned long)addr, - entries * sizeof(struct desc_struct) - 1, - DESCTYPE_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); + if (likely(entries == 0)) + __asm__ __volatile__("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + __u32 a, b; + + pack_descriptor(&a, &b, (unsigned long)addr, + entries * sizeof(struct desc_struct) - 1, + DESCTYPE_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } } #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) @@ -153,35 +154,22 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entri static inline void clear_LDT(void) { - int cpu = get_cpu(); - - set_ldt_desc(cpu, &default_ldt[0], 5); - load_LDT_desc(); - put_cpu(); + set_ldt(NULL, 0); } /* * load one particular LDT into the current CPU */ -static inline void load_LDT_nolock(mm_context_t *pc, int cpu) +static inline void load_LDT_nolock(mm_context_t *pc) { - void *segments = pc->ldt; - int count = pc->size; - - if (likely(!count)) { - 
segments = &default_ldt[0]; - count = 5; - } - - set_ldt_desc(cpu, segments, count); - load_LDT_desc(); + set_ldt(pc->ldt, pc->size); } static inline void load_LDT(mm_context_t *pc) { - int cpu = get_cpu(); - load_LDT_nolock(pc, cpu); - put_cpu(); + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); } static inline unsigned long get_desc_base(unsigned long *desc) diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h index 62b7bf18409..1b1495372c4 100644 --- a/include/asm-i386/mmu_context.h +++ b/include/asm-i386/mmu_context.h @@ -44,7 +44,7 @@ static inline void switch_mm(struct mm_struct *prev, * load the LDT, if the LDT is different: */ if (unlikely(prev->context.ldt != next->context.ldt)) - load_LDT_nolock(&next->context, cpu); + load_LDT_nolock(&next->context); } #ifdef CONFIG_SMP else { @@ -56,7 +56,7 @@ static inline void switch_mm(struct mm_struct *prev, * tlb flush IPI delivery. We must reload %cr3. */ load_cr3(next->pgd); - load_LDT_nolock(&next->context, cpu); + load_LDT_nolock(&next->context); } } #endif -- cgit v1.2.3-70-g09d2 From bb81a09e55eaf7e5f798468ab971469b6f66a259 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] x86: all cpu backtrace When a spinlock lockup occurs, arrange for the NMI code to emit an all-cpu backtrace, so we get to see which CPU is holding the lock, and where. 
Cc: Andi Kleen Cc: Ingo Molnar Cc: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 26 ++++++++++++++++++++++++++ arch/x86_64/kernel/nmi.c | 29 ++++++++++++++++++++++++++++- include/asm-i386/nmi.h | 8 ++++++++ include/asm-x86_64/nmi.h | 3 +++ include/linux/nmi.h | 5 +++++ lib/spinlock_debug.c | 4 ++++ 6 files changed, 74 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index eaafe233a5d..171194ccb7b 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ int nmi_watchdog_enabled; static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); +static cpumask_t backtrace_mask = CPU_MASK_NONE; + /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) */ @@ -907,6 +910,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) touched = 1; } + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + sum = per_cpu(irq_stat, cpu).apic_timer_irqs; /* if the apic timer isn't firing, this cpu isn't doing much */ @@ -1033,6 +1046,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, #endif +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); diff --git 
a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 7af9cb3e2d9..27e95e7922c 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -12,14 +12,15 @@ * Mikael Pettersson : PM converted to driver model. Disable/enable API. */ +#include #include #include #include #include #include -#include #include #include +#include #include #include @@ -41,6 +42,8 @@ int panic_on_unrecovered_nmi; static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner); static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); +static cpumask_t backtrace_mask = CPU_MASK_NONE; + /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) */ @@ -782,6 +785,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = smp_processor_id(); struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 dummy; int rc=0; @@ -799,6 +803,16 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) touched = 1; } + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ @@ -931,6 +945,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, #endif +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 269d315719c..b04333ea6f3 100644 --- a/include/asm-i386/nmi.h 
+++ b/include/asm-i386/nmi.h @@ -5,6 +5,9 @@ #define ASM_NMI_H #include +#include + +#ifdef ARCH_HAS_NMI_WATCHDOG /** * do_nmi_callback @@ -42,4 +45,9 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; +void __trigger_all_cpu_backtrace(void); +#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() + +#endif + #endif /* ASM_NMI_H */ diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index f367d4014b4..72375e7d32a 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -77,4 +77,7 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, extern int unknown_nmi_panic; +void __trigger_all_cpu_backtrace(void); +#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() + #endif /* ASM_NMI_H */ diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e16904e28c3..acb4ed13024 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -15,9 +15,14 @@ * disables interrupts for a long time. This call is stateless. 
*/ #ifdef ARCH_HAS_NMI_WATCHDOG +#include extern void touch_nmi_watchdog(void); #else # define touch_nmi_watchdog() touch_softlockup_watchdog() #endif +#ifndef trigger_all_cpu_backtrace +#define trigger_all_cpu_backtrace() do { } while (0) +#endif + #endif diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c index b6c4f898197..479fd462eaa 100644 --- a/lib/spinlock_debug.c +++ b/lib/spinlock_debug.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -117,6 +118,9 @@ static void __spin_lock_debug(spinlock_t *lock) raw_smp_processor_id(), current->comm, current->pid, lock); dump_stack(); +#ifdef CONFIG_SMP + trigger_all_cpu_backtrace(); +#endif } } } -- cgit v1.2.3-70-g09d2 From be44d2aabce2d62f72d5751d1871b6212bf7a1c7 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: espfix cleanup Clean up the espfix code: - Introduced PER_CPU() macro to be used from asm - Introduced GET_DESC_BASE() macro to be used from asm - Rewrote the fixup code in asm, as calling a C code with the altered %ss appeared to be unsafe - No longer altering the stack from a .fixup section - 16bit per-cpu stack is no longer used, instead the stack segment base is patched the way so that the high word of the kernel and user %esp are the same. - Added the limit-patching for the espfix segment. 
(Chuck Ebbert) [jeremy@goop.org: use the x86 scaling addressing mode rather than shifting] Signed-off-by: Stas Sergeev Signed-off-by: Andi Kleen Acked-by: Zachary Amsden Acked-by: Chuck Ebbert <76306.1226@compuserve.com> Acked-by: Jan Beulich Cc: Andi Kleen Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 5 +++ arch/i386/kernel/cpu/common.c | 11 ------- arch/i386/kernel/entry.S | 73 +++++++++++++++++++----------------------- arch/i386/kernel/head.S | 2 +- arch/i386/kernel/traps.c | 57 +++++++++------------------------ include/asm-i386/desc.h | 27 +++++++++++++--- include/asm-i386/percpu.h | 25 +++++++++++++++ 7 files changed, 103 insertions(+), 97 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index c80271f8f08..e94d910a28b 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -58,6 +58,11 @@ void foo(void) OFFSET(TI_sysenter_return, thread_info, sysenter_return); BLANK(); + OFFSET(GDS_size, Xgt_desc_struct, size); + OFFSET(GDS_address, Xgt_desc_struct, address); + OFFSET(GDS_pad, Xgt_desc_struct, pad); + BLANK(); + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index d9f3e3c31f0..5532fc4e1bf 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -24,9 +24,6 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); -DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); -EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); - static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -603,7 +600,6 @@ void __cpuinit cpu_init(void) struct tss_struct * t = &per_cpu(init_tss, cpu); struct thread_struct *thread = 
&current->thread; struct desc_struct *gdt; - __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu); struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); if (cpu_test_and_set(cpu, cpu_initialized)) { @@ -651,13 +647,6 @@ old_gdt: * and set up the GDT descriptor: */ memcpy(gdt, cpu_gdt_table, GDT_SIZE); - - /* Set up GDT entry for 16bit stack */ - *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= - ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | - (CPU_16BIT_STACK_SIZE - 1); - cpu_gdt_descr->size = GDT_SIZE - 1; cpu_gdt_descr->address = (unsigned long)gdt; diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 5a63d6fdb70..c38d801ba0b 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -48,6 +48,7 @@ #include #include #include +#include #include #include "irq_vectors.h" @@ -418,23 +419,18 @@ ldt_ss: * This is an "official" bug of all the x86-compatible * CPUs, which we can try to work around to make * dosemu and wine happy. */ - subl $8, %esp # reserve space for switch16 pointer - CFI_ADJUST_CFA_OFFSET 8 + movl OLDESP(%esp), %eax + movl %esp, %edx + call patch_espfix_desc + pushl $__ESPFIX_SS + CFI_ADJUST_CFA_OFFSET 4 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 DISABLE_INTERRUPTS TRACE_IRQS_OFF - movl %esp, %eax - /* Set up the 16bit stack frame with switch32 pointer on top, - * and a switch16 pointer on top of the current frame. 
*/ - call setup_x86_bogus_stack - CFI_ADJUST_CFA_OFFSET -8 # frame has moved - TRACE_IRQS_IRET - RESTORE_REGS - lss 20+4(%esp), %esp # switch to 16bit stack -1: INTERRUPT_RETURN -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous + lss (%esp), %esp + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck CFI_ENDPROC # perform work that needs to be done immediately before resumption @@ -524,30 +520,30 @@ syscall_badsys: CFI_ENDPROC #define FIXUP_ESPFIX_STACK \ - movl %esp, %eax; \ - /* switch to 32bit stack using the pointer on top of 16bit stack */ \ - lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ - /* copy data from 16bit stack to 32bit stack */ \ - call fixup_x86_bogus_stack; \ - /* put ESP to the proper location */ \ - movl %eax, %esp; -#define UNWIND_ESPFIX_STACK \ + /* since we are on a wrong stack, we cant make it a C code :( */ \ + GET_THREAD_INFO(%ebp); \ + movl TI_cpu(%ebp), %ebx; \ + PER_CPU(cpu_gdt_descr, %ebx); \ + movl GDS_address(%ebx), %ebx; \ + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ + addl %esp, %eax; \ + pushl $__KERNEL_DS; \ + CFI_ADJUST_CFA_OFFSET 4; \ pushl %eax; \ CFI_ADJUST_CFA_OFFSET 4; \ + lss (%esp), %esp; \ + CFI_ADJUST_CFA_OFFSET -8; +#define UNWIND_ESPFIX_STACK \ movl %ss, %eax; \ - /* see if on 16bit stack */ \ + /* see if on espfix stack */ \ cmpw $__ESPFIX_SS, %ax; \ - je 28f; \ -27: popl %eax; \ - CFI_ADJUST_CFA_OFFSET -4; \ -.section .fixup,"ax"; \ -28: movl $__KERNEL_DS, %eax; \ + jne 27f; \ + movl $__KERNEL_DS, %eax; \ movl %eax, %ds; \ movl %eax, %es; \ - /* switch to 32bit stack */ \ + /* switch to normal stack */ \ FIXUP_ESPFIX_STACK; \ - jmp 27b; \ -.previous +27:; /* * Build the entry stubs and pointer table with @@ -614,7 +610,6 @@ error_code: pushl %eax CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eax, 0 - xorl %eax, %eax pushl %ebp CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebp, 0 @@ -627,7 +622,6 @@ error_code: pushl %edx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET edx, 0 - decl %eax # eax = -1 pushl %ecx 
CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx, 0 @@ -644,7 +638,7 @@ error_code: /*CFI_REGISTER es, ecx*/ movl ES(%esp), %edi # get the function address movl ORIG_EAX(%esp), %edx # get the error code - movl %eax, ORIG_EAX(%esp) + movl $-1, ORIG_EAX(%esp) movl %ecx, ES(%esp) /*CFI_REL_OFFSET es, ES*/ movl $(__USER_DS), %ecx @@ -754,7 +748,7 @@ KPROBE_ENTRY(nmi) cmpw $__ESPFIX_SS, %ax popl %eax CFI_ADJUST_CFA_OFFSET -4 - je nmi_16bit_stack + je nmi_espfix_stack cmpl $sysenter_entry,(%esp) je nmi_stack_fixup pushl %eax @@ -797,7 +791,7 @@ nmi_debug_stack_check: FIX_STACK(24,nmi_stack_correct, 1) jmp nmi_stack_correct -nmi_16bit_stack: +nmi_espfix_stack: /* We have a RING0_INT_FRAME here. * * create the pointer to lss back @@ -806,7 +800,6 @@ nmi_16bit_stack: CFI_ADJUST_CFA_OFFSET 4 pushl %esp CFI_ADJUST_CFA_OFFSET 4 - movzwl %sp, %esp addw $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 @@ -817,11 +810,11 @@ nmi_16bit_stack: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp - CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS - lss 12+4(%esp), %esp # back to 16bit stack + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index ca31f18d277..b1f1df11fcc 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -584,7 +584,7 @@ ENTRY(cpu_gdt_table) .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ .quad 0x004092000000ffff /* 0xc8 APM DS data */ - .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ + .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ .quad 0x0000000000000000 /* 0xd8 - unused */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 56655ea8d98..f9bb1f89d68 100644 --- a/arch/i386/kernel/traps.c +++ 
b/arch/i386/kernel/traps.c @@ -1088,49 +1088,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, #endif } -fastcall void setup_x86_bogus_stack(unsigned char * stk) +fastcall unsigned long patch_espfix_desc(unsigned long uesp, + unsigned long kesp) { - unsigned long *switch16_ptr, *switch32_ptr; - struct pt_regs *regs; - unsigned long stack_top, stack_bot; - unsigned short iret_frame16_off; int cpu = smp_processor_id(); - /* reserve the space on 32bit stack for the magic switch16 pointer */ - memmove(stk, stk + 8, sizeof(struct pt_regs)); - switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); - regs = (struct pt_regs *)stk; - /* now the switch32 on 16bit stack */ - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); - stack_top = stack_bot + CPU_16BIT_STACK_SIZE; - switch32_ptr = (unsigned long *)(stack_top - 8); - iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; - /* copy iret frame on 16bit stack */ - memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); - /* fill in the switch pointers */ - switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; - switch16_ptr[1] = __ESPFIX_SS; - switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + - 8 - CPU_16BIT_STACK_SIZE; - switch32_ptr[1] = __KERNEL_DS; -} - -fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) -{ - unsigned long *switch32_ptr; - unsigned char *stack16, *stack32; - unsigned long stack_top, stack_bot; - int len; - int cpu = smp_processor_id(); - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); - stack_top = stack_bot + CPU_16BIT_STACK_SIZE; - switch32_ptr = (unsigned long *)(stack_top - 8); - /* copy the data from 16bit stack to 32bit stack */ - len = CPU_16BIT_STACK_SIZE - 8 - sp; - stack16 = (unsigned char *)(stack_bot + sp); - stack32 = (unsigned char *) - (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); - memcpy(stack32, stack16, len); - return stack32; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 
cpu); + struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + return new_kesp; } /* diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index a0398f780ca..6cf2ac2bfde 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -4,8 +4,6 @@ #include #include -#define CPU_16BIT_STACK_SIZE 1024 - #ifndef __ASSEMBLY__ #include @@ -16,8 +14,6 @@ extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; -DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); - struct Xgt_desc_struct { unsigned short size; unsigned long address __attribute__((packed)); @@ -181,6 +177,29 @@ static inline unsigned long get_desc_base(unsigned long *desc) return base; } +#else /* __ASSEMBLY__ */ + +/* + * GET_DESC_BASE reads the descriptor base of the specified segment. + * + * Args: + * idx - descriptor index + * gdt - GDT pointer + * base - 32bit register to which the base will be written + * lo_w - lo word of the "base" register + * lo_b - lo byte of the "base" register + * hi_b - hi byte of the low word of the "base" register + * + * Example: + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. 
+ */ +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ + movb idx*8+4(gdt), lo_b; \ + movb idx*8+7(gdt), hi_b; \ + shll $16, base; \ + movw idx*8+2(gdt), lo_w; + #endif /* !__ASSEMBLY__ */ #endif diff --git a/include/asm-i386/percpu.h b/include/asm-i386/percpu.h index 5764afa4b6a..510ae1d3486 100644 --- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -1,6 +1,31 @@ #ifndef __ARCH_I386_PERCPU__ #define __ARCH_I386_PERCPU__ +#ifndef __ASSEMBLY__ #include +#else + +/* + * PER_CPU finds an address of a per-cpu variable. + * + * Args: + * var - variable name + * cpu - 32bit register containing the current CPU number + * + * The resulting address is stored in the "cpu" argument. + * + * Example: + * PER_CPU(cpu_gdt_descr, %ebx) + */ +#ifdef CONFIG_SMP +#define PER_CPU(var, cpu) \ + movl __per_cpu_offset(,cpu,4), cpu; \ + addl $per_cpu__/**/var, cpu; +#else /* ! SMP */ +#define PER_CPU(var, cpu) \ + movl $per_cpu__/**/var, cpu; +#endif /* SMP */ + +#endif /* !__ASSEMBLY__ */ #endif /* __ARCH_I386_PERCPU__ */ -- cgit v1.2.3-70-g09d2 From acc207616a91a413a50fdd8847a747c4a7324167 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: add sleazy FPU optimization i386 port of the sLeAZY-fpu feature. Chuck reports that this gives him a +/- 0.4% improvement on his simple benchmark x86_64 description follows: Right now the kernel on x86-64 has a 100% lazy fpu behavior: after *every* context switch a trap is taken for the first FPU use to restore the FPU context lazily. This is of course great for applications that have very sporadic or no FPU use (since then you avoid doing the expensive save/restore all the time). However for very frequent FPU users... you take an extra trap every context switch. The patch below adds a simple heuristic to this code: After 5 consecutive context switches of FPU use, the lazy behavior is disabled and the context gets restored every context switch. 
If the app indeed uses the FPU, the trap is avoided. (the chance of the 6th time slice using FPU after the previous 5 having done so are quite high obviously). After 256 switches, this is reset and lazy behavior is returned (until there are 5 consecutive ones again). The reason for this is to give apps that do longer bursts of FPU use still the lazy behavior back after some time. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 12 ++++++++++++ arch/i386/kernel/traps.c | 3 ++- include/asm-i386/i387.h | 5 ++++- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index dd53c58f64f..ae924c416b6 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -648,6 +648,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas __unlazy_fpu(prev_p); + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) + prefetch(&next->i387.fxsave); + /* * Reload esp0. */ @@ -697,6 +702,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas disable_tsc(prev_p, next_p); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter > 5) + math_state_restore(); + return prev_p; } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index f9bb1f89d68..4a6fa2837df 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1118,7 +1118,7 @@ fastcall unsigned long patch_espfix_desc(unsigned long uesp, * Must be called with kernel preemption disabled (in this case, * local interrupts are disabled at the call-site in entry.S). 
*/ -asmlinkage void math_state_restore(struct pt_regs regs) +asmlinkage void math_state_restore(void) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; @@ -1128,6 +1128,7 @@ asmlinkage void math_state_restore(struct pt_regs regs) init_fpu(tsk); restore_fpu(tsk); thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } #ifndef CONFIG_MATH_EMULATION diff --git a/include/asm-i386/i387.h b/include/asm-i386/i387.h index bc1d6edae1e..434936c732d 100644 --- a/include/asm-i386/i387.h +++ b/include/asm-i386/i387.h @@ -76,7 +76,9 @@ static inline void __save_init_fpu( struct task_struct *tsk ) #define __unlazy_fpu( tsk ) do { \ if (task_thread_info(tsk)->status & TS_USEDFPU) \ - save_init_fpu( tsk ); \ + save_init_fpu( tsk ); \ + else \ + tsk->fpu_counter = 0; \ } while (0) #define __clear_fpu( tsk ) \ @@ -118,6 +120,7 @@ static inline void save_init_fpu( struct task_struct *tsk ) extern unsigned short get_fpu_cwd( struct task_struct *tsk ); extern unsigned short get_fpu_swd( struct task_struct *tsk ); extern unsigned short get_fpu_mxcsr( struct task_struct *tsk ); +extern asmlinkage void math_state_restore(void); /* * Signal frame handlers... -- cgit v1.2.3-70-g09d2 From c0e84b9901c0924e2503c0aab3772a4469ba4aef Mon Sep 17 00:00:00 2001 From: Amol Lad Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Add iounmap in error paths in hpet code Signed-off-by: Amol Lad Signed-off-by: Andi Kleen --- arch/i386/kernel/time_hpet.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c index 1a2a979cf6a..1e4702dfcd0 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/i386/kernel/time_hpet.c @@ -132,14 +132,20 @@ int __init hpet_enable(void) * the single HPET timer for system time. 
*/ #ifdef CONFIG_HPET_EMULATE_RTC - if (!(id & HPET_ID_NUMBER)) + if (!(id & HPET_ID_NUMBER)) { + iounmap(hpet_virt_address); + hpet_virt_address = NULL; return -1; + } #endif hpet_period = hpet_readl(HPET_PERIOD); - if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) + if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) { + iounmap(hpet_virt_address); + hpet_virt_address = NULL; return -1; + } /* * 64 bit math @@ -156,8 +162,11 @@ int __init hpet_enable(void) hpet_use_timer = id & HPET_ID_LEGSUP; - if (hpet_timer_stop_set_go(hpet_tick)) + if (hpet_timer_stop_set_go(hpet_tick)) { + iounmap(hpet_virt_address); + hpet_virt_address = NULL; return -1; + } use_hpet = 1; -- cgit v1.2.3-70-g09d2 From fa5cecd111d235819a1d807d43216ae459a0dd6f Mon Sep 17 00:00:00 2001 From: Amol Lad Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: add missing iounmap in i386 hpet clocksource code ioremap must be balanced by an iounmap and failing to do so can result in a memory leak. 
Tested (compilation only): - using allmodconfig - making sure the files are compiling without any warning/error due to new changes Signed-off-by: Amol Lad Signed-off-by: Andi Kleen --- arch/i386/kernel/hpet.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 17647a530b2..45a8685bb60 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void) unsigned long hpet_period; void __iomem* hpet_base; u64 tmp; + int err; if (!is_hpet_enabled()) return -ENODEV; @@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void) do_div(tmp, FSEC_PER_NSEC); clocksource_hpet.mult = (u32)tmp; - return clocksource_register(&clocksource_hpet); + err = clocksource_register(&clocksource_hpet); + if (err) + iounmap(hpet_base); + + return err; } module_init(init_hpet_clocksource); -- cgit v1.2.3-70-g09d2 From eb5b7b9d86f46b45ba1f986302fdf7df84fb8297 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Use asm-offsets for the offsets of registers into the pt_regs struct Use asm-offsets for the offsets of registers into the pt_regs struct, rather than having hard-coded constants I left the constants in the comments of entry.S because they're useful for reference; the code in entry.S is very dependent on the layout of pt_regs, even when using asm-offsets. 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Keith Owens Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 17 ++++++ arch/i386/kernel/entry.S | 120 ++++++++++++++++++----------------------- 2 files changed, 69 insertions(+), 68 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index e94d910a28b..70b19807acf 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -63,6 +63,23 @@ void foo(void) OFFSET(GDS_pad, Xgt_desc_struct, pad); BLANK(); + OFFSET(PT_EBX, pt_regs, ebx); + OFFSET(PT_ECX, pt_regs, ecx); + OFFSET(PT_EDX, pt_regs, edx); + OFFSET(PT_ESI, pt_regs, esi); + OFFSET(PT_EDI, pt_regs, edi); + OFFSET(PT_EBP, pt_regs, ebp); + OFFSET(PT_EAX, pt_regs, eax); + OFFSET(PT_DS, pt_regs, xds); + OFFSET(PT_ES, pt_regs, xes); + OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); + OFFSET(PT_EIP, pt_regs, eip); + OFFSET(PT_CS, pt_regs, xcs); + OFFSET(PT_EFLAGS, pt_regs, eflags); + OFFSET(PT_OLDESP, pt_regs, esp); + OFFSET(PT_OLDSS, pt_regs, xss); + BLANK(); + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index c38d801ba0b..0069bf01603 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -54,22 +54,6 @@ #define nr_syscalls ((syscall_table_size)/4) -EBX = 0x00 -ECX = 0x04 -EDX = 0x08 -ESI = 0x0C -EDI = 0x10 -EBP = 0x14 -EAX = 0x18 -DS = 0x1C -ES = 0x20 -ORIG_EAX = 0x24 -EIP = 0x28 -CS = 0x2C -EFLAGS = 0x30 -OLDESP = 0x34 -OLDSS = 0x38 - CF_MASK = 0x00000001 TF_MASK = 0x00000100 IF_MASK = 0x00000200 @@ -93,7 +77,7 @@ VM_MASK = 0x00020000 .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS - testl $IF_MASK,EFLAGS(%esp) # interrupts off? + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? 
jz 1f TRACE_IRQS_ON 1: @@ -199,18 +183,18 @@ VM_MASK = 0x00020000 #define RING0_PTREGS_FRAME \ CFI_STARTPROC simple;\ CFI_SIGNAL_FRAME;\ - CFI_DEF_CFA esp, OLDESP-EBX;\ - /*CFI_OFFSET cs, CS-OLDESP;*/\ - CFI_OFFSET eip, EIP-OLDESP;\ - /*CFI_OFFSET es, ES-OLDESP;*/\ - /*CFI_OFFSET ds, DS-OLDESP;*/\ - CFI_OFFSET eax, EAX-OLDESP;\ - CFI_OFFSET ebp, EBP-OLDESP;\ - CFI_OFFSET edi, EDI-OLDESP;\ - CFI_OFFSET esi, ESI-OLDESP;\ - CFI_OFFSET edx, EDX-OLDESP;\ - CFI_OFFSET ecx, ECX-OLDESP;\ - CFI_OFFSET ebx, EBX-OLDESP + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ + CFI_OFFSET ebx, PT_EBX-PT_OLDESP ENTRY(ret_from_fork) CFI_STARTPROC @@ -242,8 +226,8 @@ ret_from_exception: ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: - movl EFLAGS(%esp), %eax # mix EFLAGS and CS - movb CS(%esp), %al + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al andl $(VM_MASK | SEGMENT_RPL_MASK), %eax cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace @@ -266,7 +250,7 @@ need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all - testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? 
jz restore_all call preempt_schedule_irq jmp need_resched @@ -332,15 +316,15 @@ sysenter_past_esp: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) + movl %eax,PT_EAX(%esp) DISABLE_INTERRUPTS TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work /* if something modifies registers it must also disable sysexit */ - movl EIP(%esp), %edx - movl OLDESP(%esp), %ecx + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT @@ -354,7 +338,7 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - testl $TF_MASK,EFLAGS(%esp) + testl $TF_MASK,PT_EFLAGS(%esp) jz no_singlestep orl $_TIF_SINGLESTEP,TI_flags(%ebp) no_singlestep: @@ -366,7 +350,7 @@ no_singlestep: jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) # store the return value + movl %eax,PT_EAX(%esp) # store the return value syscall_exit: DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -377,12 +361,12 @@ syscall_exit: jne syscall_exit_work restore_all: - movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: OLDSS(%esp) contains the wrong/random values if we + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. # See comments in process.c:copy_thread() for details. - movb OLDSS(%esp), %ah - movb CS(%esp), %al + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE @@ -409,7 +393,7 @@ iret_exc: CFI_RESTORE_STATE ldt_ss: - larl OLDSS(%esp), %eax + larl PT_OLDSS(%esp), %eax jnz restore_nocheck testl $0x00400000, %eax # returning to 32bit stack? 
jnz restore_nocheck # allright, normal return @@ -419,7 +403,7 @@ ldt_ss: * This is an "official" bug of all the x86-compatible * CPUs, which we can try to work around to make * dosemu and wine happy. */ - movl OLDESP(%esp), %eax + movl PT_OLDESP(%esp), %eax movl %esp, %edx call patch_espfix_desc pushl $__ESPFIX_SS @@ -454,7 +438,7 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests - testl $VM_MASK, EFLAGS(%esp) + testl $VM_MASK, PT_EFLAGS(%esp) movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space @@ -479,14 +463,14 @@ work_notifysig_v86: # perform syscall exit tracing ALIGN syscall_trace_entry: - movl $-ENOSYS,EAX(%esp) + movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax xorl %edx,%edx call do_syscall_trace cmpl $0, %eax jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, # so must skip actual syscall - movl ORIG_EAX(%esp), %eax + movl PT_ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -511,11 +495,11 @@ syscall_fault: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - movl $-EFAULT,EAX(%esp) + movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace syscall_badsys: - movl $-ENOSYS,EAX(%esp) + movl $-ENOSYS,PT_EAX(%esp) jmp resume_userspace CFI_ENDPROC @@ -636,10 +620,10 @@ error_code: popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl ES(%esp), %edi # get the function address - movl ORIG_EAX(%esp), %edx # get the error code - movl $-1, ORIG_EAX(%esp) - movl %ecx, ES(%esp) + movl PT_ES(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) + movl %ecx, PT_ES(%esp) /*CFI_REL_OFFSET es, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds @@ -942,26 +926,26 @@ ENTRY(arch_unwind_init_running) movl 4(%esp), %edx movl (%esp), %ecx leal 4(%esp), %eax - movl %ebx, EBX(%edx) + movl %ebx, PT_EBX(%edx) xorl %ebx, %ebx - movl %ebx, ECX(%edx) - movl %ebx, EDX(%edx) - movl %esi, ESI(%edx) - movl 
%edi, EDI(%edx) - movl %ebp, EBP(%edx) - movl %ebx, EAX(%edx) - movl $__USER_DS, DS(%edx) - movl $__USER_DS, ES(%edx) - movl %ebx, ORIG_EAX(%edx) - movl %ecx, EIP(%edx) + movl %ebx, PT_ECX(%edx) + movl %ebx, PT_EDX(%edx) + movl %esi, PT_ESI(%edx) + movl %edi, PT_EDI(%edx) + movl %ebp, PT_EBP(%edx) + movl %ebx, PT_EAX(%edx) + movl $__USER_DS, PT_DS(%edx) + movl $__USER_DS, PT_ES(%edx) + movl %ebx, PT_ORIG_EAX(%edx) + movl %ecx, PT_EIP(%edx) movl 12(%esp), %ecx - movl $__KERNEL_CS, CS(%edx) - movl %ebx, EFLAGS(%edx) - movl %eax, OLDESP(%edx) + movl $__KERNEL_CS, PT_CS(%edx) + movl %ebx, PT_EFLAGS(%edx) + movl %eax, PT_OLDESP(%edx) movl 8(%esp), %eax movl %ecx, 8(%esp) - movl EBX(%edx), %ebx - movl $__KERNEL_DS, OLDSS(%edx) + movl PT_EBX(%edx), %ebx + movl $__KERNEL_DS, PT_OLDSS(%edx) jmpl *%eax CFI_ENDPROC ENDPROC(arch_unwind_init_running) -- cgit v1.2.3-70-g09d2 From 9ca36101a8d74704d78f10910f89d62de96f9dc8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Basic definitions for i386-pda This patch has the basic definitions of struct i386_pda, and the segment selector in the GDT. asm-i386/pda.h is more or less a direct copy of asm-x86_64/pda.h. The most interesting difference is the use of _proxy_pda, which is used to give gcc a model for the actual memory operations on the real pda structure. No actual reference is ever made to _proxy_pda, so it is never defined. 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/head.S | 2 +- include/asm-i386/pda.h | 95 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/segment.h | 5 ++- 3 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 include/asm-i386/pda.h (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index b1f1df11fcc..4a83384c5a6 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -585,7 +585,7 @@ ENTRY(cpu_gdt_table) .quad 0x004092000000ffff /* 0xc8 APM DS data */ .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xd8 - PDA */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h new file mode 100644 index 00000000000..4c39ccb1305 --- /dev/null +++ b/include/asm-i386/pda.h @@ -0,0 +1,95 @@ +/* + Per-processor Data Areas + Jeremy Fitzhardinge 2006 + Based on asm-x86_64/pda.h by Andi Kleen. + */ +#ifndef _I386_PDA_H +#define _I386_PDA_H + +#include + +struct i386_pda +{ + struct i386_pda *_pda; /* pointer to self */ +}; + +extern struct i386_pda *_cpu_pda[]; + +#define cpu_pda(i) (_cpu_pda[i]) + +#define pda_offset(field) offsetof(struct i386_pda, field) + +extern void __bad_pda_field(void); + +/* This variable is never instantiated. It is only used as a stand-in + for the real per-cpu PDA memory, so that gcc can understand what + memory operations the inline asms() below are performing. This + eliminates the need to make the asms volatile or have memory + clobbers, so gcc can readily analyse them. 
*/ +extern struct i386_pda _proxy_pda; + +#define pda_to_op(op,field,val) \ + do { \ + typedef typeof(_proxy_pda.field) T__; \ + if (0) { T__ tmp__; tmp__ = (val); } \ + switch (sizeof(_proxy_pda.field)) { \ + case 1: \ + asm(op "b %1,%%gs:%c2" \ + : "+m" (_proxy_pda.field) \ + :"ri" ((T__)val), \ + "i"(pda_offset(field))); \ + break; \ + case 2: \ + asm(op "w %1,%%gs:%c2" \ + : "+m" (_proxy_pda.field) \ + :"ri" ((T__)val), \ + "i"(pda_offset(field))); \ + break; \ + case 4: \ + asm(op "l %1,%%gs:%c2" \ + : "+m" (_proxy_pda.field) \ + :"ri" ((T__)val), \ + "i"(pda_offset(field))); \ + break; \ + default: __bad_pda_field(); \ + } \ + } while (0) + +#define pda_from_op(op,field) \ + ({ \ + typeof(_proxy_pda.field) ret__; \ + switch (sizeof(_proxy_pda.field)) { \ + case 1: \ + asm(op "b %%gs:%c1,%0" \ + : "=r" (ret__) \ + : "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + case 2: \ + asm(op "w %%gs:%c1,%0" \ + : "=r" (ret__) \ + : "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + case 4: \ + asm(op "l %%gs:%c1,%0" \ + : "=r" (ret__) \ + : "i" (pda_offset(field)), \ + "m" (_proxy_pda.field)); \ + break; \ + default: __bad_pda_field(); \ + } \ + ret__; }) + +/* Return a pointer to a pda field */ +#define pda_addr(field) \ + ((typeof(_proxy_pda.field) *)((unsigned char *)read_pda(_pda) + \ + pda_offset(field))) + +#define read_pda(field) pda_from_op("mov",field) +#define write_pda(field,val) pda_to_op("mov",field,val) +#define add_pda(field,val) pda_to_op("add",field,val) +#define sub_pda(field,val) pda_to_op("sub",field,val) +#define or_pda(field,val) pda_to_op("or",field,val) + +#endif /* _I386_PDA_H */ diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h index b7ab59685ba..5bdda79b6b5 100644 --- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -39,7 +39,7 @@ * 25 - APM BIOS support * * 26 - ESPFIX small SS - * 27 - unused + * 27 - PDA [ per-cpu private data area ] * 28 - unused * 29 - unused * 30 - 
unused @@ -74,6 +74,9 @@ #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15) +#define __KERNEL_PDA (GDT_ENTRY_PDA * 8) + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* -- cgit v1.2.3-70-g09d2 From 62111195800d80c66cdc69063ea3145878c99fbf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Initialize the per-CPU data area When a CPU is brought up, a PDA and GDT are allocated for it. The GDT's __KERNEL_PDA entry is pointed to the allocated PDA memory, so that all references using this segment descriptor will refer to the PDA. This patch rearranges CPU initialization a bit, so that the GDT/PDA are set up as early as possible in cpu_init(). Also for secondary CPUs, GDT+PDA are preallocated and initialized so all the secondary CPU needs to do is set up the ldt and load %gs. This will be important once smp_processor_id() and current use the PDA. In all cases, the PDA is set up in head.S, before a CPU starts running C code, so the PDA is always available. 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Cc: James Bottomley Cc: Matt Tolentino Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 177 ++++++++++++++++++++++++++--------- arch/i386/kernel/smpboot.c | 28 ++++-- arch/i386/mach-voyager/voyager_smp.c | 14 ++- include/asm-i386/processor.h | 3 + 4 files changed, 172 insertions(+), 50 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 5532fc4e1bf..2534e25ed74 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -18,12 +18,16 @@ #include #include #endif +#include #include "cpu.h" DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); +struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -588,41 +592,16 @@ void __init early_cpu_init(void) disable_pse = 1; #endif } -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. 
- */ -void __cpuinit cpu_init(void) +__cpuinit int alloc_gdt(int cpu) { - int cpu = smp_processor_id(); - struct tss_struct * t = &per_cpu(init_tss, cpu); - struct thread_struct *thread = &current->thread; - struct desc_struct *gdt; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt; + struct i386_pda *pda; - if (cpu_test_and_set(cpu, cpu_initialized)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) local_irq_enable(); - } - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + gdt = (struct desc_struct *)cpu_gdt_descr->address; + pda = cpu_pda(cpu); - if (cpu_has_vme || cpu_has_tsc || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } - - /* The CPU hotplug case */ - if (cpu_gdt_descr->address) { - gdt = (struct desc_struct *)cpu_gdt_descr->address; - memset(gdt, 0, PAGE_SIZE); - goto old_gdt; - } /* * This is a horrible hack to allocate the GDT. 
The problem * is that cpu_init() is called really early for the boot CPU @@ -630,36 +609,117 @@ void __cpuinit cpu_init(void) * CPUs, when bootmem will have gone away */ if (NODE_DATA(0)->bdata->node_bootmem_map) { - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); - /* alloc_bootmem_pages panics on failure, so no check */ + BUG_ON(gdt != NULL || pda != NULL); + + gdt = alloc_bootmem_pages(PAGE_SIZE); + pda = alloc_bootmem(sizeof(*pda)); + /* alloc_bootmem(_pages) panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + memset(pda, 0, sizeof(*pda)); } else { - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - if (unlikely(!gdt)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - for (;;) - local_irq_enable(); + /* GDT and PDA might already have been allocated if + this is a CPU hotplug re-insertion. */ + if (gdt == NULL) + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); + + if (pda == NULL) + pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); + + if (unlikely(!gdt || !pda)) { + free_pages((unsigned long)gdt, 0); + kfree(pda); + return 0; } } -old_gdt: + + cpu_gdt_descr->address = (unsigned long)gdt; + cpu_pda(cpu) = pda; + + return 1; +} + +/* Initial PDA used by boot CPU */ +struct i386_pda boot_pda = { + ._pda = &boot_pda, +}; + +/* Initialize the CPU's GDT and PDA. The boot CPU does this for + itself, but secondaries find this done for them. */ +__cpuinit int init_gdt(int cpu, struct task_struct *idle) +{ + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt; + struct i386_pda *pda; + + /* For non-boot CPUs, the GDT and PDA should already have been + allocated. 
*/ + if (!alloc_gdt(cpu)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); + return 0; + } + + gdt = (struct desc_struct *)cpu_gdt_descr->address; + pda = cpu_pda(cpu); + + BUG_ON(gdt == NULL || pda == NULL); + /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ memcpy(gdt, cpu_gdt_table, GDT_SIZE); cpu_gdt_descr->size = GDT_SIZE - 1; - cpu_gdt_descr->address = (unsigned long)gdt; + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, + (u32 *)&gdt[GDT_ENTRY_PDA].b, + (unsigned long)pda, sizeof(*pda) - 1, + 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ + + memset(pda, 0, sizeof(*pda)); + pda->_pda = pda; + + return 1; +} + +/* Common CPU init for both boot and secondary CPUs */ +static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) +{ + struct tss_struct * t = &per_cpu(init_tss, cpu); + struct thread_struct *thread = &curr->thread; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + + /* Reinit these anyway, even if they've already been done (on + the boot CPU, this will transition from the boot gdt+pda to + the real ones). */ load_gdt(cpu_gdt_descr); + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_tsc || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (tsc_disable && cpu_has_tsc) { + printk(KERN_NOTICE "Disabling TSC...\n"); + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? 
****/ + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + set_in_cr4(X86_CR4_TSD); + } + load_idt(&idt_descr); /* * Set up and load the per-CPU TSS and LDT */ atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; - BUG_ON(current->mm); - enter_lazy_tlb(&init_mm, current); + curr->active_mm = &init_mm; + if (curr->mm) + BUG(); + enter_lazy_tlb(&init_mm, curr); load_esp0(t, thread); set_tss_desc(cpu,t); @@ -690,6 +750,37 @@ old_gdt: mxcsr_feature_mask_init(); } +/* Entrypoint to initialize secondary CPU */ +void __cpuinit secondary_cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + + _cpu_init(cpu, curr); +} + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + + /* Set up the real GDT and PDA, so we can transition from the + boot versions. */ + if (!init_gdt(cpu, curr)) { + /* failed to allocate something; not much we can do... */ + for (;;) + local_irq_enable(); + } + + _cpu_init(cpu, curr); +} + #ifdef CONFIG_HOTPLUG_CPU void __cpuinit cpu_uninit(void) { diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4bb8b77cd65..095636620fa 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -536,11 +537,11 @@ set_cpu_sibling_map(int cpu) static void __devinit start_secondary(void *unused) { /* - * Dont put anything before smp_callin(), SMP + * Don't put *anything* before secondary_cpu_init(), SMP * booting is too fragile that we want to limit the * things done here to the most necessary things. 
*/ - cpu_init(); + secondary_cpu_init(); preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) @@ -599,13 +600,16 @@ void __devinit initialize_secondary(void) "movl %0,%%esp\n\t" "jmp *%1" : - :"r" (current->thread.esp),"r" (current->thread.eip)); + :"m" (current->thread.esp),"m" (current->thread.eip)); } +/* Static state in head.S used to set up a CPU */ extern struct { void * esp; unsigned short ss; } stack_start; +extern struct i386_pda *start_pda; +extern struct Xgt_desc_struct cpu_gdt_descr; #ifdef CONFIG_NUMA @@ -936,9 +940,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu) unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; - ++cpucount; - alternatives_smp_switch(1); - /* * We can't use kernel_thread since we must avoid to * reschedule the child. @@ -946,15 +947,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu) idle = alloc_idle_task(cpu); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + + /* Pre-allocate and initialize the CPU's GDT and PDA so it + doesn't have to do any memory allocation during the + delicate CPU-bringup phase. */ + if (!init_gdt(cpu, idle)) { + printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); + return -1; /* ? */ + } + idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! 
*/ start_eip = setup_trampoline(); + ++cpucount; + alternatives_smp_switch(1); + /* So we see what's up */ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); /* Stack for startup_32 can be just as for start_secondary onwards */ stack_start.esp = (void *) idle->thread.esp; + start_pda = cpu_pda(cpu); + cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu); + irq_ctx_init(cpu); x86_cpu_to_apicid[cpu] = apicid; diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index f3fea2ad50f..55428e656a3 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -28,6 +28,7 @@ #include #include #include +#include /* TLB state -- visible externally, indexed physically */ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 }; @@ -422,6 +423,7 @@ find_smp_config(void) VOYAGER_SUS_IN_CONTROL_PORT); current_thread_info()->cpu = boot_cpu_id; + write_pda(cpu_number, boot_cpu_id); } /* @@ -458,7 +460,7 @@ start_secondary(void *unused) /* external functions not defined in the headers */ extern void calibrate_delay(void); - cpu_init(); + secondary_cpu_init(); /* OK, we're in the routine */ ack_CPI(VIC_CPU_BOOT_CPI); @@ -578,6 +580,15 @@ do_boot_cpu(__u8 cpu) /* init_tasks (in sched.c) is indexed logically */ stack_start.esp = (void *) idle->thread.esp; + /* Pre-allocate and initialize the CPU's GDT and PDA so it + doesn't have to do any memory allocation during the + delicate CPU-bringup phase. 
*/ + if (!init_gdt(cpu, idle)) { + printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); + cpucount--; + return; + } + irq_ctx_init(cpu); /* Note: Don't modify initial ss override */ @@ -1963,4 +1974,5 @@ void __init smp_setup_processor_id(void) { current_thread_info()->cpu = hard_smp_processor_id(); + write_pda(cpu_number, hard_smp_processor_id()); } diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index e0ddca94d50..a9f2041c7c8 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -727,4 +727,7 @@ extern unsigned long boot_option_idle_override; extern void enable_sep_cpu(void); extern int sysenter_setup(void); +extern int init_gdt(int cpu, struct task_struct *idle); +extern void secondary_cpu_init(void); + #endif /* __ASM_I386_PROCESSOR_H */ -- cgit v1.2.3-70-g09d2 From f95d47caae5302a63d92be9a0292abc90e2a14e1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Use %gs as the PDA base-segment in the kernel This patch is the meat of the PDA change. This patch makes several related changes: 1: Most significantly, %gs is now used in the kernel. This means that on entry, the old value of %gs is saved away, and it is reloaded with __KERNEL_PDA. 2: entry.S constructs the stack in the shape of struct pt_regs, and this is passed around the kernel so that the process's saved register state can be accessed. Unfortunately struct pt_regs doesn't currently have space for %gs (or %fs). This patch extends pt_regs to add space for gs (no space is allocated for %fs, since it won't be used, and it would just complicate the code in entry.S to work around the space). 3: Because %gs is now saved on the stack like %ds, %es and the integer registers, there are a number of places where it no longer needs to be handled specially; namely context switch, and saving/restoring the register state in a signal context. 
4: And since kernel threads run in kernel space and call normal kernel code, they need to be created with their %gs == __KERNEL_PDA. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 1 + arch/i386/kernel/cpu/common.c | 21 +++++++++++-- arch/i386/kernel/entry.S | 70 +++++++++++++++++++++++++++++------------- arch/i386/kernel/head.S | 31 ++++++++++++++++--- arch/i386/kernel/process.c | 26 ++++++++-------- arch/i386/kernel/signal.c | 6 ++-- include/asm-i386/mmu_context.h | 4 +-- include/asm-i386/processor.h | 4 ++- include/asm-i386/ptrace.h | 2 ++ kernel/fork.c | 2 +- 10 files changed, 117 insertions(+), 50 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 70b19807acf..9620872d353 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -72,6 +72,7 @@ void foo(void) OFFSET(PT_EAX, pt_regs, eax); OFFSET(PT_DS, pt_regs, xds); OFFSET(PT_ES, pt_regs, xes); + OFFSET(PT_GS, pt_regs, xgs); OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); OFFSET(PT_EIP, pt_regs, eip); OFFSET(PT_CS, pt_regs, xcs); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 2534e25ed74..4e63d8ce602 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -593,6 +593,14 @@ void __init early_cpu_init(void) #endif } +/* Make sure %gs is initialized properly in idle threads */ +struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + regs->xgs = __KERNEL_PDA; + return regs; +} + __cpuinit int alloc_gdt(int cpu) { struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); @@ -644,6 +652,14 @@ struct i386_pda boot_pda = { ._pda = &boot_pda, }; +static inline void set_kernel_gs(void) +{ + /* Set %gs for this CPU's PDA. 
Memory clobber is to create a + barrier with respect to any PDA operations, so the compiler + doesn't move any before here. */ + asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); +} + /* Initialize the CPU's GDT and PDA. The boot CPU does this for itself, but secondaries find this done for them. */ __cpuinit int init_gdt(int cpu, struct task_struct *idle) @@ -693,6 +709,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) the boot CPU, this will transition from the boot gdt+pda to the real ones). */ load_gdt(cpu_gdt_descr); + set_kernel_gs(); if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -731,8 +748,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %fs and %gs. */ - asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); + /* Clear %fs. */ + asm volatile ("mov %0, %%fs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 0069bf01603..b99d4a16007 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -30,12 +30,13 @@ * 18(%esp) - %eax * 1C(%esp) - %ds * 20(%esp) - %es - * 24(%esp) - orig_eax - * 28(%esp) - %eip - * 2C(%esp) - %cs - * 30(%esp) - %eflags - * 34(%esp) - %oldesp - * 38(%esp) - %oldss + * 24(%esp) - %gs + * 28(%esp) - orig_eax + * 2C(%esp) - %eip + * 30(%esp) - %cs + * 34(%esp) - %eflags + * 38(%esp) - %oldesp + * 3C(%esp) - %oldss * * "current" is in register %ebx during any slow entries. 
*/ @@ -92,6 +93,9 @@ VM_MASK = 0x00020000 #define SAVE_ALL \ cld; \ + pushl %gs; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET gs, 0;*/\ pushl %es; \ CFI_ADJUST_CFA_OFFSET 4;\ /*CFI_REL_OFFSET es, 0;*/\ @@ -121,7 +125,9 @@ VM_MASK = 0x00020000 CFI_REL_OFFSET ebx, 0;\ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ - movl %edx, %es; + movl %edx, %es; \ + movl $(__KERNEL_PDA), %edx; \ + movl %edx, %gs #define RESTORE_INT_REGS \ popl %ebx; \ @@ -154,17 +160,22 @@ VM_MASK = 0x00020000 2: popl %es; \ CFI_ADJUST_CFA_OFFSET -4;\ /*CFI_RESTORE es;*/\ -.section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ +3: popl %gs; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE gs;*/\ +.pushsection .fixup,"ax"; \ 4: movl $0,(%esp); \ + jmp 1b; \ +5: movl $0,(%esp); \ jmp 2b; \ -.previous; \ +6: movl $0,(%esp); \ + jmp 3b; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ -.previous + .long 1b,4b; \ + .long 2b,5b; \ + .long 3b,6b; \ +.popsection #define RING0_INT_FRAME \ CFI_STARTPROC simple;\ @@ -231,6 +242,7 @@ check_userspace: andl $(VM_MASK | SEGMENT_RPL_MASK), %eax cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace + ENTRY(resume_userspace) DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -327,9 +339,16 @@ sysenter_past_esp: movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON +1: mov PT_GS(%esp), %gs ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC - +.pushsection .fixup,"ax" +2: movl $0,PT_GS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection # system call handler stub ENTRY(system_call) @@ -375,7 +394,7 @@ restore_nocheck: TRACE_IRQS_IRET restore_nocheck_notrace: RESTORE_REGS - addl $4, %esp + addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 1: INTERRUPT_RETURN .section .fixup,"ax" @@ -588,6 +607,10 @@ KPROBE_ENTRY(page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: + /* the function address is in %gs's slot on the stack */ + pushl %es + 
CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ pushl %ds CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ds, 0*/ @@ -613,18 +636,20 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %es + pushl %gs CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ + /*CFI_REL_OFFSET gs, 0*/ + movl $(__KERNEL_PDA), %ecx + movl %ecx, %gs UNWIND_ESPFIX_STACK popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl PT_ES(%esp), %edi # get the function address + movl PT_GS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) - movl %ecx, PT_ES(%esp) - /*CFI_REL_OFFSET es, ES*/ + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_GS(%esp) + /*CFI_REL_OFFSET gs, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es @@ -936,6 +961,7 @@ ENTRY(arch_unwind_init_running) movl %ebx, PT_EAX(%edx) movl $__USER_DS, PT_DS(%edx) movl $__USER_DS, PT_ES(%edx) + movl $0, PT_GS(%edx) movl %ebx, PT_ORIG_EAX(%edx) movl %ecx, PT_EIP(%edx) movl 12(%esp), %ecx diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 4a83384c5a6..5b14e95ac8b 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -302,6 +302,7 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 + call setup_pda lgdt cpu_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f @@ -312,10 +313,13 @@ is386: movl $2,%ecx # set MP movl %eax,%ds movl %eax,%es - xorl %eax,%eax # Clear FS/GS and LDT + xorl %eax,%eax # Clear FS and LDT movl %eax,%fs - movl %eax,%gs lldt %ax + + movl $(__KERNEL_PDA),%eax + mov %eax,%gs + cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder #ifdef CONFIG_SMP @@ -345,6 +349,23 @@ check_x87: .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ ret +/* + * Point the GDT at this CPU's PDA. On boot this will be + * cpu_gdt_table and boot_pda; for secondary CPUs, these will be + * that CPU's GDT and PDA. 
+ */ +setup_pda: + /* get the PDA pointer */ + movl start_pda, %eax + + /* slot the PDA address into the GDT */ + mov cpu_gdt_descr+2, %ecx + mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ + shr $16, %eax + mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ + mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ + ret + /* * setup_idt * @@ -484,6 +505,8 @@ ENTRY(empty_zero_page) * This starts the data section. */ .data +ENTRY(start_pda) + .long boot_pda ENTRY(stack_start) .long init_thread_union+THREAD_SIZE @@ -525,7 +548,7 @@ idt_descr: # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -cpu_gdt_descr: +ENTRY(cpu_gdt_descr) .word GDT_ENTRIES*8-1 .long cpu_gdt_table @@ -585,7 +608,7 @@ ENTRY(cpu_gdt_table) .quad 0x004092000000ffff /* 0xc8 APM DS data */ .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x0000000000000000 /* 0xd8 - PDA */ + .quad 0x00cf92000000ffff /* 0xd8 - PDA */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index ae924c416b6..905364d4284 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -56,6 +56,7 @@ #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -346,6 +347,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; + regs.xgs = __KERNEL_PDA; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -431,7 +433,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, p->thread.eip = (unsigned long) ret_from_fork; savesegment(fs,p->thread.fs); - savesegment(gs,p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -659,16 +660,16 @@ struct task_struct fastcall * __switch_to(struct 
task_struct *prev_p, struct tas load_esp0(tss, next); /* - * Save away %fs and %gs. No need to save %es and %ds, as - * those are always kernel segments while inside the kernel. - * Doing this before setting the new TLS descriptors avoids - * the situation where we temporarily have non-reloadable - * segments in %fs and %gs. This could be an issue if the - * NMI handler ever used %fs or %gs (it does not today), or - * if the kernel is running inside of a hypervisor layer. + * Save away %fs. No need to save %gs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. Doing this + * before setting the new TLS descriptors avoids the situation + * where we temporarily have non-reloadable segments in %fs + * and %gs. This could be an issue if the NMI handler ever + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. */ savesegment(fs, prev->fs); - savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -676,16 +677,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_TLS(next, cpu); /* - * Restore %fs and %gs if needed. + * Restore %fs if needed. * - * Glibc normally makes %fs be zero, and %gs is one of - * the TLS segments. + * Glibc normally makes %fs be zero. */ if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); - if (prev->gs | next->gs) - loadsegment(gs, next->gs); /* * Restore IOPL if needed. 
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 43002cfb40c..65d7620eaa0 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) - GET_SEG(gs); + COPY_SEG(gs); GET_SEG(fs); COPY_SEG(es); COPY_SEG(ds); @@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - tmp = 0; - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); savesegment(fs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->fs); diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h index 1b1495372c4..68ff102d6f5 100644 --- a/include/asm-i386/mmu_context.h +++ b/include/asm-i386/mmu_context.h @@ -62,8 +62,8 @@ static inline void switch_mm(struct mm_struct *prev, #endif } -#define deactivate_mm(tsk, mm) \ - asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) +#define deactivate_mm(tsk, mm) \ + asm("movl %0,%%fs": :"r" (0)); #define activate_mm(prev, next) \ switch_mm((prev),(next),NULL) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index a9f2041c7c8..f73cf836e64 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -473,6 +473,7 @@ struct thread_struct { .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ + .gs = __KERNEL_PDA, \ } /* @@ -500,7 +501,8 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa } #define start_thread(regs, new_eip, new_esp) do { \ - __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ + __asm__("movl %0,%%fs": :"r" (0)); \ + regs->xgs = 0; \ set_fs(USER_DS); \ regs->xds = __USER_DS; \ regs->xes = __USER_DS; \ diff --git a/include/asm-i386/ptrace.h 
b/include/asm-i386/ptrace.h index d505f501077..bdbc894339b 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -16,6 +16,8 @@ struct pt_regs { long eax; int xds; int xes; + /* int xfs; */ + int xgs; long orig_eax; long eip; int xcs; diff --git a/kernel/fork.c b/kernel/fork.c index 8cdd3e72ba5..fd22245e388 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1303,7 +1303,7 @@ fork_out: return ERR_PTR(retval); } -struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; -- cgit v1.2.3-70-g09d2 From 66e10a44d724f1464b5e8b5a3eae1e2cbbc2cca6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Fix places where using %gs changes the usermode ABI There are a few places where the change in struct pt_regs and the use of %gs affect the userspace ABI. These are primarily debugging interfaces where thread state can be inspected or extracted. 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/process.c | 6 +++--- arch/i386/kernel/ptrace.c | 18 ++++++------------ include/asm-i386/elf.h | 2 +- include/asm-i386/unwind.h | 1 + 4 files changed, 11 insertions(+), 16 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 905364d4284..dc427254517 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -315,8 +315,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); + printk(" DS: %04x ES: %04x GS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -509,7 +509,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->regs.ds = regs->xds; dump->regs.es = regs->xes; savesegment(fs,dump->regs.fs); - savesegment(gs,dump->regs.gs); + dump->regs.gs = regs->xgs; dump->regs.orig_eax = regs->orig_eax; dump->regs.eip = regs->eip; dump->regs.cs = regs->xcs; diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 775f50e9395..f3f94ac5736 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -94,13 +94,9 @@ static int putreg(struct task_struct *child, return -EIO; child->thread.fs = value; return 0; - case GS: - if (value && (value & 3) != 3) - return -EIO; - child->thread.gs = value; - return 0; case DS: case ES: + case GS: if (value && (value & 3) != 3) return -EIO; value &= 0xffff; @@ -116,8 +112,8 @@ static int putreg(struct task_struct *child, value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; break; } - if (regno > GS*4) - regno -= 2*4; + if (regno > ES*4) + regno -= 1*4; 
put_stack_long(child, regno - sizeof(struct pt_regs), value); return 0; } @@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child, case FS: retval = child->thread.fs; break; - case GS: - retval = child->thread.gs; - break; case DS: case ES: + case GS: case SS: case CS: retval = 0xffff; /* fall through */ default: - if (regno > GS*4) - regno -= 2*4; + if (regno > ES*4) + regno -= 1*4; regno = regno - sizeof(struct pt_regs); retval &= get_stack_long(child, regno); } diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index 3a05436f31c..45d21a0c95b 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -91,7 +91,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; pr_reg[7] = regs->xds; \ pr_reg[8] = regs->xes; \ savesegment(fs,pr_reg[9]); \ - savesegment(gs,pr_reg[10]); \ + pr_reg[10] = regs->xgs; \ pr_reg[11] = regs->orig_eax; \ pr_reg[12] = regs->eip; \ pr_reg[13] = regs->xcs; \ diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index 5031d693b89..601fc67bd77 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -71,6 +71,7 @@ static inline void arch_unw_init_blocked(struct unwind_frame_info *info) info->regs.xss = __KERNEL_DS; info->regs.xds = __USER_DS; info->regs.xes = __USER_DS; + info->regs.xgs = __KERNEL_PDA; } extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *, -- cgit v1.2.3-70-g09d2 From 49d26b6eaa8e970c8cf6e299e6ccba2474191bf5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Update sys_vm86 to cope with changed pt_regs and %gs usage sys_vm86 uses a struct kernel_vm86_regs, which is identical to pt_regs, but adds an extra space for all the segment registers. Previously this structure was completely independent, so changes in pt_regs had to be reflected in kernel_vm86_regs. 
This change just embeds pt_regs in kernel_vm86_regs, and makes the appropriate changes to vm86.c to deal with the new naming. Also, since %gs is dealt with differently in the kernel, this change adjusts vm86.c to reflect this. While making these changes, I also cleaned up some frankly bizarre code which was added when auditing was added to sys_vm86. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Cc: Al Viro Cc: Jason Baron Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/kernel/vm86.c | 121 +++++++++++++++++++++++++++++------------------- include/asm-i386/vm86.h | 17 +------ 2 files changed, 76 insertions(+), 62 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index cbcd61d6120..be2f96e67f7 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -72,10 +73,10 @@ /* * 8- and 16-bit register defines.. 
*/ -#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) -#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) -#define IP(regs) (*(unsigned short *)&((regs)->eip)) -#define SP(regs) (*(unsigned short *)&((regs)->esp)) +#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) /* * virtual flags (16 and 32-bit versions) @@ -89,10 +90,37 @@ #define SAFE_MASK (0xDD5) #define RETURN_MASK (0xDFF) -#define VM86_REGS_PART2 orig_eax -#define VM86_REGS_SIZE1 \ - ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) -#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) +/* convert kernel_vm86_regs to vm86_regs */ +static int copy_vm86_regs_to_user(struct vm86_regs __user *user, + const struct kernel_vm86_regs *regs) +{ + int ret = 0; + + /* kernel_vm86_regs is missing xfs, so copy everything up to + (but not including) xgs, and then rest after xgs. 
*/ + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); + ret += copy_to_user(&user->__null_gs, &regs->pt.xgs, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.xgs)); + + return ret; +} + +/* convert vm86_regs to kernel_vm86_regs */ +static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, + const struct vm86_regs __user *user, + unsigned extra) +{ + int ret = 0; + + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); + ret += copy_from_user(&regs->pt.xgs, &user->__null_gs, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.xgs) + + extra); + + return ret; +} struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) @@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); - tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); - tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2, - &regs->VM86_REGS_PART2, VM86_REGS_SIZE2); + set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs); tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); if (tmp) { printk("vm86: could not access userspace vm86_info\n"); @@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) current->thread.saved_esp0 = 0; put_cpu(); - loadsegment(fs, current->thread.saved_fs); - loadsegment(gs, current->thread.saved_gs); ret = KVM86->regs32; + + loadsegment(fs, current->thread.saved_fs); + ret->xgs = current->thread.saved_gs; + return ret; } @@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs) tsk = current; if (tsk->thread.saved_esp0) goto out; - tmp = 
copy_from_user(&info, v86, VM86_REGS_SIZE1); - tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, - (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, vm86plus) - + sizeof(info.regs)); ret = -EFAULT; if (tmp) goto out; @@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs) if (tsk->thread.saved_esp0) goto out; v86 = (struct vm86plus_struct __user *)regs.ecx; - tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); - tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, - (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, regs32) - + sizeof(info.regs)); ret = -EFAULT; if (tmp) goto out; @@ -252,15 +280,15 @@ out: static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { struct tss_struct *tss; - long eax; /* * make sure the vm86() system call doesn't try to do anything silly */ - info->regs.__null_ds = 0; - info->regs.__null_es = 0; + info->regs.pt.xds = 0; + info->regs.pt.xes = 0; + info->regs.pt.xgs = 0; -/* we are clearing fs,gs later just before "jmp resume_userspace", - * because starting with Linux 2.1.x they aren't no longer saved/restored +/* we are clearing fs later just before "jmp resume_userspace", + * because it is not saved/restored. */ /* @@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. 
*/ - VEFLAGS = info->regs.eflags; - info->regs.eflags &= SAFE_MASK; - info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; - info->regs.eflags |= VM_MASK; + VEFLAGS = info->regs.pt.eflags; + info->regs.pt.eflags &= SAFE_MASK; + info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; + info->regs.pt.eflags |= VM_MASK; switch (info->cpu_type) { case CPU_286: @@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; savesegment(fs, tsk->thread.saved_fs); - savesegment(gs, tsk->thread.saved_gs); + tsk->thread.saved_gs = info->regs32->xgs; tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t"); - __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax)); /*call audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(eax), eax); + audit_syscall_exit(AUDITSC_RESULT(0), 0); __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" + "mov %2, %%fs\n\t" "jmp resume_userspace" : /* no outputs */ - :"r" (&info->regs), "r" (task_thread_info(tsk))); + :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); /* we never return here */ } @@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs) static inline void clear_TF(struct kernel_vm86_regs * regs) { - regs->eflags &= ~TF_MASK; + regs->pt.eflags &= ~TF_MASK; } static inline void clear_AC(struct kernel_vm86_regs * regs) { - regs->eflags &= ~AC_MASK; + regs->pt.eflags &= ~AC_MASK; } /* It is correct to call set_IF(regs) from the set_vflags_* @@ -370,7 +397,7 @@ static inline void 
clear_AC(struct kernel_vm86_regs * regs) static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) { set_flags(VEFLAGS, eflags, current->thread.v86mask); - set_flags(regs->eflags, eflags, SAFE_MASK); + set_flags(regs->pt.eflags, eflags, SAFE_MASK); if (eflags & IF_MASK) set_IF(regs); else @@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { set_flags(VFLAGS, flags, current->thread.v86mask); - set_flags(regs->eflags, flags, SAFE_MASK); + set_flags(regs->pt.eflags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); else @@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) { - unsigned long flags = regs->eflags & RETURN_MASK; + unsigned long flags = regs->pt.eflags & RETURN_MASK; if (VEFLAGS & VIF_MASK) flags |= IF_MASK; @@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, unsigned long __user *intr_ptr; unsigned long segoffs; - if (regs->cs == BIOSSEG) + if (regs->pt.xcs == BIOSSEG) goto cannot_handle; if (is_revectored(i, &KVM86->int_revectored)) goto cannot_handle; @@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i, if ((segoffs >> 16) == BIOSSEG) goto cannot_handle; pushw(ssp, sp, get_vflags(regs), cannot_handle); - pushw(ssp, sp, regs->cs, cannot_handle); + pushw(ssp, sp, regs->pt.xcs, cannot_handle); pushw(ssp, sp, IP(regs), cannot_handle); - regs->cs = segoffs >> 16; + regs->pt.xcs = segoffs >> 16; SP(regs) -= 6; IP(regs) = segoffs & 0xffff; clear_TF(regs); @@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno if (VMPI.is_vm86pus) { if ( (trapno==3) || (trapno==1) ) return_to_32bit(regs, VM86_TRAP + (trapno << 8)); - do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), 
SP(regs)); + do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); return 0; } if (trapno !=1) @@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) handle_vm86_trap(regs, 0, 1); \ return; } while (0) - orig_flags = *(unsigned short *)&regs->eflags; + orig_flags = *(unsigned short *)&regs->pt.eflags; - csp = (unsigned char __user *) (regs->cs << 4); - ssp = (unsigned char __user *) (regs->ss << 4); + csp = (unsigned char __user *) (regs->pt.xcs << 4); + ssp = (unsigned char __user *) (regs->pt.xss << 4); sp = SP(regs); ip = IP(regs); @@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) SP(regs) += 6; } IP(regs) = newip; - regs->cs = newcs; + regs->pt.xcs = newcs; CHECK_IF_IN_TRAP; if (data32) { set_vflags_long(newflags, regs); diff --git a/include/asm-i386/vm86.h b/include/asm-i386/vm86.h index 952fd695738..a5edf517b99 100644 --- a/include/asm-i386/vm86.h +++ b/include/asm-i386/vm86.h @@ -145,26 +145,13 @@ struct vm86plus_struct { * at the end of the structure. Look at ptrace.h to see the "normal" * setup. For user space layout see 'struct vm86_regs' above. */ +#include struct kernel_vm86_regs { /* * normal regs, with special meaning for the segment descriptors.. */ - long ebx; - long ecx; - long edx; - long esi; - long edi; - long ebp; - long eax; - long __null_ds; - long __null_es; - long orig_eax; - long eip; - unsigned short cs, __csh; - long eflags; - long esp; - unsigned short ss, __ssh; + struct pt_regs pt; /* * these are specific to v86 mode: */ -- cgit v1.2.3-70-g09d2 From b2938f880890ebfcccad356275e0000193153623 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Implement smp_processor_id() with the PDA Use the cpu_number in the PDA to implement raw_smp_processor_id. 
This is a little simpler than using thread_info, though the cpu field in thread_info cannot be removed since it is used for things other than getting the current CPU in common code. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 4 +++- arch/i386/kernel/cpu/common.c | 2 ++ arch/i386/kernel/entry.S | 3 +-- include/asm-i386/pda.h | 2 ++ include/asm-i386/smp.h | 3 ++- 5 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 9620872d353..85f1b038e9c 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -51,7 +51,6 @@ void foo(void) OFFSET(TI_exec_domain, thread_info, exec_domain); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); - OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); @@ -97,4 +96,7 @@ void foo(void) DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(PDA_cpu, i386_pda, cpu_number); } diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 4e63d8ce602..e476202b887 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -650,6 +650,7 @@ __cpuinit int alloc_gdt(int cpu) /* Initial PDA used by boot CPU */ struct i386_pda boot_pda = { ._pda = &boot_pda, + .cpu_number = 0, }; static inline void set_kernel_gs(void) @@ -694,6 +695,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle) memset(pda, 0, sizeof(*pda)); pda->_pda = pda; + pda->cpu_number = cpu; return 1; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index b99d4a16007..d7423efaeea 100644 --- 
a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -524,8 +524,7 @@ syscall_badsys: #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - GET_THREAD_INFO(%ebp); \ - movl TI_cpu(%ebp), %ebx; \ + movl %gs:PDA_cpu, %ebx; \ PER_CPU(cpu_gdt_descr, %ebx); \ movl GDS_address(%ebx), %ebx; \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h index 4c39ccb1305..f90fde22566 100644 --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -11,6 +11,8 @@ struct i386_pda { struct i386_pda *_pda; /* pointer to self */ + + int cpu_number; }; extern struct i386_pda *_cpu_pda[]; diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index bd59c1508e7..64fe624c02c 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -8,6 +8,7 @@ #include #include #include +#include #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -56,7 +57,7 @@ extern void cpu_uninit(void); * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (current_thread_info()->cpu) +#define raw_smp_processor_id() (read_pda(cpu_number)) extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callin_map; -- cgit v1.2.3-70-g09d2 From ec7fcaabbfb3c5bd5189f857b6ac7bb9745ef291 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Implement "current" with the PDA Use the pcurrent field in the PDA to implement the "current" macro. This ends up compiling down to a single instruction to get the current task. 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 2 ++ arch/i386/kernel/cpu/common.c | 2 ++ arch/i386/kernel/process.c | 1 + include/asm-i386/current.h | 7 ++++--- include/asm-i386/pda.h | 2 ++ 5 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 85f1b038e9c..0666eb0ed7b 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -15,6 +15,7 @@ #include #include #include +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -99,4 +100,5 @@ void foo(void) BLANK(); OFFSET(PDA_cpu, i386_pda, cpu_number); + OFFSET(PDA_pcurrent, i386_pda, pcurrent); } diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index e476202b887..6958ae5e2fa 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -651,6 +651,7 @@ __cpuinit int alloc_gdt(int cpu) struct i386_pda boot_pda = { ._pda = &boot_pda, .cpu_number = 0, + .pcurrent = &init_task, }; static inline void set_kernel_gs(void) @@ -696,6 +697,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle) memset(pda, 0, sizeof(*pda)); pda->_pda = pda; pda->cpu_number = cpu; + pda->pcurrent = idle; return 1; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index dc427254517..8749b10d380 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -684,6 +684,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); + write_pda(pcurrent, next_p); /* * Restore IOPL if needed. 
diff --git a/include/asm-i386/current.h b/include/asm-i386/current.h index 3cbbecd7901..5252ee0f6d7 100644 --- a/include/asm-i386/current.h +++ b/include/asm-i386/current.h @@ -1,13 +1,14 @@ #ifndef _I386_CURRENT_H #define _I386_CURRENT_H -#include +#include +#include struct task_struct; -static __always_inline struct task_struct * get_current(void) +static __always_inline struct task_struct *get_current(void) { - return current_thread_info()->task; + return read_pda(pcurrent); } #define current get_current() diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h index f90fde22566..08a35c478af 100644 --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -7,12 +7,14 @@ #define _I386_PDA_H #include +#include struct i386_pda { struct i386_pda *_pda; /* pointer to self */ int cpu_number; + struct task_struct *pcurrent; /* current process */ }; extern struct i386_pda *_cpu_pda[]; -- cgit v1.2.3-70-g09d2 From 72690a21188586022a9e65cb6f1cc8845167555a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] x86: Don't use nested idle loops Currently the idle loop has two nested loops -- one high level in cpu_idle and in some low level idle functions another one. Looping in the low level idle functions breaks the idle notifiers because interrupts waking up sleep states need to execute exit_idle() which is only in cpu_idle(). So don't do that, only loop in cpu_idle(). This only removes code. In some cases e.g. poll_idle the idle loop is a little longer now because cpu_idle checks more things. I hope that isn't a problem. ACPI idle doesn't change behaviour because it never looped anyways. 
Cc: len.brown@intel.com Cc: eranian@hpl.hp.com Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 30 +++++++++--------------------- arch/x86_64/kernel/process.c | 30 +++++++++--------------------- 2 files changed, 18 insertions(+), 42 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 8749b10d380..8f42659ef9d 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -100,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt); */ void default_idle(void) { - local_irq_enable(); - if (!hlt_counter && boot_cpu_data.hlt_works_ok) { current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); - } + local_irq_disable(); + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } else { - while (!need_resched()) - cpu_relax(); + /* loop is done by the caller */ + cpu_relax(); } } #ifdef CONFIG_APM_MODULE @@ -129,14 +125,7 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { - local_irq_enable(); - - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + cpu_relax(); } #ifdef CONFIG_HOTPLUG_CPU @@ -257,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) - mwait_idle_with_hints(0, 0); + mwait_idle_with_hints(0, 0); } void __devinit select_idle_routine(const struct cpuinfo_x86 *c) diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 7451a4c43c1..0b7b4caa4f7 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -108,17 +108,15 @@ void exit_idle(void) */ static void default_idle(void) { - local_irq_enable(); - current_thread_info()->status &= 
~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); - } + local_irq_disable(); + if (!need_resched()) { + /* Enables interrupts one instruction before HLT. + x86 special cases this so there is no race. */ + safe_halt(); + } else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -129,16 +127,7 @@ static void default_idle(void) */ static void poll_idle (void) { - local_irq_enable(); - - asm volatile( - "2:" - "testl %0,%1;" - "rep; nop;" - "je 2b;" - : : - "i" (_TIF_NEED_RESCHED), - "m" (current_thread_info()->flags)); + cpu_relax(); } void cpu_idle_wait(void) @@ -257,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) - mwait_idle_with_hints(0,0); + mwait_idle_with_hints(0,0); } void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -- cgit v1.2.3-70-g09d2 From 9c5f8be4625e73f17e28fea89399ed871a30e064 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] x86: Mention PCI instead of RAM in NMI parity error message On modern systems RAM errors don't cause NMIs, but it's usually caused by PCI SERR. Mention PCI instead of RAM in the printk. Reported by r_hayashi@ctc-g.co.jp (Ryutaro Hayashi) Cc: r_hayashi@ctc-g.co.jp Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 3 +-- arch/x86_64/kernel/traps.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 4a6fa2837df..237f4884a1e 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -708,8 +708,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. 
NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); - printk(KERN_EMERG "You probably have a hardware problem with your RAM " - "chips\n"); + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index e37b4d77d5a..70bfaab9822 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -793,8 +793,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk(KERN_EMERG "You probably have a hardware problem with your " - "RAM chips\n"); + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); -- cgit v1.2.3-70-g09d2 From 6569580de7ae367def89b7671029cb97c1965574 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Distinguish absolute symbols Ld knows about 2 kinds of symbols, absolute and section relative. Section relative symbols change value when a section is moved and absolute symbols do not. Currently in the linker script we have several labels marking the beginning and ending of sections that are outside of sections, making them absolute symbols. Having a mixture of absolute and section relative symbols referring to the same data is currently harmless but it is confusing. This must be done carefully as newer revs of ld do not place symbols that appear in sections without data and instead ld makes those symbols global :( My ultimate goal is to build a relocatable kernel. The safest and least intrusive technique is to generate relocation entries so the kernel can be relocated at load time. The only penalty would be an increase in the size of the kernel binary. 
The problem is that if absolute and relocatable symbols are not properly specified absolute symbols will be relocated or section relative symbols won't be, which is fatal. The practical motivation is that when generating kernels that will run from a reserved area for analyzing what caused a kernel panic, it is simpler if you don't need to hard code the physical memory location they will run at, especially for the distributions. [AK: and merged:] o Also put a message so that in future people can be aware of it and avoid introducing absolute symbols. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 113 +++++++++++++++++++++----------------- include/asm-generic/vmlinux.lds.h | 10 ++-- 2 files changed, 68 insertions(+), 55 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index c6f84a0322b..cbd24860fbb 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -1,5 +1,11 @@ /* ld script to make i386 Linux kernel * Written by Martin Mares ; + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. */ #define LOAD_OFFSET __PAGE_OFFSET @@ -24,31 +30,32 @@ SECTIONS . = __KERNEL_START; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ - _text = .; /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ *(.text) SCHED_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) *(.gnu.warning) - } :text = 0x9090 - - _etext = .; /* End of text section */ + _etext = .; /* End of text section */ + } :text = 0x9090 . 
= ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } RODATA . = ALIGN(4); - __tracedata_start = .; .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + __tracedata_start = .; *(.tracedata) + __tracedata_end = .; } - __tracedata_end = .; /* writeable */ . = ALIGN(4096); @@ -58,10 +65,12 @@ SECTIONS } :data . = ALIGN(4096); - __nosave_begin = .; - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } - . = ALIGN(4096); - __nosave_end = .; + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(4096); + __nosave_end = .; + } . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { @@ -75,8 +84,10 @@ SECTIONS /* rarely changed data like cpu maps */ . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } - _edata = .; /* End of data section */ + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + _edata = .; /* End of data section */ + } #ifdef CONFIG_STACK_UNWIND . = ALIGN(4); @@ -94,54 +105,56 @@ SECTIONS /* might get freed after init */ . = ALIGN(4096); - __smp_alt_begin = .; - __smp_alt_instructions = .; .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { + __smp_alt_begin = .; + __smp_alt_instructions = .; *(.smp_altinstructions) + __smp_alt_instructions_end = .; } - __smp_alt_instructions_end = .; . = ALIGN(4); - __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; *(.smp_locks) + __smp_locks_end = .; } - __smp_locks_end = .; .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { *(.smp_altinstr_replacement) + __smp_alt_end = .; } . 
= ALIGN(4096); - __smp_alt_end = .; /* will be freed after init */ . = ALIGN(4096); /* Init code and data */ - __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; _sinittext = .; *(.init.text) _einittext = .; } .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } . = ALIGN(16); - __setup_start = .; - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } - __setup_end = .; - __initcall_start = .; + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; INITCALLS + __initcall_end = .; } - __initcall_end = .; - __con_initcall_start = .; .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; *(.con_initcall.init) + __con_initcall_end = .; } - __con_initcall_end = .; SECURITY_INIT . = ALIGN(4); - __alt_instructions = .; .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; *(.altinstructions) + __alt_instructions_end = .; } - __alt_instructions_end = .; .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } @@ -150,32 +163,32 @@ SECTIONS .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . = ALIGN(4096); - __initramfs_start = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } - __initramfs_end = .; + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } . = ALIGN(L1_CACHE_BYTES); - __per_cpu_start = .; - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } - __per_cpu_end = .; + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { + __per_cpu_start = .; + *(.data.percpu) + __per_cpu_end = .; + } . 
= ALIGN(4096); - __init_end = .; /* freed after init ends here */ - __bss_start = .; /* BSS */ - .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) { - *(.bss.page_aligned) - } .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __init_end = .; + __bss_start = .; /* BSS */ + *(.bss.page_aligned) *(.bss) + . = ALIGN(4); + __bss_stop = .; + _end = . ; + /* This is where the kernel creates the early boot page tables */ + . = ALIGN(4096); + pg0 = . ; } - . = ALIGN(4); - __bss_stop = .; - - _end = . ; - - /* This is where the kernel creates the early boot page tables */ - . = ALIGN(4096); - pg0 = .; /* Sections to be discarded */ /DISCARD/ : { diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index e60d6f21fa6..9f4747780da 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -11,8 +11,8 @@ #define RODATA \ . = ALIGN(4096); \ - __start_rodata = .; \ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ *(__vermagic) /* Kernel version magic */ \ } \ @@ -119,17 +119,17 @@ *(__ksymtab_strings) \ } \ \ + /* Unwind data binary search table */ \ + EH_FRAME_HDR \ + \ /* Built-in module parameters. */ \ __param : AT(ADDR(__param) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___param) = .; \ *(__param) \ VMLINUX_SYMBOL(__stop___param) = .; \ + VMLINUX_SYMBOL(__end_rodata) = .; \ } \ \ - /* Unwind data binary search table */ \ - EH_FRAME_HDR \ - \ - __end_rodata = .; \ . 
= ALIGN(4096); #define SECURITY_INIT \ -- cgit v1.2.3-70-g09d2 From 6ed018845f1172cdc94f8a20ad807df901c6b7eb Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Add comment for align to vmlinux.lds Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index cbd24860fbb..c217e18f108 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -121,6 +121,12 @@ SECTIONS *(.smp_altinstr_replacement) __smp_alt_end = .; } + /* will be freed after init + * Following ALIGN() is required to make sure no other data falls on the + * same page where __smp_alt_end is pointing as that page might be freed + * after boot. Always make sure that ALIGN() directive is present after + * the section which contains __smp_alt_end. + */ . = ALIGN(4096); /* will be freed after init */ -- cgit v1.2.3-70-g09d2 From 8621b81c744ff8880a1efe095a4dcd09763ddb5a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Reserve kernel memory starting from _text Currently when we are reserving the memory the kernel text resides in we start at __PHYSICAL_START which happens to be correct but not very obvious. In addition when we start relocating the kernel __PHYSICAL_START is the wrong value, as it is an absolute symbol that does not get relocated. By starting the reservation at __pa_symbol(_text) the code is clearer and will be correct when relocated. Signed-off-by: Eric W. 
Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 141041dde74..61539afbdf2 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1118,8 +1118,8 @@ void __init setup_bootmem_allocator(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); + reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); /* * reserve physical page 0 - it's a special BIOS page on many boxes, -- cgit v1.2.3-70-g09d2 From 2a43f3ede48ea3d5790b863b719a1e21c90a3697 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: CONFIG_PHYSICAL_START cleanup Defining __PHYSICAL_START and __KERNEL_START in asm-i386/page.h works but it triggers a full kernel rebuild for the silliest of reasons. This modifies the users to directly use CONFIG_PHYSICAL_START and linux/config.h which prevents the full rebuild problem, which makes the code much more maintainer and hopefully user friendly. Signed-off-by: Eric W. 
Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/boot/compressed/head.S | 7 +++---- arch/i386/boot/compressed/misc.c | 7 +++---- arch/i386/kernel/vmlinux.lds.S | 2 +- include/asm-i386/page.h | 3 --- 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index b5893e4ecd3..40a8de8270a 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -25,7 +25,6 @@ #include #include -#include .globl startup_32 @@ -75,7 +74,7 @@ startup_32: popl %esi # discard address popl %esi # real mode pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START /* * We come here, if we were loaded high. @@ -100,7 +99,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $__PHYSICAL_START,%edi + movl $CONFIG_PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine @@ -125,5 +124,5 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START move_routine_end: diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index b2ccd543410..20970ff4411 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -13,7 +13,6 @@ #include #include #include -#include /* * gzip declarations @@ -303,7 +302,7 @@ static void setup_normal_output_buffer(void) #else if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? 
RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ + output_data = (unsigned char *)CONFIG_PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -326,8 +325,8 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); + if ( (CONFIG_PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(CONFIG_PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index c217e18f108..f8d61ec4c8c 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -27,7 +27,7 @@ PHDRS { } SECTIONS { - . = __KERNEL_START; + . 
= LOAD_OFFSET + CONFIG_PHYSICAL_START; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 5a70501291d..2b69686107a 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -112,12 +112,9 @@ extern int page_is_ram(unsigned long pagenr); #ifdef __ASSEMBLY__ #define __PAGE_OFFSET CONFIG_PAGE_OFFSET -#define __PHYSICAL_START CONFIG_PHYSICAL_START #else #define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET) -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) #endif -#define __KERNEL_START (__PAGE_OFFSET + __PHYSICAL_START) #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) -- cgit v1.2.3-70-g09d2 From e69f202d0a1419219198566e1c22218a5c71a9a6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Implement CONFIG_PHYSICAL_ALIGN o Now CONFIG_PHYSICAL_START is being replaced with CONFIG_PHYSICAL_ALIGN. Hardcoding the kernel physical start value creates a problem in relocatable kernel context due to boot loader limitations. For ex, if somebody compiles a relocatable kernel to be run from address 4MB, but this kernel will run from location 1MB as grub loads the kernel at physical address 1MB. Kernel thinks that I am a relocatable kernel and I should run from the address I have been loaded at. So somebody wanting to run kernel from 4MB alignment location (for improved performance regions) can't do that. o Hence, Eric proposed that probably CONFIG_PHYSICAL_ALIGN will make more sense in relocatable kernel context. At run time kernel will move itself to a physical addr location which meets user specified alignment restrictions. 
Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 33 ++++++++++++++++++--------------- arch/i386/boot/compressed/head.S | 26 ++++++++++++++------------ arch/i386/boot/compressed/misc.c | 7 ++++--- arch/i386/kernel/vmlinux.lds.S | 3 ++- include/asm-i386/boot.h | 6 +++++- 5 files changed, 43 insertions(+), 32 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index d588ca874bb..fd2fa7a7ec5 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -785,23 +785,26 @@ config RELOCATABLE must live at a different physical address than the primary kernel. -config PHYSICAL_START - hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - - default "0x1000000" if CRASH_DUMP +config PHYSICAL_ALIGN + hex "Alignment value to which kernel should be aligned" default "0x100000" + range 0x2000 0x400000 help - This gives the physical address where the kernel is loaded. Normally - for regular kernels this value is 0x100000 (1MB). But in the case - of kexec on panic the fail safe kernel needs to run at a different - address than the panic-ed kernel. This option is used to set the load - address for kernels used to capture crash dump on being kexec'ed - after panic. The default value for crash dump kernels is - 0x1000000 (16MB). This can also be set based on the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel. Typically this parameter is set as - crashkernel=64M@16M. Please take a look at - Documentation/kdump/kdump.txt for more details about crash dumps. + This value puts the alignment restrictions on physical address + where kernel is loaded and run from. Kernel is compiled for an + address which meets above alignment restriction. + + If bootloader loads the kernel at a non-aligned address and + CONFIG_RELOCATABLE is set, kernel will move itself to nearest + address aligned to above value and run from there. 
+ + If bootloader loads the kernel at a non-aligned address and + CONFIG_RELOCATABLE is not set, kernel will ignore the run time + load address and decompress itself to the address it has been + compiled for and run from there. The address for which kernel is + compiled already meets above alignment restrictions. Hence the + end result is that kernel runs from a physical address meeting + above alignment restrictions. Don't change this unless you know what you are doing. diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index e4dd7a6b9b0..f395a4bb38b 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -26,6 +26,7 @@ #include #include #include +#include .section ".text.head" .globl startup_32 @@ -52,17 +53,17 @@ startup_32: 1: popl %ebp subl $1b, %ebp -/* Compute the delta between where we were compiled to run at - * and where the code will actually run at. +/* %ebp contains the address we are loaded at by the boot loader and %ebx + * contains the address where we should move the kernel image temporarily + * for safe in-place decompression. */ - /* Start with the delta to where the kernel will run at. If we are - * a relocatable kernel this is the delta to our load address otherwise - * this is the delta to CONFIG_PHYSICAL start. - */ + #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx + movl %ebp, %ebx + addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx + andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx #else - movl $(CONFIG_PHYSICAL_START - startup_32), %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif /* Replace the compressed data size with the uncompressed size */ @@ -94,9 +95,10 @@ startup_32: /* Compute the kernel start address. 
*/ #ifdef CONFIG_RELOCATABLE - leal startup_32(%ebp), %ebp + addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp + andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp #else - movl $CONFIG_PHYSICAL_START, %ebp + movl $LOAD_PHYSICAL_ADDR, %ebp #endif /* @@ -150,8 +152,8 @@ relocated: * and where it was actually loaded. */ movl %ebp, %ebx - subl $CONFIG_PHYSICAL_START, %ebx - + subl $LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ /* * Process relocations. */ diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 4eac24e95a1..dc153893155 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include #include #include +#include /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -360,12 +361,12 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end, insize = input_len; inptr = 0; - if (((u32)output - CONFIG_PHYSICAL_START) & 0x3fffff) - error("Destination address not 4M aligned"); + if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1)) + error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) error("Destination address too large"); #ifndef CONFIG_RELOCATABLE - if ((u32)output != CONFIG_PHYSICAL_START) + if ((u32)output != LOAD_PHYSICAL_ADDR) error("Wrong destination address"); #endif diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index f8d61ec4c8c..6860f20aa57 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -14,6 +14,7 @@ #include #include #include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -27,7 +28,7 @@ PHDRS { } SECTIONS { - . = LOAD_OFFSET + CONFIG_PHYSICAL_START; + . 
= LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { diff --git a/include/asm-i386/boot.h b/include/asm-i386/boot.h index 96b228e6e79..8ce79a6fa89 100644 --- a/include/asm-i386/boot.h +++ b/include/asm-i386/boot.h @@ -12,4 +12,8 @@ #define EXTENDED_VGA 0xfffe /* 80x50 mode */ #define ASK_VGA 0xfffd /* ask for it at bootup */ -#endif +/* Physical address where kernel should be loaded. */ +#define LOAD_PHYSICAL_ADDR ((0x100000 + CONFIG_PHYSICAL_ALIGN - 1) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) + +#endif /* _LINUX_BOOT_H */ -- cgit v1.2.3-70-g09d2 From 74b47a7844501445d41d704fe7c626f4b1819508 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Fix entry.S code with !CONFIG_VM86 The entry.S code at work_notifysig is surely wrong. It drops into unrelated code if the branch to work_notifysig_v86 is taken, and CONFIG_VM86=n. [PATCH] Make vm86 support optional tree 9b5daef5280800a0006343a17f63072658d91a1d pushed to git Jan 8, 2006, and first appears in 2.6.16 The 'fix' here is to also compile out the vm86 test & branch when CONFIG_VM86=n. 
Signed-off-by: Joe Korty Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d7423efaeea..0220bc8cbb4 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -457,6 +457,7 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests +#ifdef CONFIG_VM86 testl $VM_MASK, PT_EFLAGS(%esp) movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or @@ -467,17 +468,18 @@ work_notifysig: # deal with pending signals and ALIGN work_notifysig_v86: -#ifdef CONFIG_VM86 pushl %ecx # save ti_flags for do_notify_resume CFI_ADJUST_CFA_OFFSET 4 call save_v86_state # %eax contains pt_regs pointer popl %ecx CFI_ADJUST_CFA_OFFSET -4 movl %eax, %esp +#else + movl %esp, %eax +#endif xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig -#endif # perform syscall exit tracing ALIGN -- cgit v1.2.3-70-g09d2 From 770d132f03ac15b12919f1bac481f4beda13e094 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:05 +0100 Subject: [PATCH] i386: Retrieve CLFLUSH size from CPUID Also report it in /proc/cpuinfo similar to x86-64. 
Needed for followon patch Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/common.c | 3 +++ arch/i386/kernel/cpu/proc.c | 3 ++- include/asm-i386/processor.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 6958ae5e2fa..cda41aef79a 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -309,6 +309,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c) #else c->apicid = (ebx >> 24) & 0xFF; #endif + if (c->x86_capability[0] & (1<<19)) + c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; @@ -373,6 +375,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; + c->x86_clflush_size = 32; memset(&c->x86_capability, 0, sizeof c->x86_capability); if (!have_cpuid_p()) { diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 76aac088a32..6624d8583c4 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, " [%d]", i); } - seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); return 0; } diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index f73cf836e64..98fa73b7176 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -72,6 +72,7 @@ struct cpuinfo_x86 { #endif unsigned char x86_max_cores; /* cpuid returned max cores value */ unsigned char apicid; + unsigned short x86_clflush_size; #ifdef CONFIG_SMP unsigned char booted_cores; /* number of cores as seen by OS */ __u8 phys_proc_id; /* Physical processor id. 
*/ -- cgit v1.2.3-70-g09d2 From 11a4180c0b03e2ee0c948fd8430ee092dc1625b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Use probe_kernel_address instead of __get_user in fault paths Makes the intention of the code cleaner to read and avoids a potential deadlock on mmap_sem. Also change the types of the arguments to not include __user because they're really not user addresses. Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 24 +++++++++++++----------- arch/i386/mm/fault.c | 12 ++++++------ 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 237f4884a1e..7b2f9f02208 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -380,7 +380,7 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (in_kernel) { - u8 __user *eip; + u8 *eip; int code_bytes = 64; unsigned char c; @@ -389,18 +389,20 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); - eip = (u8 __user *)regs->eip - 43; - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + eip = (u8 *)regs->eip - 43; + if (eip < (u8 *)PAGE_OFFSET || + probe_kernel_address(eip, c)) { /* try starting at EIP */ - eip = (u8 __user *)regs->eip; + eip = (u8 *)regs->eip; code_bytes = 32; } for (i = 0; i < code_bytes; i++, eip++) { - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + if (eip < (u8 *)PAGE_OFFSET || + probe_kernel_address(eip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 __user *)regs->eip) + if (eip == (u8 *)regs->eip) printk("<%02x> ", c); else printk("%02x ", c); @@ -416,7 +418,7 @@ static void handle_BUG(struct pt_regs *regs) if (eip < PAGE_OFFSET) return; - if (probe_kernel_address((unsigned short __user *)eip, ud2)) + if (probe_kernel_address((unsigned short *)eip, ud2)) return; if (ud2 != 0x0b0f) return; @@ -429,11 +431,11 @@ static void handle_BUG(struct pt_regs 
*regs) char *file; char c; - if (probe_kernel_address((unsigned short __user *)(eip + 2), - line)) + if (probe_kernel_address((unsigned short *)(eip + 2), line)) break; - if (__get_user(file, (char * __user *)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + if (probe_kernel_address((char **)(eip + 4), file) || + (unsigned long)file < PAGE_OFFSET || + probe_kernel_address(file, c)) file = ""; printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 2581575786c..aaaa4d225f7 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -22,9 +22,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -167,7 +167,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs, static int __is_prefetch(struct pt_regs *regs, unsigned long addr) { unsigned long limit; - unsigned long instr = get_segment_eip (regs, &limit); + unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); int scan_more = 1; int prefetch = 0; int i; @@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr) unsigned char instr_hi; unsigned char instr_lo; - if (instr > limit) + if (instr > (unsigned char *)limit) break; - if (__get_user(opcode, (unsigned char __user *) instr)) + if (probe_kernel_address(instr, opcode)) break; instr_hi = opcode & 0xf0; @@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr) case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (instr > limit) + if (instr > (unsigned char *)limit) break; - if (__get_user(opcode, (unsigned char __user *) instr)) + if (probe_kernel_address(instr, opcode)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); -- cgit v1.2.3-70-g09d2 From 269c2d81ed66af7c09a1619ffe165f03e7470a5b Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: i386 create 
e820.c to handle standard io/mem resources This patch creates new file named e820.c to handle standard io/mem resources, moving request_standard_resources function from setup.c to e820.c. Also this patch modifies Makefile to compile file e820.c. Signed-off-by: bibo,mao Signed-off-by: Andi Kleen Makefile | 2 arch/i386/kernel/Makefile | 2 arch/i386/kernel/e820.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 276 ------------------------------------------- 3 files changed, 293 insertions(+), 274 deletions(-) --- arch/i386/kernel/Makefile | 2 +- arch/i386/kernel/e820.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 276 +------------------------------------------ 3 files changed, 293 insertions(+), 274 deletions(-) create mode 100644 arch/i386/kernel/e820.c (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 1a884b6e6e5..f614854bd71 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ - pci-dma.o i386_ksyms.o i387.o bootflag.o \ + pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\ quirks.o i8237.o topology.o alternative.o i8253.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c new file mode 100644 index 00000000000..cce70604948 --- /dev/null +++ b/arch/i386/kernel/e820.c @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_EFI +int efi_enabled = 0; +EXPORT_SYMBOL(efi_enabled); +#endif + +struct e820map e820; +struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +struct resource code_resource = { + .name = "Kernel code", 
+ .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end 
= 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; + +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static int __init romchecksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +static void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +/* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. 
+ */ +static void __init +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) +{ + int i; + + probe_roms(); + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; +#ifndef CONFIG_RESOURCES_64BIT + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; +#endif + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res)) { + kfree(res); + continue; + } + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ + request_resource(res, code_resource); + request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif + } + } +} + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). 
+ */ +static int __init request_standard_resources(void) +{ + int i; + + printk("Setting up standard PCI resources\n"); + if (efi_enabled) + efi_initialize_iomem_resources(&code_resource, &data_resource); + else + legacy_init_iomem_resources(&code_resource, &data_resource); + + /* EFI systems may still have VGA */ + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 61539afbdf2..acd2d9392ab 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -76,11 +76,9 @@ int disable_pse __devinitdata = 0; /* * Machine setup.. */ - -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif +extern struct e820map e820; +extern struct resource code_resource; +extern struct resource data_resource; /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -134,7 +132,6 @@ struct ist_info ist_info; defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) EXPORT_SYMBOL(ist_info); #endif -struct e820map e820; extern void early_cpu_init(void); extern int root_mountflags; @@ -149,203 +146,6 @@ static char command_line[COMMAND_LINE_SIZE]; unsigned char __initdata boot_params[PARAM_SIZE]; -static struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct 
resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO 
-}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) 
*/ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - static void __init limit_regions(unsigned long long size) { unsigned long long current_addr = 0; @@ -1200,77 +1000,7 @@ void __init remapped_pgdat_init(void) } } -/* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. 
- */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) -{ - int i; - - probe_roms(); - for (i = 0; i < e820.nr_map; i++) { - struct resource *res; -#ifndef CONFIG_RESOURCES_64BIT - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; -#endif - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res)) { - kfree(res); - continue; - } - if (e820.map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ - request_resource(res, code_resource); - request_resource(res, data_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif - } - } -} - -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). 
- */ -static int __init request_standard_resources(void) -{ - int i; - - printk("Setting up standard PCI resources\n"); - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, &data_resource); - else - legacy_init_iomem_resources(&code_resource, &data_resource); - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} -subsys_initcall(request_standard_resources); static void __init register_memory(void) { -- cgit v1.2.3-70-g09d2 From 8e3342f736dd1c19ce7c28625dedd7d8730fc7ad Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: create e820.c for e820 map sanitize and copy function This patch moves bios e820 map sanitize and copy function from setup.c to e820.c Signed-off-by: bibo,mao Signed-off-by: Andi Kleen arch/i386/kernel/e820.c | 252 +++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 240 -------------------------------------------- 2 files changed, 252 insertions(+), 240 deletions(-) --- arch/i386/kernel/e820.c | 252 +++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 240 -------------------------------------------- 2 files changed, 252 insertions(+), 240 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index cce70604948..0db95760b07 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -19,6 +19,14 @@ EXPORT_SYMBOL(efi_enabled); #endif struct e820map e820; +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; 
+static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -287,3 +295,247 @@ static int __init request_standard_resources(void) } subsys_initcall(request_standard_resources); + +void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) +{ + int x; + + if (!efi_enabled) { + x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; + } +} /* add_memory_region */ + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... 
+ + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + printk("sanitize start\n"); + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) { + printk("sanitize bail 0\n"); + return -1; + } + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; i<old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { + printk("sanitize bail 1\n"); + return -1; + } + + /* create pointers for initial change-point information (for sorting) */ + for (i=0; i < 2*old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i=0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; /* true number of change-points */ + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if <current_addr> > <last_addr>, swap */ + /* or, if current=<start_addr> & last=<end_addr>, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, 
removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; i<overlap_entries; i++) + { + if (overlap_list[i] == change_point[chgidx]->pbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; i<overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + printk("sanitize end\n"); + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. 
+ * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; + printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. 
+ */ + if (type == E820_RAM) { + printk("copy_e820_map() type is E820_RAM\n"); + if (start < 0x100000ULL && end > 0xA0000ULL) { + printk("copy_e820_map() lies in range...\n"); + if (start < 0xA0000ULL) { + printk("copy_e820_map() start < 0xA0000ULL\n"); + add_memory_region(start, 0xA0000ULL-start, type); + } + if (end <= 0x100000ULL) { + printk("copy_e820_map() end <= 0x100000ULL\n"); + continue; + } + start = 0x100000ULL; + size = end - start; + } + } + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} + diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index acd2d9392ab..b7509aec0eb 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -191,26 +191,6 @@ static void __init limit_regions(unsigned long long size) } } -void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) -{ - int x; - - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; - } -} /* add_memory_region */ - #define E820_DEBUG 1 static void __init print_memory_map(char *who) @@ -239,226 +219,6 @@ static void __init print_memory_map(char *who) } } -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. 
- * - */ -struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ -}; -static struct change_member change_point_list[2*E820MAX] __initdata; -static struct change_member *change_point[2*E820MAX] __initdata; -static struct e820entry *overlap_list[E820MAX] __initdata; -static struct e820entry new_bios[E820MAX] __initdata; - -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; iaddr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = 
&biosmap[i]; - } - } - chg_nr = chgidx; /* true number of change-points */ - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if > , swap */ - /* or, if current= & last=, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; ipbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; itype > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; 
- /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -int __init copy_e820_map(struct e820entry * biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - unsigned long long start = biosmap->addr; - unsigned long long size = biosmap->size; - unsigned long long end = start + size; - unsigned long type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. 
- */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - return 0; -} - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE -- cgit v1.2.3-70-g09d2 From b2dff6a88cbed59d787a8ca7367c76ba385e1187 Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move find_max_pfn function to e820.c Move more code from setup.c into e820.c Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/e820.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 55 ------------------------------------------------ include/asm-i386/e820.h | 1 + 3 files changed, 53 insertions(+), 55 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 0db95760b07..be4934f6f85 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -539,3 +540,54 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return 0; } +/* + * Callback for efi_memory_walk. 
+ */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, PFN_UP(start), PFN_DOWN(end)); + return 0; +} + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? */ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + memory_present(0, start, end); + } +} diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index b7509aec0eb..3d808054fdf 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -63,9 +63,6 @@ #include #include -/* Forward Declaration. */ -void __init find_max_pfn(void); - /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ @@ -387,29 +384,6 @@ static int __init parse_reservetop(char *arg) } early_param("reservetop", parse_reservetop); -/* - * Callback for efi_memory_walk. 
- */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - /* * This function checks if the entire range is mapped with type. * @@ -442,35 +416,6 @@ e820_all_mapped(unsigned long s, unsigned long e, unsigned type) return 0; } -/* - * Find the highest page frame number we have available - */ -void __init find_max_pfn(void) -{ - int i; - - max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - /* RAM? */ - if (e820.map[i].type != E820_RAM) - continue; - start = PFN_UP(e820.map[i].addr); - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - if (start >= end) - continue; - if (end > max_pfn) - max_pfn = end; - memory_present(0, start, end); - } -} - /* * Determine low and high memory ranges: */ diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index f7514fb6e8e..14756942515 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -38,6 +38,7 @@ extern struct e820map e820; extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); +extern void find_max_pfn(void); #endif/*!__ASSEMBLY__*/ -- cgit v1.2.3-70-g09d2 From b5b2405706005cc7765f6ecd00965d29e93f090a Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move e820/efi memmap walking code to e820.c This patch moves e820/efi memmap table walking function from setup.c to e820.c, also this patch adds extern declaration in header file. 
Signed-off-by: bibo,mao Signed-off-by: Andi Kleen arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------- include/asm-i386/e820.h | 2 arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------------------- include/asm-i386/e820.h | 2 3 files changed, 117 insertions(+), 118 deletions(-) --- arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------------------- include/asm-i386/e820.h | 2 + 3 files changed, 117 insertions(+), 118 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index be4934f6f85..47c495bf0cb 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -28,6 +28,11 @@ static struct change_member change_point_list[2*E820MAX] __initdata; static struct change_member *change_point[2*E820MAX] __initdata; static struct e820entry *overlap_list[E820MAX] __initdata; static struct e820entry new_bios[E820MAX] __initdata; +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -591,3 +596,113 @@ void __init find_max_pfn(void) memory_present(0, start, end); } } + +/* + * Free all available memory for boot time allocation. Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= (max_low_pfn << PAGE_SHIFT)) + return 0; + if (end >= (max_low_pfn << PAGE_SHIFT)) + end = max_low_pfn << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. 
+ */ +void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? + */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +void __init register_memory(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + + /* + * Search for the bigest gap in the low 32 bits of the e820 + * memory space. + */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. 
+ */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 3d808054fdf..51ed015a1f3 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -94,12 +94,6 @@ unsigned int machine_submodel_id; unsigned int BIOS_revision; unsigned int mca_pentium_flag; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -475,68 +469,6 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -/* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* - * Register fully available low RAM pages with the bootmem allocator. - */ -static void __init register_bootmem_low_pages(unsigned long max_low_pfn) -{ - int i; - - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } - for (i = 0; i < e820.nr_map; i++) { - unsigned long curr_pfn, last_pfn, size; - /* - * Reserve usable low memory - */ - if (e820.map[i].type != E820_RAM) - continue; - /* - * We are rounding up the start address of usable memory: - */ - curr_pfn = PFN_UP(e820.map[i].addr); - if (curr_pfn >= max_low_pfn) - continue; - /* - * ... 
and at the end of the usable range downwards: - */ - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - - /* - * .. finally, did all the rounding and playing - * around just make the area go away? - */ - if (last_pfn <= curr_pfn) - continue; - - size = last_pfn - curr_pfn; - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); - } -} - /* * workaround for Dell systems that neglect to reserve EBDA */ @@ -705,56 +637,6 @@ void __init remapped_pgdat_init(void) } } - - -static void __init register_memory(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; - - /* - * Search for the bigest gap in the low 32 bits of the e820 - * memory space. - */ - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - } - } - if (start < last) - last = start; - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. 
- */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -} - #ifdef CONFIG_MCA static void set_mca_bus(int x) { diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index 14756942515..8da4175a553 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -39,6 +39,8 @@ extern struct e820map e820; extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); extern void find_max_pfn(void); +extern void register_bootmem_low_pages(unsigned long max_low_pfn); +extern void register_memory(void); #endif/*!__ASSEMBLY__*/ -- cgit v1.2.3-70-g09d2 From cef518e88b8ed94ea483c436ef5e5b151a3fabc6 Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move memory map printing and other code to e820.c This patch moves e820 memory map print and memmap boot param parsing function from setup.c to e820.c, also adds limit_regions and print_memory_map declaration in header file. 
Signed-off-by: bibo,mao Signed-off-by: Andi Kleen arch/i386/kernel/e820.c | 152 +++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 158 --------------------------------- include/asm-i386/e820.h | 2 arch/i386/kernel/e820.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 153 ----------------------------------------------- include/asm-i386/e820.h | 2 3 files changed, 155 insertions(+), 152 deletions(-) --- arch/i386/kernel/e820.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 153 +---------------------------------------------- include/asm-i386/e820.h | 2 + 3 files changed, 155 insertions(+), 152 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 47c495bf0cb..b755255f272 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -33,6 +33,7 @@ unsigned long pci_mem_start = 0x10000000; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif +extern int user_defined_memmap; struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -706,3 +707,154 @@ void __init register_memory(void) printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", pci_mem_start, gapstart, gapsize); } + +void __init print_memory_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + e820.map[i].addr, + e820.map[i].addr + e820.map[i].size); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %lu\n", e820.map[i].type); + break; + } + } +} + +void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + int i; + + print_memory_map("limit_regions start"); + if (efi_enabled) { + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map, i = 
0; p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + current_addr = md->phys_addr + (md->num_pages << 12); + if (md->type == EFI_CONVENTIONAL_MEMORY) { + if (current_addr >= size) { + md->num_pages -= + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); + memmap.nr_map = i + 1; + return; + } + } + } + } + for (i = 0; i < e820.nr_map; i++) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr < size) + continue; + + if (e820.map[i].type != E820_RAM) + continue; + + if (e820.map[i].addr >= size) { + /* + * This region starts past the end of the + * requested size, skip it completely. + */ + e820.nr_map = i; + } else { + e820.nr_map = i + 1; + e820.map[i].size -= current_addr - size; + } + print_memory_map("limit_regions endfor"); + return; + } + print_memory_map("limit_regions endfunc"); +} + + /* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) +{ + u64 start = s; + u64 end = e; + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full + * coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. 
+ */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; + } + } + return 0; +} +early_param("memmap", parse_memmap); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 51ed015a1f3..e5bb87aa5a4 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -73,7 +73,6 @@ int disable_pse __devinitdata = 0; /* * Machine setup.. 
*/ -extern struct e820map e820; extern struct resource code_resource; extern struct resource data_resource; @@ -137,79 +136,6 @@ static char command_line[COMMAND_LINE_SIZE]; unsigned char __initdata boot_params[PARAM_SIZE]; -static void __init limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - int i; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map, i = 0; p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - current_addr = md->phys_addr + (md->num_pages << 12); - if (md->type == EFI_CONVENTIONAL_MEMORY) { - if (current_addr >= size) { - md->num_pages -= - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); - memmap.nr_map = i + 1; - return; - } - } - } - } - for (i = 0; i < e820.nr_map; i++) { - current_addr = e820.map[i].addr + e820.map[i].size; - if (current_addr < size) - continue; - - if (e820.map[i].type != E820_RAM) - continue; - - if (e820.map[i].addr >= size) { - /* - * This region starts past the end of the - * requested size, skip it completely. 
- */ - e820.nr_map = i; - } else { - e820.nr_map = i + 1; - e820.map[i].size -= current_addr - size; - } - return; - } -} - -#define E820_DEBUG 1 - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %lu\n", e820.map[i].type); - break; - } - } -} - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE @@ -233,7 +159,7 @@ static inline void copy_edd(void) } #endif -static int __initdata user_defined_memmap = 0; +int __initdata user_defined_memmap = 0; /* * "mem=nopentium" disables the 4MB page tables. @@ -270,51 +196,6 @@ static int __init parse_mem(char *arg) } early_param("mem", parse_mem); -static int __init parse_memmap(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "exactmap") == 0) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - find_max_pfn(); - saved_max_pfn = max_pfn; -#endif - e820.nr_map = 0; - user_defined_memmap = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. 
- */ - unsigned long long start_at, mem_size; - - mem_size = memparse(arg, &arg); - if (*arg == '@') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*arg == '#') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*arg == '$') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - user_defined_memmap = 1; - } - } - return 0; -} -early_param("memmap", parse_memmap); - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. @@ -378,38 +259,6 @@ static int __init parse_reservetop(char *arg) } early_param("reservetop", parse_reservetop); - /* - * This function checks if the entire range is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init -e820_all_mapped(unsigned long s, unsigned long e, unsigned type) -{ - u64 start = s; - u64 end = e; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full - * coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; -} - /* * Determine low and high memory ranges: */ diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index 8da4175a553..395077aba58 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -41,6 +41,8 @@ extern int e820_all_mapped(unsigned long start, unsigned long end, extern void find_max_pfn(void); extern 
void register_bootmem_low_pages(unsigned long max_low_pfn); extern void register_memory(void); +extern void limit_regions(unsigned long long size); +extern void print_memory_map(char *who); #endif/*!__ASSEMBLY__*/ -- cgit v1.2.3-70-g09d2 From d15512f442ef1ea60f6195b0444fb27b3cf8d0e6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] i386: Fix race in IO-APIC routing entry setup. Interrupt could happen between setting the IO-APIC entry and setting its interrupt data. Pointed out by Linus. Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 3b7a63e0ed1..e33b7a84529 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -153,14 +153,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) * the interrupt, and we need to make sure the entry is fully populated * before that happens. 
*/ -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; union entry_union eu; eu.entry = e; - spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1360,8 +1366,8 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } - ioapic_write_entry(apic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2856,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a if (!ioapic && (irq < 16)) disable_8259A_irq(irq); - ioapic_write_entry(ioapic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(ioapic, pin, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.3-70-g09d2 From 8c89812684de3b47066d800031dfd7098abbdc74 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] i386: remove IOPL check on task switch IOPL is implicitly saved and restored on task switch, so explicit check is no longer needed. 
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 8f42659ef9d..99308510a17 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -674,12 +674,6 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas write_pda(pcurrent, next_p); - /* - * Restore IOPL if needed. - */ - if (unlikely(prev->iopl != next->iopl)) - set_iopl_mask(next->iopl); - /* * Now maybe handle debug registers and/or IO bitmaps */ -- cgit v1.2.3-70-g09d2 From db91b882aabd0b3b55a87cbfb344f2798bb740b4 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] i386: Fix double #includes in arch/i386 Fix double #includes in arch/i386 Signed-off-by: Nicolas Kaiser Signed-off-by: Andi Kleen --- arch/i386/kernel/cpuid.c | 1 - arch/i386/kernel/tsc.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index ab0c327e79d..5c5d4507ee7 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index fbc95828cd7..7f22e03253e 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -13,7 +13,6 @@ #include #include -#include #include #include "mach_timer.h" -- cgit v1.2.3-70-g09d2 From d3561b7fa0fb0fc583bab0eeda32bec9e4c4056d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] paravirt: header and stubs for paravirtualisation Create a paravirt.h header for all the critical operations which need to be replaced with hypervisor calls, and include that instead of defining native operations, when CONFIG_PARAVIRT. 
This patch does the dumbest possible replacement of paravirtualized instructions: calls through a "paravirt_ops" structure. Currently these are function implementations of native hardware: hypervisors will override the ops structure with their own variants. All the pv-ops functions are declared "fastcall" so that a specific register-based ABI is used, to make inlining assembler easier. And: +From: Andy Whitcroft The paravirt ops introduce a 'weak' attribute onto memory_setup(). Code ordering leads to the following warnings on x86: arch/i386/kernel/setup.c:651: warning: weak declaration of `memory_setup' after first use results in unspecified behavior Move memory_setup() to avoid this. Signed-off-by: Rusty Russell Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Andy Whitcroft --- arch/i386/Kconfig | 11 + arch/i386/boot/compressed/misc.c | 1 + arch/i386/kernel/Makefile | 1 + arch/i386/kernel/asm-offsets.c | 10 + arch/i386/kernel/entry.S | 34 ++- arch/i386/kernel/i8259.c | 5 +- arch/i386/kernel/paravirt.c | 404 +++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 8 +- arch/i386/kernel/smpboot.c | 5 + arch/i386/kernel/time.c | 15 +- arch/i386/power/cpu.c | 8 +- drivers/net/de600.c | 1 - include/asm-i386/delay.h | 8 + include/asm-i386/desc.h | 9 +- include/asm-i386/io.h | 8 +- include/asm-i386/irq.h | 3 + include/asm-i386/irqflags.h | 42 +-- include/asm-i386/mach-default/setup_arch.h | 2 + include/asm-i386/msr.h | 5 + include/asm-i386/paravirt.h | 281 ++++++++++++++++++++ include/asm-i386/processor.h | 15 +- include/asm-i386/segment.h | 2 + include/asm-i386/setup.h | 1 + include/asm-i386/spinlock.h | 4 + include/asm-i386/suspend.h | 8 +- include/asm-i386/system.h | 16 +- include/asm-i386/time.h | 41 +++ 27 files changed, 890 insertions(+), 58 deletions(-) create mode 100644 arch/i386/kernel/paravirt.c create mode 100644 include/asm-i386/paravirt.h create mode 100644
include/asm-i386/time.h (limited to 'arch/i386/kernel') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 1f0f7b60995..bb1fa061c6c 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -182,6 +182,17 @@ config X86_ES7000 endchoice +config PARAVIRT + bool "Paravirtualization support (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Paravirtualization is a way of running multiple instances of + Linux on the same machine, under a hypervisor. This option + changes the kernel so it can modify itself when it is run + under a hypervisor, improving performance significantly. + However, when run without a hypervisor the kernel is + theoretically slower. If in doubt, say N. + config ACPI_SRAT bool default y diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index dc153893155..c6798c75c67 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -9,6 +9,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +#undef CONFIG_PARAVIRT #include #include #include diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index f614854bd71..40661213604 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 0666eb0ed7b..1b2f3cd3327 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -101,4 +101,14 @@ void foo(void) BLANK(); OFFSET(PDA_cpu, i386_pda, cpu_number); OFFSET(PDA_pcurrent, i386_pda, pcurrent); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); + OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); + OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); + 
OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); + OFFSET(PARAVIRT_iret, paravirt_ops, iret); + OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); +#endif } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 0220bc8cbb4..d274612e05c 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -62,13 +62,6 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 -/* These are replaces for paravirtualization */ -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit -#define INTERRUPT_RETURN iret -#define GET_CR0_INTO_EAX movl %cr0, %eax - #ifdef CONFIG_PREEMPT #define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else @@ -416,6 +409,20 @@ ldt_ss: jnz restore_nocheck testl $0x00400000, %eax # returning to 32bit stack? jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, paravirt_ops+PARAVIRT_enabled + jne restore_nocheck +#endif + /* If returning to userspace with 16bit stack, * try to fix the higher word of ESP, as the CPU * won't restore it. 
@@ -833,6 +840,19 @@ nmi_espfix_stack: .previous KPROBE_END(nmi) +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +#endif + KPROBE_ENTRY(int3) RING0_INT_FRAME pushl $-1 # mark this as an int diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 62996cd1708..c8d45821c78 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -381,7 +381,10 @@ void __init init_ISA_irqs (void) } } -void __init init_IRQ(void) +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) { int i; diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c new file mode 100644 index 00000000000..478192cd4b9 --- /dev/null +++ b/arch/i386/kernel/paravirt.c @@ -0,0 +1,404 @@ +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* nop stub */ +static void native_nop(void) +{ +} + +static void __init default_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); +} + +char *memory_setup(void) +{ + return paravirt_ops.memory_setup(); +} + +static fastcall unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("movl %%db0, %0" :"=r" (val)); break; + case 1: + asm("movl %%db1, %0" :"=r" (val)); break; + case 2: + asm("movl %%db2, %0" :"=r" (val)); break; + case 3: + asm("movl %%db3, %0" :"=r" (val)); break; + case 6: + asm("movl %%db6, %0" :"=r" (val)); break; + case 7: + asm("movl %%db7, %0" :"=r" (val)); break; + default: + BUG(); + } + return val; +} + +static fastcall void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("movl %0,%%db0" : /* no output */ :"r" (value)); + break; + case 1: + asm("movl %0,%%db1" : /* no output */ :"r" (value)); + break; + case 2: + asm("movl %0,%%db2" : /* no output */ :"r" (value)); + break; + case 3: + asm("movl %0,%%db3" : /* no output */ :"r" (value)); + break; + case 6: + asm("movl %0,%%db6" : /* no output */ :"r" (value)); + break; + case 7: + asm("movl %0,%%db7" : /* no output */ :"r" (value)); + break; + default: + BUG(); + } +} + +void init_IRQ(void) +{ + paravirt_ops.init_IRQ(); +} + +static fastcall void native_clts(void) +{ + asm volatile ("clts"); +} + +static fastcall unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr0(unsigned long val) +{ + 
asm volatile("movl %0,%%cr0": :"r" (val)); +} + +static fastcall unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr2(unsigned long val) +{ + asm volatile("movl %0,%%cr2": :"r" (val)); +} + +static fastcall unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr3(unsigned long val) +{ + asm volatile("movl %0,%%cr3": :"r" (val)); +} + +static fastcall unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist */ + asm("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (val): "0" (0)); + return val; +} + +static fastcall void native_write_cr4(unsigned long val) +{ + asm volatile("movl %0,%%cr4": :"r" (val)); +} + +static fastcall unsigned long native_save_fl(void) +{ + unsigned long f; + asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); + return f; +} + +static fastcall void native_restore_fl(unsigned long f) +{ + asm volatile("pushl %0 ; popfl": /* no output */ + :"g" (f) + :"memory", "cc"); +} + +static fastcall void native_irq_disable(void) +{ + asm volatile("cli": : :"memory"); +} + +static fastcall void native_irq_enable(void) +{ + asm volatile("sti": : :"memory"); +} + +static fastcall void native_safe_halt(void) +{ + asm volatile("sti; hlt": : :"memory"); +} + +static fastcall void native_halt(void) +{ + asm volatile("hlt": : :"memory"); +} + +static fastcall void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +{ + unsigned long long val; + + asm volatile("2: rdmsr ; xorl %0,%0\n" + 
"1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %3,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=r" (*err), "=A" (val) + : "c" (msr), "i" (-EFAULT)); + + return val; +} + +static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +{ + int err; + asm volatile("2: wrmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=a" (err) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), + "i" (-EFAULT)); + return err; +} + +static fastcall unsigned long long native_read_tsc(void) +{ + unsigned long long val; + asm volatile("rdtsc" : "=A" (val)); + return val; +} + +static fastcall unsigned long long native_read_pmc(void) +{ + unsigned long long val; + asm volatile("rdpmc" : "=A" (val)); + return val; +} + +static fastcall void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +{ + asm ("sgdt %0":"=m" (*dtr)); +} + +static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +{ + asm ("sidt %0":"=m" (*dtr)); +} + +static fastcall unsigned long native_store_tr(void) +{ + unsigned long tr; + asm ("str %0":"=r" (tr)); + return tr; +} + +static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] + C(0); C(1); C(2); +#undef C +} + +static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) +{ + u32 *lp = (u32 *)((char *)dt + entry*8); + lp[0] = entry_low; + 
lp[1] = entry_high; +} + +static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +} + +static fastcall void native_io_delay(void) +{ + asm volatile("outb %al,$0x80"); +} + +/* These are in entry.S */ +extern fastcall void native_iret(void); +extern fastcall void native_irq_enable_sysexit(void); + +static int __init print_banner(void) +{ + paravirt_ops.banner(); + return 0; +} +core_initcall(print_banner); + +struct paravirt_ops paravirt_ops = { + .name = "bare hardware", + .paravirt_enabled = 0, + .kernel_rpl = 0, + + .banner = default_banner, + .arch_setup = native_nop, + .memory_setup = machine_specific_memory_setup, + .get_wallclock = native_get_wallclock, + .set_wallclock = native_set_wallclock, + .time_init = time_init_hook, + .init_IRQ = native_init_IRQ, + + .cpuid = native_cpuid, + .get_debugreg = native_get_debugreg, + .set_debugreg = native_set_debugreg, + .clts = native_clts, + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = native_write_cr4, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + 
.irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, + .wbinvd = native_wbinvd, + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = native_store_tr, + .load_tls = native_load_tls, + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + .load_esp0 = native_load_esp0, + + .set_iopl_mask = native_set_iopl_mask, + .io_delay = native_io_delay, + .const_udelay = __const_udelay, + + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, +}; +EXPORT_SYMBOL(paravirt_ops); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index e5bb87aa5a4..695d53fd14d 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -495,6 +495,12 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -547,7 +553,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(machine_specific_memory_setup()); + print_memory_map(memory_setup()); } copy_edd(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 095636620fa..cd7de9c9654 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -33,6 +33,11 @@ * Dave Jones : Report invalid combinations of Athlon CPUs. 
* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ + +/* SMP boot always wants to use real time delay to allow sufficient time for + * the APs to come online */ +#define USE_REAL_TIME_DELAY + #include #include #include diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 78af572fd17..c505b16c099 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "mach_time.h" @@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime) /* gets recalled with irq locally disabled */ /* XXX - does irqsave resolve this? -johnstul */ spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_set_rtc_mmss(nowtime); - else - retval = mach_set_rtc_mmss(nowtime); + retval = set_wallclock(nowtime); spin_unlock_irqrestore(&rtc_lock, flags); return retval; @@ -223,10 +221,7 @@ unsigned long get_cmos_time(void) spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_get_time(); - else - retval = mach_get_cmos_time(); + retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); @@ -370,7 +365,7 @@ static void __init hpet_time_init(void) printk("Using HPET for base-timer\n"); } - time_init_hook(); + do_time_init(); } #endif @@ -392,5 +387,5 @@ void __init time_init(void) do_settimeofday(&ts); - time_init_hook(); + do_time_init(); } diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 5a1abeff033..2c15500f871 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -26,8 +26,8 @@ void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ - store_gdt(&ctxt->gdt_limit); - store_idt(&ctxt->idt_limit); + store_gdt(&ctxt->gdt); + store_idt(&ctxt->idt); store_tr(ctxt->tr); /* @@ -99,8 +99,8 @@ void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). 
*/ - load_gdt(&ctxt->gdt_limit); - load_idt(&ctxt->idt_limit); + load_gdt(&ctxt->gdt); + load_idt(&ctxt->idt); /* * segment registers diff --git a/drivers/net/de600.c b/drivers/net/de600.c index 690bb40b353..8396e411f1c 100644 --- a/drivers/net/de600.c +++ b/drivers/net/de600.c @@ -43,7 +43,6 @@ static const char version[] = "de600.c: $Revision: 1.41-2.5 $, Bjorn Ekwall (bj * modify the following "#define": (see for more info) #define REALLY_SLOW_IO */ -#define SLOW_IO_BY_JUMPING /* Looks "better" than dummy write to port 0x80 :-) */ /* use 0 for production, 1 for verification, >2 for debug */ #ifdef DE600_DEBUG diff --git a/include/asm-i386/delay.h b/include/asm-i386/delay.h index 9ae5e3782ed..32d6678d0bb 100644 --- a/include/asm-i386/delay.h +++ b/include/asm-i386/delay.h @@ -16,6 +16,13 @@ extern void __ndelay(unsigned long nsecs); extern void __const_udelay(unsigned long usecs); extern void __delay(unsigned long loops); +#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY) +#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul) + +#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul) + +#else /* !PARAVIRT || USE_REAL_TIME_DELAY */ + /* 0x10c7 is 2**32 / 1000000 (rounded up) */ #define udelay(n) (__builtin_constant_p(n) ? \ ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \ @@ -25,6 +32,7 @@ extern void __delay(unsigned long loops); #define ndelay(n) (__builtin_constant_p(n) ? \ ((n) > 20000 ? 
__bad_ndelay() : __const_udelay((n) * 5ul)) : \ __ndelay(n)) +#endif void use_tsc_delay(void); diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index 6cf2ac2bfde..f19820f0834 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -55,6 +55,9 @@ static inline void pack_gate(__u32 *a, __u32 *b, #define DESCTYPE_DPL3 0x60 /* DPL-3 */ #define DESCTYPE_S 0x10 /* !system */ +#ifdef CONFIG_PARAVIRT +#include +#else #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) @@ -105,7 +108,11 @@ static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const vo write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); } -static inline void set_ldt(void *addr, unsigned int entries) +#define set_ldt native_set_ldt +#endif /* CONFIG_PARAVIRT */ + +static inline fastcall void native_set_ldt(const void *addr, + unsigned int entries) { if (likely(entries == 0)) __asm__ __volatile__("lldt %w0"::"q" (0)); diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h index 68df0dc3ab8..86ff5e83be2 100644 --- a/include/asm-i386/io.h +++ b/include/asm-i386/io.h @@ -256,11 +256,11 @@ static inline void flush_write_buffers(void) #endif /* __KERNEL__ */ -#ifdef SLOW_IO_BY_JUMPING -#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:" +#if defined(CONFIG_PARAVIRT) +#include #else + #define __SLOW_DOWN_IO "outb %%al,$0x80;" -#endif static inline void slow_down_io(void) { __asm__ __volatile__( @@ -271,6 +271,8 @@ static inline void slow_down_io(void) { : : ); } +#endif + #ifdef CONFIG_X86_NUMAQ extern void *xquad_portio; /* Where the IO area was mapped */ #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 331726b4112..9e15ce0006e 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -41,4 +41,7 @@ extern int irqbalance_disable(char *str); extern void 
fixup_irqs(cpumask_t map); #endif +void init_IRQ(void); +void __init native_init_IRQ(void); + #endif /* _ASM_IRQ_H */ diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h index e1bdb97c07f..9ce01f3fb7b 100644 --- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -10,6 +10,9 @@ #ifndef _ASM_IRQFLAGS_H #define _ASM_IRQFLAGS_H +#ifdef CONFIG_PARAVIRT +#include +#else #ifndef __ASSEMBLY__ static inline unsigned long __raw_local_save_flags(void) @@ -25,9 +28,6 @@ static inline unsigned long __raw_local_save_flags(void) return flags; } -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) - static inline void raw_local_irq_restore(unsigned long flags) { __asm__ __volatile__( @@ -66,18 +66,6 @@ static inline void halt(void) __asm__ __volatile__("hlt": : :"memory"); } -static inline int raw_irqs_disabled_flags(unsigned long flags) -{ - return !(flags & (1 << 9)); -} - -static inline int raw_irqs_disabled(void) -{ - unsigned long flags = __raw_local_save_flags(); - - return raw_irqs_disabled_flags(flags); -} - /* * For spinlocks, etc: */ @@ -90,9 +78,33 @@ static inline unsigned long __raw_local_irq_save(void) return flags; } +#else +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + #define raw_local_irq_save(flags) \ do { (flags) = __raw_local_irq_save(); } while (0) +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & (1 << 9)); +} + +static inline int raw_irqs_disabled(void) +{ + unsigned long flags = __raw_local_save_flags(); + + return raw_irqs_disabled_flags(flags); +} #endif /* __ASSEMBLY__ */ /* diff --git a/include/asm-i386/mach-default/setup_arch.h 
b/include/asm-i386/mach-default/setup_arch.h index fb42099e7bd..605e3ccb991 100644 --- a/include/asm-i386/mach-default/setup_arch.h +++ b/include/asm-i386/mach-default/setup_arch.h @@ -2,4 +2,6 @@ /* no action for generic */ +#ifndef ARCH_SETUP #define ARCH_SETUP +#endif diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h index 1820d9d73af..5679d499307 100644 --- a/include/asm-i386/msr.h +++ b/include/asm-i386/msr.h @@ -1,6 +1,10 @@ #ifndef __ASM_MSR_H #define __ASM_MSR_H +#ifdef CONFIG_PARAVIRT +#include +#else + /* * Access to machine-specific registers (available on 586 and better only) * Note: the rd* operations modify the parameters directly (without using @@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val) __asm__ __volatile__("rdpmc" \ : "=a" (low), "=d" (high) \ : "c" (counter)) +#endif /* !CONFIG_PARAVIRT */ /* symbolic names for some interesting MSRs */ /* Intel defined MSRs. */ diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h new file mode 100644 index 00000000000..a7551a44686 --- /dev/null +++ b/include/asm-i386/paravirt.h @@ -0,0 +1,281 @@ +#ifndef __ASM_PARAVIRT_H +#define __ASM_PARAVIRT_H +/* Various instructions on x86 need to be replaced for + * para-virtualization: those hooks are defined here. */ +#include + +#ifdef CONFIG_PARAVIRT +#ifndef __ASSEMBLY__ +struct thread_struct; +struct Xgt_desc_struct; +struct tss_struct; +struct paravirt_ops +{ + unsigned int kernel_rpl; + int paravirt_enabled; + const char *name; + + void (*arch_setup)(void); + char *(*memory_setup)(void); + void (*init_IRQ)(void); + + void (*banner)(void); + + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long); + void (*time_init)(void); + + /* All the function pointers here are declared as "fastcall" + so that we get a specific register-based calling + convention. This makes it easier to implement inline + assembler replacements. 
*/ + + void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + unsigned long (fastcall *get_debugreg)(int regno); + void (fastcall *set_debugreg)(int regno, unsigned long value); + + void (fastcall *clts)(void); + + unsigned long (fastcall *read_cr0)(void); + void (fastcall *write_cr0)(unsigned long); + + unsigned long (fastcall *read_cr2)(void); + void (fastcall *write_cr2)(unsigned long); + + unsigned long (fastcall *read_cr3)(void); + void (fastcall *write_cr3)(unsigned long); + + unsigned long (fastcall *read_cr4_safe)(void); + unsigned long (fastcall *read_cr4)(void); + void (fastcall *write_cr4)(unsigned long); + + unsigned long (fastcall *save_fl)(void); + void (fastcall *restore_fl)(unsigned long); + void (fastcall *irq_disable)(void); + void (fastcall *irq_enable)(void); + void (fastcall *safe_halt)(void); + void (fastcall *halt)(void); + void (fastcall *wbinvd)(void); + + /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (fastcall *read_msr)(unsigned int msr, int *err); + int (fastcall *write_msr)(unsigned int msr, u64 val); + + u64 (fastcall *read_tsc)(void); + u64 (fastcall *read_pmc)(void); + + void (fastcall *load_tr_desc)(void); + void (fastcall *load_gdt)(const struct Xgt_desc_struct *); + void (fastcall *load_idt)(const struct Xgt_desc_struct *); + void (fastcall *store_gdt)(struct Xgt_desc_struct *); + void (fastcall *store_idt)(struct Xgt_desc_struct *); + void (fastcall *set_ldt)(const void *desc, unsigned entries); + unsigned long (fastcall *store_tr)(void); + void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu); + void (fastcall *write_ldt_entry)(void *dt, int entrynum, + u32 low, u32 high); + void (fastcall *write_gdt_entry)(void *dt, int entrynum, + u32 low, u32 high); + void (fastcall *write_idt_entry)(void *dt, int entrynum, + u32 low, u32 high); + void (fastcall *load_esp0)(struct tss_struct *tss, + struct thread_struct *thread); + + void (fastcall 
*set_iopl_mask)(unsigned mask); + + void (fastcall *io_delay)(void); + void (*const_udelay)(unsigned long loops); + + /* These two are jmp to, not actually called. */ + void (fastcall *irq_enable_sysexit)(void); + void (fastcall *iret)(void); +}; + +extern struct paravirt_ops paravirt_ops; + +#define paravirt_enabled() (paravirt_ops.paravirt_enabled) + +static inline void load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + paravirt_ops.load_esp0(tss, thread); +} + +#define ARCH_SETUP paravirt_ops.arch_setup(); +static inline unsigned long get_wallclock(void) +{ + return paravirt_ops.get_wallclock(); +} + +static inline int set_wallclock(unsigned long nowtime) +{ + return paravirt_ops.set_wallclock(nowtime); +} + +static inline void do_time_init(void) +{ + return paravirt_ops.time_init(); +} + +/* The paravirtualized CPUID instruction. */ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + paravirt_ops.cpuid(eax, ebx, ecx, edx); +} + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg) +#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val) + +#define clts() paravirt_ops.clts() + +#define read_cr0() paravirt_ops.read_cr0() +#define write_cr0(x) paravirt_ops.write_cr0(x) + +#define read_cr2() paravirt_ops.read_cr2() +#define write_cr2(x) paravirt_ops.write_cr2(x) + +#define read_cr3() paravirt_ops.read_cr3() +#define write_cr3(x) paravirt_ops.write_cr3(x) + +#define read_cr4() paravirt_ops.read_cr4() +#define read_cr4_safe(x) paravirt_ops.read_cr4_safe() +#define write_cr4(x) paravirt_ops.write_cr4(x) + +static inline unsigned long __raw_local_save_flags(void) +{ + return paravirt_ops.save_fl(); +} + +static inline void raw_local_irq_restore(unsigned long flags) +{ + return paravirt_ops.restore_fl(flags); +} + +static inline void raw_local_irq_disable(void) +{ + 
paravirt_ops.irq_disable(); +} + +static inline void raw_local_irq_enable(void) +{ + paravirt_ops.irq_enable(); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long flags = paravirt_ops.save_fl(); + + paravirt_ops.irq_disable(); + + return flags; +} + +static inline void raw_safe_halt(void) +{ + paravirt_ops.safe_halt(); +} + +static inline void halt(void) +{ + paravirt_ops.safe_halt(); +} +#define wbinvd() paravirt_ops.wbinvd() + +#define get_kernel_rpl() (paravirt_ops.kernel_rpl) + +#define rdmsr(msr,val1,val2) do { \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + val1 = (u32)_l; \ + val2 = _l >> 32; \ +} while(0) + +#define wrmsr(msr,val1,val2) do { \ + u64 _l = ((u64)(val2) << 32) | (val1); \ + paravirt_ops.write_msr((msr), _l); \ +} while(0) + +#define rdmsrl(msr,val) do { \ + int _err; \ + val = paravirt_ops.read_msr((msr),&_err); \ +} while(0) + +#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val))) +#define wrmsr_safe(msr,a,b) ({ \ + u64 _l = ((u64)(b) << 32) | (a); \ + paravirt_ops.write_msr((msr),_l); \ +}) + +/* rdmsr with exception handling */ +#define rdmsr_safe(msr,a,b) ({ \ + int _err; \ + u64 _l = paravirt_ops.read_msr(msr,&_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; }) + +#define rdtsc(low,high) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define rdtscl(low) do { \ + u64 _l = paravirt_ops.read_tsc(); \ + low = (int)_l; \ +} while(0) + +#define rdtscll(val) (val = paravirt_ops.read_tsc()) + +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2) + +#define rdpmc(counter,low,high) do { \ + u64 _l = paravirt_ops.read_pmc(); \ + low = (u32)_l; \ + high = _l >> 32; \ +} while(0) + +#define load_TR_desc() (paravirt_ops.load_tr_desc()) +#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr)) +#define load_idt(dtr) (paravirt_ops.load_idt(dtr)) +#define set_ldt(addr, entries) (paravirt_ops.set_ldt((addr), (entries))) +#define store_gdt(dtr) 
(paravirt_ops.store_gdt(dtr)) +#define store_idt(dtr) (paravirt_ops.store_idt(dtr)) +#define store_tr(tr) ((tr) = paravirt_ops.store_tr()) +#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu))) +#define write_ldt_entry(dt, entry, low, high) \ + (paravirt_ops.write_ldt_entry((dt), (entry), (low), (high))) +#define write_gdt_entry(dt, entry, low, high) \ + (paravirt_ops.write_gdt_entry((dt), (entry), (low), (high))) +#define write_idt_entry(dt, entry, low, high) \ + (paravirt_ops.write_idt_entry((dt), (entry), (low), (high))) +#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask)) + +/* The paravirtualized I/O functions */ +static inline void slow_down_io(void) { + paravirt_ops.io_delay(); +#ifdef REALLY_SLOW_IO + paravirt_ops.io_delay(); + paravirt_ops.io_delay(); + paravirt_ops.io_delay(); +#endif +} + +#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax" +#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax" +#else /* __ASSEMBLY__ */ + +#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret +#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax +#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax +#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit +#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0 +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ +#endif /* __ASM_PARAVIRT_H */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 98fa73b7176..6c2c4457be0 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -144,8 +144,8 @@ static inline void detect_ht(struct cpuinfo_x86 *c) {} #define X86_EFLAGS_VIP 0x00100000 /* Virtual 
Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ __asm__("cpuid" @@ -491,6 +491,12 @@ struct thread_struct { .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ } +#ifdef CONFIG_PARAVIRT +#include +#else +#define paravirt_enabled() 0 +#define __cpuid native_cpuid + static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; @@ -524,10 +530,13 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa : /* no output */ \ :"r" (value)) +#define set_iopl_mask native_set_iopl_mask +#endif /* CONFIG_PARAVIRT */ + /* * Set IOPL bits in EFLAGS from given mask */ -static inline void set_iopl_mask(unsigned mask) +static fastcall inline void native_set_iopl_mask(unsigned mask) { unsigned int reg; __asm__ __volatile__ ("pushfl;" diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h index 5bdda79b6b5..3c796af3377 100644 --- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -131,5 +131,7 @@ #define SEGMENT_LDT 0x4 #define SEGMENT_GDT 0x0 +#ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif +#endif diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h index 2734909eff8..9930c5a355f 100644 --- a/include/asm-i386/setup.h +++ b/include/asm-i386/setup.h @@ -70,6 +70,7 @@ extern unsigned char boot_params[PARAM_SIZE]; struct e820entry; char * __init machine_specific_memory_setup(void); +char *memory_setup(void); int __init copy_e820_map(struct e820entry * biosmap, int nr_map); int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map); diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index c18b71fae6b..dea60709db2 100644 --- 
a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -7,8 +7,12 @@ #include #include +#ifdef CONFIG_PARAVIRT +#include +#else #define CLI_STRING "cli" #define STI_STRING "sti" +#endif /* CONFIG_PARAVIRT */ /* * Your basic SMP spinlocks, allowing only a single CPU anywhere diff --git a/include/asm-i386/suspend.h b/include/asm-i386/suspend.h index 08be1e5009d..30361526d56 100644 --- a/include/asm-i386/suspend.h +++ b/include/asm-i386/suspend.h @@ -23,12 +23,8 @@ arch_prepare_suspend(void) struct saved_context { u16 es, fs, gs, ss; unsigned long cr0, cr2, cr3, cr4; - u16 gdt_pad; - u16 gdt_limit; - unsigned long gdt_base; - u16 idt_pad; - u16 idt_limit; - unsigned long idt_base; + struct Xgt_desc_struct gdt; + struct Xgt_desc_struct idt; u16 ldt; u16 tss; unsigned long tr; diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index a6dabbcd6e6..a6d20d9a1a3 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -88,6 +88,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \ #define savesegment(seg, value) \ asm volatile("mov %%" #seg ",%0":"=rm" (value)) +#ifdef CONFIG_PARAVIRT +#include +#else #define read_cr0() ({ \ unsigned int __dummy; \ __asm__ __volatile__( \ @@ -139,17 +142,18 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \ #define write_cr4(x) \ __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) -/* - * Clear and set 'TS' bit respectively - */ +#define wbinvd() \ + __asm__ __volatile__ ("wbinvd": : :"memory") + +/* Clear the 'TS' bit */ #define clts() __asm__ __volatile__ ("clts") +#endif/* CONFIG_PARAVIRT */ + +/* Set the 'TS' bit */ #define stts() write_cr0(8 | read_cr0()) #endif /* __KERNEL__ */ -#define wbinvd() \ - __asm__ __volatile__ ("wbinvd": : :"memory") - static inline unsigned long get_limit(unsigned long segment) { unsigned long __limit; diff --git a/include/asm-i386/time.h b/include/asm-i386/time.h new file mode 100644 index 00000000000..ea8065af825 --- /dev/null +++ b/include/asm-i386/time.h @@ -0,0 +1,41 @@ 
+#ifndef _ASMi386_TIME_H +#define _ASMi386_TIME_H + +#include +#include "mach_time.h" + +static inline unsigned long native_get_wallclock(void) +{ + unsigned long retval; + + if (efi_enabled) + retval = efi_get_time(); + else + retval = mach_get_cmos_time(); + + return retval; +} + +static inline int native_set_wallclock(unsigned long nowtime) +{ + int retval; + + if (efi_enabled) + retval = efi_set_rtc_mmss(nowtime); + else + retval = mach_set_rtc_mmss(nowtime); + + return retval; +} + +#ifdef CONFIG_PARAVIRT +#include +#else /* !CONFIG_PARAVIRT */ + +#define get_wallclock() native_get_wallclock() +#define set_wallclock(x) native_set_wallclock(x) +#define do_time_init() time_init_hook() + +#endif /* CONFIG_PARAVIRT */ + +#endif -- cgit v1.2.3-70-g09d2 From 139ec7c416248b9ea227d21839235344edfee1e0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Patch inline replacements for paravirt intercepts It turns out that the most called ops, by several orders of magnitude, are the interrupt manipulation ops. These are obvious candidates for patching, so mark them up and create infrastructure for it. The method used is that the ops structure has a patch function, which is called for each place which needs to be patched: this returns a number of instructions (the rest are NOP-padded). Usually we can spare a register (%eax) for the binary patched code to use, but in a couple of critical places in entry.S we can't: we make the clobbers explicit at the call site, and manually clobber the allowed registers in debug mode as an extra check. And: Don't abuse CONFIG_DEBUG_KERNEL, add CONFIG_DEBUG_PARAVIRT. And: AK: Fix warnings in x86-64 alternative.c build And: AK: Fix compilation with defconfig And: ^From: Andrew Morton Some binutils versions still like to emit references to __stop_parainstructions and __start_parainstructions. And: AK: Fix warnings about unused variables when PARAVIRT is disabled. 
Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/Kconfig.debug | 10 ++ arch/i386/kernel/alternative.c | 63 ++++++++++--- arch/i386/kernel/entry.S | 39 +++++--- arch/i386/kernel/module.c | 11 ++- arch/i386/kernel/paravirt.c | 44 +++++++++ arch/i386/kernel/vmlinux.lds.S | 6 ++ include/asm-i386/alternative.h | 13 ++- include/asm-i386/desc.h | 41 ++++---- include/asm-i386/irqflags.h | 4 +- include/asm-i386/paravirt.h | 189 ++++++++++++++++++++++++++++++------- include/asm-i386/processor.h | 198 +++++++++++++++++++-------------------- include/asm-i386/spinlock.h | 15 ++- include/asm-x86_64/alternative.h | 12 +++ scripts/mod/modpost.c | 2 + 14 files changed, 459 insertions(+), 188 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index b31c0802e1c..f68cc6f215f 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -85,4 +85,14 @@ config DOUBLEFAULT option saves about 4k and might cause you much additional grey hair. +config DEBUG_PARAVIRT + bool "Enable some paravirtualization debugging" + default y + depends on PARAVIRT && DEBUG_KERNEL + help + Currently deliberately clobbers regs which are allowed to be + clobbered in inlined paravirt hooks, even in native mode. + If turning this off solves a problem, then DISABLE_INTERRUPTS() or + ENABLE_INTERRUPTS() is lying about what registers can be clobbered. 
+ endmenu diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 535f9794fba..9eca21b49f6 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void) #endif /* CONFIG_X86_64 */ +static void nop_out(void *insns, unsigned int len) +{ + unsigned char **noptable = find_nop_table(); + + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, noptable[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; @@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[]; void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { - unsigned char **noptable = find_nop_table(); struct alt_instr *a; u8 *instr; - int diff, i, k; + int diff; DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); for (a = start; a < end; a++) { @@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) #endif memcpy(instr, a->replacement, a->replacementlen); diff = a->instrlen - a->replacementlen; - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - memcpy(a->instr + i, noptable[k], k); - } + nop_out(instr + a->replacementlen, diff); } } @@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) { - unsigned char **noptable = find_nop_table(); u8 **ptr; for (ptr = start; ptr < end; ptr++) { @@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end continue; if (*ptr > text_end) continue; - **ptr = 
noptable[1][0]; + nop_out(*ptr, 1); }; } @@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp) #endif +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{ + struct paravirt_patch *p; + + for (p = start; p < end; p++) { + unsigned int used; + + used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, + p->len); +#ifdef CONFIG_DEBUG_PARAVIRT + { + int i; + /* Deliberately clobber regs using "not %reg" to find bugs. */ + for (i = 0; i < 3; i++) { + if (p->len - used >= 2 && (p->clobbers & (1 << i))) { + memcpy(p->instr + used, "\xf7\xd0", 2); + p->instr[used+1] |= i; + used += 2; + } + } + } +#endif + /* Pad the rest with nops */ + nop_out(p->instr + used, p->len - used); + } + + /* Sync to be conservative, in case we patched following instructions */ + sync_core(); +} +extern struct paravirt_patch __start_parainstructions[], + __stop_parainstructions[]; +#endif /* CONFIG_PARAVIRT */ + void __init alternative_instructions(void) { unsigned long flags; @@ -390,5 +430,6 @@ void __init alternative_instructions(void) alternatives_smp_switch(0); } #endif + apply_paravirt(__start_parainstructions, __stop_parainstructions); local_irq_restore(flags); } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d274612e05c..de34b7fed3c 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -53,6 +53,19 @@ #include #include "irq_vectors.h" +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 
+ * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + #define nr_syscalls ((syscall_table_size)/4) CF_MASK = 0x00000001 @@ -63,9 +76,9 @@ NT_MASK = 0x00004000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else -#define preempt_stop +#define preempt_stop(clobbers) #define resume_kernel restore_nocheck #endif @@ -226,7 +239,7 @@ ENTRY(ret_from_fork) ALIGN RING0_PTREGS_FRAME ret_from_exception: - preempt_stop + preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: @@ -237,7 +250,7 @@ check_userspace: jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -248,7 +261,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 
jnz restore_nocheck need_resched: @@ -277,7 +290,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -322,7 +335,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -364,7 +377,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) # store the return value syscall_exit: - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -393,7 +406,7 @@ restore_nocheck_notrace: .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -436,7 +449,7 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_EAX) TRACE_IRQS_OFF lss (%esp), %esp CFI_ADJUST_CFA_OFFSET -8 @@ -451,7 +464,7 @@ work_pending: jz work_notifysig work_resched: call schedule - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -509,7 +522,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -693,7 +706,7 @@ ENTRY(device_not_available) GET_CR0_INTO_EAX testl $0x4, %eax # EM (math 
emulation bit) jne device_not_available_emulate - preempt_stop + preempt_stop(CLBR_ANY) call math_state_restore jmp ret_from_exception device_not_available_emulate: diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c index 470cf97e7cd..d7d9c8b23f7 100644 --- a/arch/i386/kernel/module.c +++ b/arch/i386/kernel/module.c @@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr, lseg, lseg + locks->sh_size, tseg, tseg + text->sh_size); } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return 0; } diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 478192cd4b9..d4646042644 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -45,6 +45,49 @@ char *memory_setup(void) return paravirt_ops.memory_setup(); } +/* Simple instruction patching code. 
*/ +#define DEF_NATIVE(name, code) \ + extern const char start_##name[], end_##name[]; \ + asm("start_" #name ": " code "; end_" #name ":") +DEF_NATIVE(cli, "cli"); +DEF_NATIVE(sti, "sti"); +DEF_NATIVE(popf, "push %eax; popf"); +DEF_NATIVE(pushf, "pushf; pop %eax"); +DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); +DEF_NATIVE(iret, "iret"); +DEF_NATIVE(sti_sysexit, "sti; sysexit"); + +static const struct native_insns +{ + const char *start, *end; +} native_insns[] = { + [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, + [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, + [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, + [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, + [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, + [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, + [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit }, +}; + +static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + unsigned int insn_len; + + /* Don't touch it if we don't have a replacement */ + if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) + return len; + + insn_len = native_insns[type].end - native_insns[type].start; + + /* Similarly if we can't fit replacement. */ + if (len < insn_len) + return len; + + memcpy(insns, native_insns[type].start, insn_len); + return insn_len; +} + static fastcall unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! 
*/ @@ -349,6 +392,7 @@ struct paravirt_ops paravirt_ops = { .paravirt_enabled = 0, .kernel_rpl = 0, + .patch = native_patch, .banner = default_banner, .arch_setup = native_nop, .memory_setup = machine_specific_memory_setup, diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6860f20aa57..5c69cf0e594 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -165,6 +165,12 @@ SECTIONS .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } + . = ALIGN(4); + __start_parainstructions = .; + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + *(.parainstructions) + } + __stop_parainstructions = .; /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h index b01a7ec409c..b8fa9557c53 100644 --- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -4,7 +4,7 @@ #ifdef __KERNEL__ #include - +#include #include struct alt_instr { @@ -118,4 +118,15 @@ static inline void alternatives_smp_switch(int smp) {} #define LOCK_PREFIX "" #endif +struct paravirt_patch; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); +#else +static inline void +apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{} +#define __start_parainstructions NULL +#define __stop_parainstructions NULL +#endif + #endif /* _I386_ALTERNATIVE_H */ diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index f19820f0834..f398cc45644 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -81,31 +81,15 @@ static inline void load_TLS(struct thread_struct *t, unsigned int cpu) #undef C } -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) -{ - __u32 *lp = (__u32 
*)((char *)dt + entry*8); - *lp = entry_a; - *(lp+1) = entry_b; -} - #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) -{ - __u32 a, b; - pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); - write_idt_entry(idt_table, gate, a, b); -} - -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) { - __u32 a, b; - pack_descriptor(&a, &b, (unsigned long)addr, - offsetof(struct tss_struct, __cacheline_filler) - 1, - DESCTYPE_TSS, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); + __u32 *lp = (__u32 *)((char *)dt + entry*8); + *lp = entry_a; + *(lp+1) = entry_b; } #define set_ldt native_set_ldt @@ -128,6 +112,23 @@ static inline fastcall void native_set_ldt(const void *addr, } } +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) +{ + __u32 a, b; + pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); + write_idt_entry(idt_table, gate, a, b); +} + +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) +{ + __u32 a, b; + pack_descriptor(&a, &b, (unsigned long)addr, + offsetof(struct tss_struct, __cacheline_filler) - 1, + DESCTYPE_TSS, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); +} + + #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) #define LDT_entry_a(info) \ diff --git a/include/asm-i386/irqflags.h b/include/asm-i386/irqflags.h index 9ce01f3fb7b..17b18cf4fe9 100644 --- a/include/asm-i386/irqflags.h +++ b/include/asm-i386/irqflags.h @@ -79,8 +79,8 @@ static inline unsigned long __raw_local_irq_save(void) } #else -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti 
+#define DISABLE_INTERRUPTS(clobbers) cli +#define ENABLE_INTERRUPTS(clobbers) sti #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit #define INTERRUPT_RETURN iret #define GET_CR0_INTO_EAX movl %cr0, %eax diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index a7551a44686..081194751ad 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -3,8 +3,26 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ #include +#include #ifdef CONFIG_PARAVIRT +/* These are the most performance critical ops, so we want to be able to patch + * callers */ +#define PARAVIRT_IRQ_DISABLE 0 +#define PARAVIRT_IRQ_ENABLE 1 +#define PARAVIRT_RESTORE_FLAGS 2 +#define PARAVIRT_SAVE_FLAGS 3 +#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4 +#define PARAVIRT_INTERRUPT_RETURN 5 +#define PARAVIRT_STI_SYSEXIT 6 + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0x0 +#define CLBR_EAX 0x1 +#define CLBR_ECX 0x2 +#define CLBR_EDX 0x4 +#define CLBR_ANY 0x7 + #ifndef __ASSEMBLY__ struct thread_struct; struct Xgt_desc_struct; @@ -15,6 +33,15 @@ struct paravirt_ops int paravirt_enabled; const char *name; + /* + * Patch may replace one of the defined code sequences with arbitrary + * code, subject to the same register constraints. This generally + * means the code is not free to clobber any registers other than EAX. + * The patch function should return the number of bytes of code + * generated, as we nop pad the rest in generic code. 
+ */ + unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len); + void (*arch_setup)(void); char *(*memory_setup)(void); void (*init_IRQ)(void); @@ -147,35 +174,6 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, #define read_cr4_safe(x) paravirt_ops.read_cr4_safe() #define write_cr4(x) paravirt_ops.write_cr4(x) -static inline unsigned long __raw_local_save_flags(void) -{ - return paravirt_ops.save_fl(); -} - -static inline void raw_local_irq_restore(unsigned long flags) -{ - return paravirt_ops.restore_fl(flags); -} - -static inline void raw_local_irq_disable(void) -{ - paravirt_ops.irq_disable(); -} - -static inline void raw_local_irq_enable(void) -{ - paravirt_ops.irq_enable(); -} - -static inline unsigned long __raw_local_irq_save(void) -{ - unsigned long flags = paravirt_ops.save_fl(); - - paravirt_ops.irq_disable(); - - return flags; -} - static inline void raw_safe_halt(void) { paravirt_ops.safe_halt(); @@ -267,15 +265,134 @@ static inline void slow_down_io(void) { #endif } -#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax" -#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax" +/* These all sit in the .parainstructions section to tell us what to patch. 
*/ +struct paravirt_patch { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +#define paravirt_alt(insn_string, typenum, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + " .long 771b\n" \ + " .byte " __stringify(typenum) "\n" \ + " .byte 772b-771b\n" \ + " .short " __stringify(clobber) "\n" \ + ".popsection" + +static inline unsigned long __raw_local_save_flags(void) +{ + unsigned long f; + + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1;" + "popl %%edx; popl %%ecx", + PARAVIRT_SAVE_FLAGS, CLBR_NONE) + : "=a"(f): "m"(paravirt_ops.save_fl) + : "memory", "cc"); + return f; +} + +static inline void raw_local_irq_restore(unsigned long f) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1;" + "popl %%edx; popl %%ecx", + PARAVIRT_RESTORE_FLAGS, CLBR_EAX) + : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f) + : "memory", "cc"); +} + +static inline void raw_local_irq_disable(void) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%0;" + "popl %%edx; popl %%ecx", + PARAVIRT_IRQ_DISABLE, CLBR_EAX) + : : "m" (paravirt_ops.irq_disable) + : "memory", "eax", "cc"); +} + +static inline void raw_local_irq_enable(void) +{ + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%0;" + "popl %%edx; popl %%ecx", + PARAVIRT_IRQ_ENABLE, CLBR_EAX) + : : "m" (paravirt_ops.irq_enable) + : "memory", "eax", "cc"); +} + +static inline unsigned long __raw_local_irq_save(void) +{ + unsigned long f; + + __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;" + "call *%1; pushl %%eax;" + "call *%2; popl %%eax;" + "popl %%edx; popl %%ecx", + PARAVIRT_SAVE_FLAGS_IRQ_DISABLE, + CLBR_NONE) + : "=a"(f) + : "m" (paravirt_ops.save_fl), + "m" (paravirt_ops.irq_disable) + : "memory", "cc"); + return f; +} + 
+#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ + "call *paravirt_ops+%c[irq_disable];" \ + "popl %%edx; popl %%ecx", \ + PARAVIRT_IRQ_DISABLE, CLBR_EAX) + +#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \ + "call *paravirt_ops+%c[irq_enable];" \ + "popl %%edx; popl %%ecx", \ + PARAVIRT_IRQ_ENABLE, CLBR_EAX) +#define CLI_STI_CLOBBERS , "%eax" +#define CLI_STI_INPUT_ARGS \ + , \ + [irq_disable] "i" (offsetof(struct paravirt_ops, irq_disable)), \ + [irq_enable] "i" (offsetof(struct paravirt_ops, irq_enable)) + #else /* __ASSEMBLY__ */ -#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret -#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax -#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax -#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit -#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0 +#define PARA_PATCH(ptype, clobbers, ops) \ +771:; \ + ops; \ +772:; \ + .pushsection .parainstructions,"a"; \ + .long 771b; \ + .byte ptype; \ + .byte 772b-771b; \ + .short clobbers; \ + .popsection + +#define INTERRUPT_RETURN \ + PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_iret) + +#define DISABLE_INTERRUPTS(clobbers) \ + PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \ + pushl %ecx; pushl %edx; \ + call *paravirt_ops+PARAVIRT_irq_disable; \ + popl %edx; popl %ecx) \ + +#define ENABLE_INTERRUPTS(clobbers) \ + PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \ + pushl %ecx; pushl %edx; \ + call *%cs:paravirt_ops+PARAVIRT_irq_enable; \ + popl %edx; popl %ecx) + +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY, \ + jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit) + +#define GET_CR0_INTO_EAX \ + call *paravirt_ops+PARAVIRT_read_cr0 + #endif /* __ASSEMBLY__ */ #endif /* 
CONFIG_PARAVIRT */ #endif /* __ASM_PARAVIRT_H */ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 6c2c4457be0..5f0418d0078 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -156,59 +156,6 @@ static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx, : "0" (*eax), "2" (*ecx)); } -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = 0; - __cpuid(eax, ebx, ecx, edx); -} - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) -{ - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return eax; -} -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return ebx; -} -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return ecx; -} -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return edx; -} - #define load_cr3(pgdir) write_cr3(__pa(pgdir)) /* @@ -491,22 +438,6 @@ struct thread_struct { .io_bitmap = { [ 0 ... 
IO_BITMAP_LONGS] = ~0 }, \ } -#ifdef CONFIG_PARAVIRT -#include -#else -#define paravirt_enabled() 0 -#define __cpuid native_cpuid - -static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) -{ - tss->esp0 = thread->esp0; - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -} - #define start_thread(regs, new_eip, new_esp) do { \ __asm__("movl %0,%%fs": :"r" (0)); \ regs->xgs = 0; \ @@ -519,36 +450,6 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa regs->esp = new_esp; \ } while (0) -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - __asm__("movl %%db" #register ", %0" \ - :"=r" (var)) -#define set_debugreg(value, register) \ - __asm__("movl %0,%%db" #register \ - : /* no output */ \ - :"r" (value)) - -#define set_iopl_mask native_set_iopl_mask -#endif /* CONFIG_PARAVIRT */ - -/* - * Set IOPL bits in EFLAGS from given mask - */ -static fastcall inline void native_set_iopl_mask(unsigned mask) -{ - unsigned int reg; - __asm__ __volatile__ ("pushfl;" - "popl %0;" - "andl %1, %0;" - "orl %2, %0;" - "pushl %0;" - "popfl" - : "=&r" (reg) - : "i" (~X86_EFLAGS_IOPL), "r" (mask)); -} - /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; @@ -640,6 +541,105 @@ static inline void rep_nop(void) #define cpu_relax() rep_nop() +#ifdef CONFIG_PARAVIRT +#include +#else +#define paravirt_enabled() 0 +#define __cpuid native_cpuid + +static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); 
+ } +} + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + __asm__("movl %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movl %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + +#define set_iopl_mask native_set_iopl_mask +#endif /* CONFIG_PARAVIRT */ + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static fastcall inline void native_set_iopl_mask(unsigned mask) +{ + unsigned int reg; + __asm__ __volatile__ ("pushfl;" + "popl %0;" + "andl %1, %0;" + "orl %2, %0;" + "pushl %0;" + "popfl" + : "=&r" (reg) + : "i" (~X86_EFLAGS_IOPL), "r" (mask)); +} + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, + int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return eax; +} +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ebx; +} +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ecx; +} +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return edx; +} + /* generic versions from gas */ #define GENERIC_NOP1 ".byte 0x90\n" #define GENERIC_NOP2 ".byte 
0x89,0xf6\n" diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h index dea60709db2..d3bcebed60c 100644 --- a/include/asm-i386/spinlock.h +++ b/include/asm-i386/spinlock.h @@ -12,6 +12,8 @@ #else #define CLI_STRING "cli" #define STI_STRING "sti" +#define CLI_STI_CLOBBERS +#define CLI_STI_INPUT_ARGS #endif /* CONFIG_PARAVIRT */ /* @@ -57,25 +59,28 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla { asm volatile( "\n1:\t" - LOCK_PREFIX " ; decb %0\n\t" + LOCK_PREFIX " ; decb %[slock]\n\t" "jns 5f\n" "2:\t" - "testl $0x200, %1\n\t" + "testl $0x200, %[flags]\n\t" "jz 4f\n\t" STI_STRING "\n" "3:\t" "rep;nop\n\t" - "cmpb $0, %0\n\t" + "cmpb $0, %[slock]\n\t" "jle 3b\n\t" CLI_STRING "\n\t" "jmp 1b\n" "4:\t" "rep;nop\n\t" - "cmpb $0, %0\n\t" + "cmpb $0, %[slock]\n\t" "jg 1b\n\t" "jmp 4b\n" "5:\n\t" - : "+m" (lock->slock) : "r" (flags) : "memory"); + : [slock] "+m" (lock->slock) + : [flags] "r" (flags) + CLI_STI_INPUT_ARGS + : "memory" CLI_STI_CLOBBERS); } #endif diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h index a584826cc57..a6657b4f3e0 100644 --- a/include/asm-x86_64/alternative.h +++ b/include/asm-x86_64/alternative.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include +#include #include struct alt_instr { @@ -133,4 +134,15 @@ static inline void alternatives_smp_switch(int smp) {} #define LOCK_PREFIX "" #endif +struct paravirt_patch; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end); +#else +static inline void +apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{} +#define __start_parainstructions NULL +#define __stop_parainstructions NULL +#endif + #endif /* _X86_64_ALTERNATIVE_H */ diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 2e114162314..ac0a5822299 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -911,6 +911,7 @@ static int init_section_ref_ok(const char *name) ".toc1", /* used 
by ppc64 */ ".stab", ".rodata", + ".parainstructions", ".text.lock", "__bug_table", /* used by powerpc for BUG() */ ".pci_fixup_header", @@ -931,6 +932,7 @@ static int init_section_ref_ok(const char *name) ".altinstructions", ".eh_frame", ".debug", + ".parainstructions", NULL }; /* part of section name */ -- cgit v1.2.3-70-g09d2 From d7cd56111f30259e1b532a12e06f59f8e0a20355 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] i386: cpu_detect extraction Both lhype and Xen want to call the core of the x86 cpu detect code before calling start_kernel. (extracted from larger patch) AK: folded in start_kernel header patch Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 37 +++++++++++++++++++++---------------- include/asm-i386/processor.h | 3 +++ include/linux/start_kernel.h | 12 ++++++++++++ init/main.c | 1 + 4 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 include/linux/start_kernel.h (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index cda41aef79a..68bcb687019 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -236,29 +236,14 @@ static int __cpuinit have_cpuid_p(void) return flag_is_changeable_p(X86_EFLAGS_ID); } -/* Do minimum CPU detection early. - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. - The others are not touched to avoid unwanted side effects. - - WARNING: this function is only called on the BP. Don't add code here - that is supposed to run on all CPUs. 
*/ -static void __init early_cpu_detect(void) +void __init cpu_detect(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; - - if (!have_cpuid_p()) - return; - /* Get vendor name */ cpuid(0x00000000, &c->cpuid_level, (int *)&c->x86_vendor_id[0], (int *)&c->x86_vendor_id[8], (int *)&c->x86_vendor_id[4]); - get_cpu_vendor(c, 1); - c->x86 = 4; if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; @@ -275,6 +260,26 @@ static void __init early_cpu_detect(void) } } +/* Do minimum CPU detection early. + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. + The others are not touched to avoid unwanted side effects. + + WARNING: this function is only called on the BP. Don't add code here + that is supposed to run on all CPUs. */ +static void __init early_cpu_detect(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + c->x86_cache_alignment = 32; + + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c, 1); +} + static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 5f0418d0078..a52d6544042 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -20,6 +20,7 @@ #include #include #include +#include /* flag for disabling the tsc */ extern int tsc_disable; @@ -112,6 +113,8 @@ extern struct cpuinfo_x86 cpu_data[]; extern int cpu_llc_id[NR_CPUS]; extern char ignore_fpu_irq; +void __init cpu_detect(struct cpuinfo_x86 *c); + extern void identify_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h new file mode 100644 index 00000000000..d3e5f275654 --- /dev/null +++ b/include/linux/start_kernel.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_START_KERNEL_H +#define _LINUX_START_KERNEL_H + +#include 
+#include + +/* Define the prototype for start_kernel here, rather than cluttering + up something else. */ + +extern asmlinkage void __init start_kernel(void); + +#endif /* _LINUX_START_KERNEL_H */ diff --git a/init/main.c b/init/main.c index 36f608a7cfb..985c9ed8605 100644 --- a/init/main.c +++ b/init/main.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From c9ccf30d77f04064fe5436027ab9d2230c7cdd94 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add startup infrastructure for paravirtualization 1) Each hypervisor writes a probe function to detect whether we are running under that hypervisor. paravirt_probe() registers this function. 2) If vmlinux is booted with ring != 0, we call all the probe functions (with registers except %esp intact) in link order: the winner will not return. Signed-off-by: Rusty Russell Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/Makefile | 2 ++ arch/i386/kernel/head.S | 33 +++++++++++++++++++++++++++++++++ arch/i386/kernel/paravirt.c | 4 ++++ arch/i386/kernel/vmlinux.lds.S | 6 ++++++ include/asm-i386/paravirt.h | 5 +++++ 5 files changed, 50 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 40661213604..1e8988e558c 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,6 +39,8 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o + +# Make sure this is linked after any other paravirt_ops structs: see head.S obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 5b14e95ac8b..edef5084ce1 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -55,6 +55,12 
@@ */ ENTRY(startup_32) +#ifdef CONFIG_PARAVIRT + movl %cs, %eax + testl $0x3, %eax + jnz startup_paravirt +#endif + /* * Set segments to known values. */ @@ -486,6 +492,33 @@ ignore_int: #endif iret +#ifdef CONFIG_PARAVIRT +startup_paravirt: + cld + movl $(init_thread_union+THREAD_SIZE),%esp + + /* We take pains to preserve all the regs. */ + pushl %edx + pushl %ecx + pushl %eax + + /* paravirt.o is last in link, and that probe fn never returns */ + pushl $__start_paravirtprobe +1: + movl 0(%esp), %eax + pushl (%eax) + movl 8(%esp), %eax + call *(%esp) + popl %eax + + movl 4(%esp), %eax + movl 8(%esp), %ecx + movl 12(%esp), %edx + + addl $4, (%esp) + jmp 1b +#endif + /* * Real beginning of normal "text" segment */ diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index d4646042644..5a9bd3250a1 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -387,6 +388,9 @@ static int __init print_banner(void) } core_initcall(print_banner); +/* We simply declare start_kernel to be the paravirt probe of last resort. */ +paravirt_probe(start_kernel); + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 5c69cf0e594..877dc5cfe3a 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -65,6 +65,12 @@ SECTIONS CONSTRUCTORS } :data + __start_paravirtprobe = .; + .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) { + *(.paravirtprobe) + } + __stop_paravirtprobe = .; + . 
= ALIGN(4096); .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { __nosave_begin = .; diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 081194751ad..dd707d8c827 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -120,6 +120,11 @@ struct paravirt_ops void (fastcall *iret)(void); }; +/* Mark a paravirt probe function. */ +#define paravirt_probe(fn) \ + static asmlinkage void (*__paravirtprobe_##fn)(void) __attribute_used__ \ + __attribute__((__section__(".paravirtprobe"))) = fn + extern struct paravirt_ops paravirt_ops; #define paravirt_enabled() (paravirt_ops.paravirt_enabled) -- cgit v1.2.3-70-g09d2 From 4f205fd45a5c192907188d6f8f6d7e66db859248 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Allow selected bug checks to be skipped Allow selected bug checks to be skipped by paravirt kernels. The two most important are the F00F workaround (which is either done by the hypervisor, or not required), and the 'hlt' instruction check, which can break under some hypervisors. Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/intel.c | 2 +- include/asm-i386/bugs.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 798c2f617e8..3ae795e9056 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) * Note that the workaround only should be initialized once... 
*/ c->f00f_bug = 0; - if ( c->x86 == 5 ) { + if (!paravirt_enabled() && c->x86 == 5) { static int f00f_workaround_enabled = 0; c->f00f_bug = 1; diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index 592ffeeda45..38f1aebbbdb 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -21,6 +21,7 @@ #include #include #include +#include static int __init no_halt(char *s) { @@ -91,6 +92,9 @@ static void __init check_fpu(void) static void __init check_hlt(void) { + if (paravirt_enabled()) + return; + printk(KERN_INFO "Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { printk("disabled\n"); -- cgit v1.2.3-70-g09d2 From 3bbf54725467d604698721384d858b5983b87e8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Disable vdso by default when CONFIG_PARAVIRT is enabled They don't work together and this way even glibc still works. Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 713ba39d32c..92849c7def5 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -27,7 +27,11 @@ * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ +#ifdef CONFIG_PARAVIRT +unsigned int __read_mostly vdso_enabled = 0; +#else unsigned int __read_mostly vdso_enabled = 1; +#endif EXPORT_SYMBOL_GPL(vdso_enabled); -- cgit v1.2.3-70-g09d2 From 6020c8f315709a508b027ef6749e85b125190947 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Allow disable power management under hypervisor Two legacy power management modes are much easier to just explicitly disable when running in paravirtualized mode - neither APM nor PnP is still relevant. 
The status of ACPI is still debatable, and noacpi is still a common enough boot parameter that it is not necessary to explicitly disable ACPI. Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/apm.c | 3 ++- drivers/pnp/pnpbios/core.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index a60358fe9a4..a97847da9ed 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -231,6 +231,7 @@ #include #include #include +#include #include "io_ports.h" @@ -2235,7 +2236,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0) { + if (apm_info.bios.version == 0 || paravirt_enabled()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 81a6c83d89a..80066ad792f 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -530,7 +530,8 @@ static int __init pnpbios_init(void) if (check_legacy_ioport(PNPBIOS_BASE)) return -ENODEV; #endif - if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table)) { + if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) || + paravirt_enabled()) { printk(KERN_INFO "PnPBIOS: Disabled\n"); return -ENODEV; } -- cgit v1.2.3-70-g09d2 From 13623d79309dd82e1964458fa017979d16f33fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add APIC accessors to paravirt-ops. Add APIC accessors to paravirt-ops. Unfortunately, we need two write functions, as some older broken hardware requires workarounds for Pentium APIC errata - this is the purpose of apic_write_atomic. 
AK: replaced __inline with inline Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 8 ++++++++ include/asm-i386/apic.h | 15 ++++++++++++--- include/asm-i386/paravirt.h | 27 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 5a9bd3250a1..fe82eb3adf4 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include /* nop stub */ static void native_nop(void) @@ -446,6 +448,12 @@ struct paravirt_ops paravirt_ops = { .io_delay = native_io_delay, .const_udelay = __const_udelay, +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = native_apic_write, + .apic_write_atomic = native_apic_write_atomic, + .apic_read = native_apic_read, +#endif + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index b9529578fc3..41a44319905 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h @@ -37,18 +37,27 @@ extern void generic_apic_probe(void); /* * Basic functions accessing APICs. 
*/ +#ifdef CONFIG_PARAVIRT +#include +#else +#define apic_write native_apic_write +#define apic_write_atomic native_apic_write_atomic +#define apic_read native_apic_read +#endif -static __inline void apic_write(unsigned long reg, unsigned long v) +static __inline fastcall void native_apic_write(unsigned long reg, + unsigned long v) { *((volatile unsigned long *)(APIC_BASE+reg)) = v; } -static __inline void apic_write_atomic(unsigned long reg, unsigned long v) +static __inline fastcall void native_apic_write_atomic(unsigned long reg, + unsigned long v) { xchg((volatile unsigned long *)(APIC_BASE+reg), v); } -static __inline unsigned long apic_read(unsigned long reg) +static __inline fastcall unsigned long native_apic_read(unsigned long reg) { return *((volatile unsigned long *)(APIC_BASE+reg)); } diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index dd707d8c827..e2c803fadb1 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -115,6 +115,12 @@ struct paravirt_ops void (fastcall *io_delay)(void); void (*const_udelay)(unsigned long loops); +#ifdef CONFIG_X86_LOCAL_APIC + void (fastcall *apic_write)(unsigned long reg, unsigned long v); + void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v); + unsigned long (fastcall *apic_read)(unsigned long reg); +#endif + /* These two are jmp to, not actually called. */ void (fastcall *irq_enable_sysexit)(void); void (fastcall *iret)(void); @@ -270,6 +276,27 @@ static inline void slow_down_io(void) { #endif } +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Basic functions accessing APICs. 
+ */ +static inline void apic_write(unsigned long reg, unsigned long v) +{ + paravirt_ops.apic_write(reg,v); +} + +static inline void apic_write_atomic(unsigned long reg, unsigned long v) +{ + paravirt_ops.apic_write_atomic(reg,v); +} + +static inline unsigned long apic_read(unsigned long reg) +{ + return paravirt_ops.apic_read(reg); +} +#endif + + /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch { u8 *instr; /* original instructions */ -- cgit v1.2.3-70-g09d2 From da181a8b3916aa7f2e3c5775d2bd2fe3454cf82d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add MMU virtualization to paravirt_ops Add the three bare TLB accessor functions to paravirt-ops. Most amusingly, flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb. Instead, I chose to indicate the actual flush type, kernel (global) vs. user (non-global). Global in this sense means using the global bit in the page table entry, which makes TLB entries persistent across CR3 reloads, not global as in the SMP sense of invoking remote shootdowns, so the term is confusingly overloaded. 
AK: folded in fix from Zach for PAE compilation Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 109 ++++++++++++++++++++++++++++++++++++++ arch/i386/mm/boot_ioremap.c | 1 + include/asm-i386/paravirt.h | 75 ++++++++++++++++++++++++++ include/asm-i386/pgtable-2level.h | 5 +- include/asm-i386/pgtable-3level.h | 40 +++++++------- include/asm-i386/pgtable.h | 4 +- include/asm-i386/tlbflush.h | 18 +++++-- 7 files changed, 226 insertions(+), 26 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index fe82eb3adf4..3dceab5828f 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -31,6 +31,7 @@ #include #include #include +#include /* nop stub */ static void native_nop(void) @@ -379,6 +380,97 @@ static fastcall void native_io_delay(void) asm volatile("outb %al,$0x80"); } +static fastcall void native_flush_tlb(void) +{ + __native_flush_tlb(); +} + +/* + * Global pages have to be flushed a bit differently. Not a real + * performance problem because this does not happen often. 
+ */ +static fastcall void native_flush_tlb_global(void) +{ + __native_flush_tlb_global(); +} + +static fastcall void native_flush_tlb_single(u32 addr) +{ + __native_flush_tlb_single(addr); +} + +#ifndef CONFIG_X86_PAE +static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; +} + +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; +} + +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + *pmdp = pmdval; +} + +#else /* CONFIG_X86_PAE */ + +static fastcall void native_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + set_64bit((unsigned long long *)ptep,pte_val(pteval)); +} + +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); +} + +static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) +{ + *pudp = pudval; +} + +static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +static fastcall void native_pmd_clear(pmd_t *pmd) +{ + u32 *tmp = (u32 *)pmd; + *tmp = 0; + smp_wmb(); + *(tmp + 1) = 0; +} +#endif /* CONFIG_X86_PAE */ + /* These are in entry.S */ extern fastcall void native_iret(void); extern fastcall void native_irq_enable_sysexit(void); @@ -454,6 +546,23 @@ struct paravirt_ops paravirt_ops = { .apic_read = native_apic_read, #endif + 
.flush_tlb_user = native_flush_tlb, + .flush_tlb_kernel = native_flush_tlb_global, + .flush_tlb_single = native_flush_tlb_single, + + .set_pte = native_set_pte, + .set_pte_at = native_set_pte_at, + .set_pmd = native_set_pmd, + .pte_update = (void *)native_nop, + .pte_update_defer = (void *)native_nop, +#ifdef CONFIG_X86_PAE + .set_pte_atomic = native_set_pte_atomic, + .set_pte_present = native_set_pte_present, + .set_pud = native_set_pud, + .pte_clear = native_pte_clear, + .pmd_clear = native_pmd_clear, +#endif + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c index 4de11f508c3..4de95a17a7d 100644 --- a/arch/i386/mm/boot_ioremap.c +++ b/arch/i386/mm/boot_ioremap.c @@ -16,6 +16,7 @@ */ #undef CONFIG_X86_PAE +#undef CONFIG_PARAVIRT #include #include #include diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index e2c803fadb1..9f06265065f 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -4,6 +4,7 @@ * para-virtualization: those hooks are defined here. 
*/ #include #include +#include #ifdef CONFIG_PARAVIRT /* These are the most performance critical ops, so we want to be able to patch @@ -27,6 +28,7 @@ struct thread_struct; struct Xgt_desc_struct; struct tss_struct; +struct mm_struct; struct paravirt_ops { unsigned int kernel_rpl; @@ -121,6 +123,23 @@ struct paravirt_ops unsigned long (fastcall *apic_read)(unsigned long reg); #endif + void (fastcall *flush_tlb_user)(void); + void (fastcall *flush_tlb_kernel)(void); + void (fastcall *flush_tlb_single)(u32 addr); + + void (fastcall *set_pte)(pte_t *ptep, pte_t pteval); + void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); + void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep); + void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep); +#ifdef CONFIG_X86_PAE + void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); + void (fastcall *set_pud)(pud_t *pudp, pud_t pudval); + void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (fastcall *pmd_clear)(pmd_t *pmdp); +#endif + /* These two are jmp to, not actually called. 
*/ void (fastcall *irq_enable_sysexit)(void); void (fastcall *iret)(void); @@ -297,6 +316,62 @@ static inline unsigned long apic_read(unsigned long reg) #endif +#define __flush_tlb() paravirt_ops.flush_tlb_user() +#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() +#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr) + +static inline void set_pte(pte_t *ptep, pte_t pteval) +{ + paravirt_ops.set_pte(ptep, pteval); +} + +static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +{ + paravirt_ops.set_pte_at(mm, addr, ptep, pteval); +} + +static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + paravirt_ops.set_pmd(pmdp, pmdval); +} + +static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + paravirt_ops.pte_update(mm, addr, ptep); +} + +static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + paravirt_ops.pte_update_defer(mm, addr, ptep); +} + +#ifdef CONFIG_X86_PAE +static inline void set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + paravirt_ops.set_pte_atomic(ptep, pteval); +} + +static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + paravirt_ops.set_pte_present(mm, addr, ptep, pte); +} + +static inline void set_pud(pud_t *pudp, pud_t pudval) +{ + paravirt_ops.set_pud(pudp, pudval); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + paravirt_ops.pte_clear(mm, addr, ptep); +} + +static inline void pmd_clear(pmd_t *pmdp) +{ + paravirt_ops.pmd_clear(pmdp); +} +#endif + /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch { u8 *instr; /* original instructions */ diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 8d8d3b9ecdb..04d6186abc2 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -13,11 +13,14 @@ * within a page table are directly modified. 
Thus, the following * hook is made available. */ +#ifndef CONFIG_PARAVIRT #define set_pte(pteptr, pteval) (*(pteptr) = pteval) #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) +#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) +#endif + #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) #define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval) -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index c2d701ea35b..2a6e67db8bc 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -44,6 +44,7 @@ static inline int pte_exec_kernel(pte_t pte) return pte_x(pte); } +#ifndef CONFIG_PARAVIRT /* Rules for using set_pte: the pte being assigned *must* be * either not present or in a state where the hardware will * not attempt to update the pte. In places where this is @@ -80,25 +81,6 @@ static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte #define set_pud(pudptr,pudval) \ (*(pudptr) = (pudval)) -/* - * Pentium-II erratum A13: in PAE mode we explicitly have to flush - * the TLB via cr3 if the top-level pgd is changed... - * We do not let the generic code free and clear pgd entries due to - * this erratum. - */ -static inline void pud_clear (pud_t * pud) { } - -#define pud_page(pud) \ -((struct page *) __va(pud_val(pud) & PAGE_MASK)) - -#define pud_page_vaddr(pud) \ -((unsigned long) __va(pud_val(pud) & PAGE_MASK)) - - -/* Find an entry in the second-level page table.. 
*/ -#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ - pmd_index(address)) - /* * For PTEs and PDEs, we must clear the P-bit first when clearing a page table * entry, so clear the bottom half first and enforce ordering with a compiler @@ -118,6 +100,26 @@ static inline void pmd_clear(pmd_t *pmd) smp_wmb(); *(tmp + 1) = 0; } +#endif + +/* + * Pentium-II erratum A13: in PAE mode we explicitly have to flush + * the TLB via cr3 if the top-level pgd is changed... + * We do not let the generic code free and clear pgd entries due to + * this erratum. + */ +static inline void pud_clear (pud_t * pud) { } + +#define pud_page(pud) \ +((struct page *) __va(pud_val(pud) & PAGE_MASK)) + +#define pud_page_vaddr(pud) \ +((unsigned long) __va(pud_val(pud) & PAGE_MASK)) + + +/* Find an entry in the second-level page table.. */ +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ + pmd_index(address)) #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 7d398f493dd..efd7d90789d 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -15,6 +15,7 @@ #include #include #include +#include #ifndef _I386_BITOPS_H #include @@ -246,6 +247,7 @@ static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return p # include #endif +#ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which * has not been done using the set_pte / clear_pte interfaces. 
It is used by @@ -261,7 +263,7 @@ static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return p */ #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) - +#endif /* * We only update the dirty/accessed state if we set diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h index 360648b0f2b..4dd82840d53 100644 --- a/include/asm-i386/tlbflush.h +++ b/include/asm-i386/tlbflush.h @@ -4,7 +4,15 @@ #include #include -#define __flush_tlb() \ +#ifdef CONFIG_PARAVIRT +#include +#else +#define __flush_tlb() __native_flush_tlb() +#define __flush_tlb_global() __native_flush_tlb_global() +#define __flush_tlb_single(addr) __native_flush_tlb_single(addr) +#endif + +#define __native_flush_tlb() \ do { \ unsigned int tmpreg; \ \ @@ -19,7 +27,7 @@ * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. */ -#define __flush_tlb_global() \ +#define __native_flush_tlb_global() \ do { \ unsigned int tmpreg, cr4, cr4_orig; \ \ @@ -36,6 +44,9 @@ : "memory"); \ } while (0) +#define __native_flush_tlb_single(addr) \ + __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory") + # define __flush_tlb_all() \ do { \ if (cpu_has_pge) \ @@ -46,9 +57,6 @@ #define cpu_has_invlpg (boot_cpu_data.x86 > 3) -#define __flush_tlb_single(addr) \ - __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory") - #ifdef CONFIG_X86_INVLPG # define __flush_tlb_one(addr) __flush_tlb_single(addr) #else -- cgit v1.2.3-70-g09d2 From bd472c794bbf6771c3fc1c58f188bc16c393d2fe Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Be careful about touching BIOS address space BIOS ROM areas may not be mapped into the guest address space, so be careful when touching those addresses to make sure they appear to be mapped. 
[akpm@osdl.org: fix unused var warning] AK: Changed __get_user to probe_kernel_address Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/e820.c | 10 +++++++++- arch/i386/pci/pcbios.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index b755255f272..b704790f796 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -155,7 +156,14 @@ static struct resource standard_io_resources[] = { { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) +static int romsignature(const unsigned char *x) +{ + unsigned short sig; + int ret = 0; + if (probe_kernel_address((const unsigned short *)x, sig) == 0) + ret = (sig == 0xaa55); + return ret; +} static int __init romchecksum(unsigned char *rom, unsigned long length) { diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c index ed1512a175a..5f5193401be 100644 --- a/arch/i386/pci/pcbios.c +++ b/arch/i386/pci/pcbios.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "pci.h" #include "pci-functions.h" @@ -314,6 +315,10 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) for (check = (union bios32 *) __va(0xe0000); check <= (union bios32 *) __va(0xffff0); ++check) { + long sig; + if (probe_kernel_address(&check->fields.signature, sig)) + continue; + if (check->fields.signature != BIOS32_SIGNATURE) continue; length = check->fields.length * 16; @@ -331,11 +336,13 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) } DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check); if (check->fields.entry >= 0x100000) { - printk("PCI: BIOS32 entry (0x%p) in high memory, cannot use.\n", check); + printk("PCI: BIOS32 entry (0x%p) 
in high memory, " + "cannot use.\n", check); return NULL; } else { unsigned long bios32_entry = check->fields.entry; - DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); + DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", + bios32_entry); bios32_indirect.address = bios32_entry + PAGE_OFFSET; if (check_pcibios()) return &pci_bios_access; -- cgit v1.2.3-70-g09d2 From 8542b200cbe5609edd7aae0c304c091a1c290452 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] paravirt: Add option to allow skipping the timer check Add a way to disable the timer IRQ routing check via a boot option. The VMI timer code uses this to avoid triggering the pester Mingo code, which probes for some very unusual and broken motherboard routings. It fires 100% of the time when using a paravirtual delay mechanism instead of using a realtime delay, since there is no elapsed real time, and the 4 timer IRQs have not yet been delivered. In addition, it is entirely possible, though improbable, that this bug could surface on real hardware which picks a particularly bad time to enter SMM mode, causing a long latency during one of the timer IRQs. While here, make check_timer be __init. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen [chrisw: use no_timer_check to bring inline with x86_64 as per Andi's request] Signed-off-by: Chris Wright Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/kernel-parameters.txt | 7 +++++-- arch/i386/kernel/io_apic.c | 16 ++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/i386/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2e1898e4e8f..4e90aa427ae 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -599,8 +599,6 @@ and is between 256 and 4096 characters. It is defined in the file hugepages= [HW,IA-32,IA-64] Maximal number of HugeTLB pages. 
- noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing - i8042.direct [HW] Put keyboard port into non-translated mode i8042.dumbkbd [HW] Pretend that controller can only read data from keyboard and cannot control its state @@ -1052,9 +1050,14 @@ and is between 256 and 4096 characters. It is defined in the file in certain environments such as networked servers or real-time systems. + noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing + noirqdebug [IA-32] Disables the code which attempts to detect and disable unhandled interrupt sources. + no_timer_check [IA-32,X86_64,APIC] Disables the code which tests for + broken timer IRQ sources. + noisapnp [ISAPNP] Disables ISA PnP code. noinitrd [RAM] Tells the kernel not to load any configured diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index e33b7a84529..993150f206e 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1932,6 +1932,15 @@ static void __init setup_ioapic_ids_from_mpc(void) static void __init setup_ioapic_ids_from_mpc(void) { } #endif +static int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1940,10 +1949,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { } * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ -static int __init timer_irq_works(void) +int __init timer_irq_works(void) { unsigned long t1 = jiffies; + if (no_timer_check) + return 1; + local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); @@ -2214,7 +2226,7 @@ int timer_uses_ioapic_pin_0; * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. 
*/ -static inline void check_timer(void) +static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector; -- cgit v1.2.3-70-g09d2 From c6ea396de6836bdeb2d2433368130642bf0f6e15 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: Don't touch per cpu memory of offline CPUs in touch_nmi_watchdog Just like on x86-64, don't touch foreign CPUs' memory if the watchdog isn't enabled at all. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 171194ccb7b..f5bc7e1be80 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -870,14 +870,16 @@ static unsigned int void touch_nmi_watchdog (void) { - int i; + if (nmi_watchdog > 0) { + unsigned cpu; - /* - * Just reset the alert counters, (other CPUs might be - * spinning on locks we hold): - */ - for_each_possible_cpu(i) - alert_counter[i] = 0; + /* + * Just reset the alert counters, (other CPUs might be + * spinning on locks we hold): + */ + for_each_present_cpu (cpu) + alert_counter[cpu] = 0; + } /* * Tickle the softlockup detector too: -- cgit v1.2.3-70-g09d2 From 475850c86b908ae026d5a4be02a1b1e9c408c75a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: conditionalize inclusion of some MTRR flavors Avoid inclusion of code that's dead for x86-64. 
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/Makefile | 4 +--- arch/i386/kernel/cpu/mtrr/main.c | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile index a25b701ab84..191fc053364 100644 --- a/arch/i386/kernel/cpu/mtrr/Makefile +++ b/arch/i386/kernel/cpu/mtrr/Makefile @@ -1,5 +1,3 @@ obj-y := main.o if.o generic.o state.o -obj-y += amd.o -obj-y += cyrix.o -obj-y += centaur.o +obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 2b8b0b361cc..a4de30b9d3d 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); +#ifndef CONFIG_X86_64 extern int arr3_protected; +#else +#define arr3_protected 0 +#endif void set_mtrr_ops(struct mtrr_ops * ops) { @@ -544,9 +548,11 @@ extern void centaur_init_mtrr(void); static void __init init_ifs(void) { +#ifndef CONFIG_X86_64 amd_init_mtrr(); cyrix_init_mtrr(); centaur_init_mtrr(); +#endif } /* The suspend/resume methods are only for CPU without MTRR. CPU using generic -- cgit v1.2.3-70-g09d2 From 365bff806e9faba000fb4956c7486fbf3a746d96 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: fix MTRR code Until not so long ago, there were system log messages pointing to inconsistent MTRR setup of the video frame buffer caused by the way vesafb and X worked. While vesafb was fixed meanwhile, I believe fixing it there only hides a shortcoming in the MTRR code itself, in that that code is not symmetric with respect to the ordering of attempts to set up two (or more) regions where one contains the other. 
In the current shape, it permits only setting up sub-regions of pre-existing ones. The patch below makes this symmetric. While working on that I noticed a few more inconsistencies in that code, namely - use of 'unsigned int' for sizes in many, but not all places (the patch is converting this to use 'unsigned long' everywhere, which specifically might be necessary for x86-64 once a processor supporting more than 44 physical address bits would become available) - the code to correct inconsistent settings during secondary processor startup tried (if necessary) to correct, among other things, the value in IA32_MTRR_DEF_TYPE, however the newly computed value would never get used (i.e. stored in the respective MSR) - the generic range validation code checked that the end of the to-be-added range would be above 1MB; the value checked should have been the start of the range - when contained regions are detected, previously this was allowed only when the old region was uncacheable; this can be symmetric (i.e.
the new region can also be uncacheable) and even further as per Intel's documentation write-trough and write-back for either region is also compatible with the respective opposite in the other Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/amd.c | 2 +- arch/i386/kernel/cpu/mtrr/centaur.c | 9 +++-- arch/i386/kernel/cpu/mtrr/cyrix.c | 25 +++++++++--- arch/i386/kernel/cpu/mtrr/generic.c | 78 +++++++++++++++++++++++++++++++------ arch/i386/kernel/cpu/mtrr/if.c | 28 +++++++------ arch/i386/kernel/cpu/mtrr/main.c | 55 +++++++++++++++++++------- arch/i386/kernel/cpu/mtrr/mtrr.h | 25 ++++++------ 7 files changed, 162 insertions(+), 60 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c index 1a1e04b6fd0..0949cdbf848 100644 --- a/arch/i386/kernel/cpu/mtrr/amd.c +++ b/arch/i386/kernel/cpu/mtrr/amd.c @@ -7,7 +7,7 @@ static void amd_get_mtrr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type * type) { unsigned long low, high; diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c index 33f00ac314e..cb9aa3a7a7a 100644 --- a/arch/i386/kernel/cpu/mtrr/centaur.c +++ b/arch/i386/kernel/cpu/mtrr/centaur.c @@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ */ static int -centaur_get_free_region(unsigned long base, unsigned long size) +centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. The starting (base) address of the region. The size (in bytes) of the region. 
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size) { int i, max; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; + unsigned long lbase, lsize; max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; for (i = 0; i < max; ++i) { if (centaur_mcr_reserved & (1 << i)) continue; @@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) static void centaur_get_mcr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type * type) { *base = centaur_mcr[reg].high >> PAGE_SHIFT; *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c index 9027a987006..0737a596db4 100644 --- a/arch/i386/kernel/cpu/mtrr/cyrix.c +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -9,7 +9,7 @@ int arr3_protected; static void cyrix_get_arr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type * type) { unsigned long flags; unsigned char arr, ccr3, rcr, shift; @@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, } static int -cyrix_get_free_region(unsigned long base, unsigned long size) +cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free ARR. The starting (base) address of the region. The size (in bytes) of the region. 
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size) { int i; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; + unsigned long lbase, lsize; + switch (replace_reg) { + case 7: + if (size < 0x40) + break; + case 6: + case 5: + case 4: + return replace_reg; + case 3: + if (arr3_protected) + break; + case 2: + case 1: + case 0: + return replace_reg; + } /* If we are to set up a region >32M then look at ARR7 immediately */ if (size > 0x2000) { cyrix_get_arr(7, &lbase, &lsize, &ltype); @@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, typedef struct { unsigned long base; - unsigned int size; + unsigned long size; mtrr_type type; } arr_state_t; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index ee8dc675395..f77fc53db65 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -15,12 +16,19 @@ struct mtrr_state { struct mtrr_var_range *var_ranges; mtrr_type fixed_ranges[NUM_FIXED_RANGES]; unsigned char enabled; + unsigned char have_fixed; mtrr_type def_type; }; static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "mtrr."
+ +static __initdata int mtrr_show; +module_param_named(show, mtrr_show, bool, 0); + /* Get the MSR pair relating to a var range */ static void __init get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) @@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } +static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) +{ + unsigned i; + + for (i = 0; i < 8; ++i, ++types, base += step) + printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); +} + /* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { @@ -58,13 +74,49 @@ void __init get_mtrr_state(void) } vrs = mtrr_state.var_ranges; + rdmsr(MTRRcap_MSR, lo, dummy); + mtrr_state.have_fixed = (lo >> 8) & 1; + for (i = 0; i < num_var_ranges; i++) get_mtrr_var_range(i, &vrs[i]); - get_fixed_ranges(mtrr_state.fixed_ranges); + if (mtrr_state.have_fixed) + get_fixed_ranges(mtrr_state.fixed_ranges); rdmsr(MTRRdefType_MSR, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; + + if (mtrr_show) { + int high_width; + + printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); + if (mtrr_state.have_fixed) { + printk(KERN_INFO "MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); + print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); + for (i = 0; i < 2; ++i) + print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + for (i = 0; i < 8; ++i) + print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + } + printk(KERN_INFO "MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); + high_width = ((size_or_mask ? 
ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { + if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) + printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + else + printk(KERN_INFO "MTRR %u disabled\n", i); + } + } } /* Some BIOS's are fucked and don't set all MTRRs the same! */ @@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) smp_processor_id(), msr, a, b); } -int generic_get_free_region(unsigned long base, unsigned long size) +int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. The starting (base) address of the region. The size (in bytes) of the region. @@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size) { int i, max; mtrr_type ltype; - unsigned long lbase; - unsigned lsize; + unsigned long lbase, lsize; max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, &ltype); if (lsize == 0) @@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size) } static void generic_get_mtrr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type *type) { unsigned int mask_lo, mask_hi, base_lo, base_hi; @@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) return changed; } -static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) +static u32 deftype_lo, deftype_hi; + +static unsigned long set_mtrr_state(void) /* [SUMMARY] Set the MTRR state for this CPU. The MTRR state information to read. Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; - if (set_fixed_ranges(mtrr_state.fixed_ranges)) + if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; /* Set_mtrr_restore restores the old value of MTRRdefType, so to set it we fiddle with the saved value */ if ((deftype_lo & 0xff) != mtrr_state.def_type || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { - deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) static unsigned long cr4 = 0; -static u32 deftype_lo, deftype_hi; static DEFINE_SPINLOCK(set_atomicity_lock); /* @@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); + mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -300,7 +354,7 @@ static void generic_set_all(void) prepare_set(); /* Actually set the state */ - mask = set_mtrr_state(deftype_lo,deftype_hi); + mask = set_mtrr_state(); post_set(); local_irq_restore(flags); @@ -374,7 +428,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i } } - if (base < 0x100) { + if (base + size < 0x100) { printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", base, size); return -EINVAL; diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c index 5ac051bb9d5..9753bc6a1f3 100644 --- a/arch/i386/kernel/cpu/mtrr/if.c +++ 
b/arch/i386/kernel/cpu/mtrr/if.c @@ -17,7 +17,7 @@ extern unsigned int *usage_table; #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) -static char *mtrr_strings[MTRR_NUM_TYPES] = +static const char *const mtrr_strings[MTRR_NUM_TYPES] = { "uncachable", /* 0 */ "write-combining", /* 1 */ @@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] = "write-back", /* 6 */ }; -char *mtrr_attrib_to_str(int x) +const char *mtrr_attrib_to_str(int x) { return (x <= 6) ? mtrr_strings[x] : "?"; } @@ -155,6 +155,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) { int err = 0; mtrr_type type; + unsigned long size; struct mtrr_sentry sentry; struct mtrr_gentry gentry; void __user *arg = (void __user *) __arg; @@ -235,15 +236,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC_GET_ENTRY: if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); + mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); /* Hide entries that go above 4GB */ - if (gentry.base + gentry.size > 0x100000 - || gentry.size == 0x100000) + if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) gentry.base = gentry.size = gentry.type = 0; else { gentry.base <<= PAGE_SHIFT; - gentry.size <<= PAGE_SHIFT; + gentry.size = size << PAGE_SHIFT; gentry.type = type; } @@ -273,8 +274,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC_GET_PAGE_ENTRY: if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); - gentry.type = type; + mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + /* Hide entries that would overflow */ + if (size != (__typeof__(gentry.size))size) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.size = size; + gentry.type = type; + } break; } @@ -353,8 
+360,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) char factor; int i, max, len; mtrr_type type; - unsigned long base; - unsigned int size; + unsigned long base, size; len = 0; max = num_var_ranges; @@ -373,7 +379,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, - "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", + "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, mtrr_attrib_to_str(type), usage_table[i]); } diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index a4de30b9d3d..aeea23e8a05 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -172,6 +172,13 @@ static void ipi_handler(void *info) #endif +static inline int types_compatible(mtrr_type type1, mtrr_type type2) { + return type1 == MTRR_TYPE_UNCACHABLE || + type2 == MTRR_TYPE_UNCACHABLE || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || + (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); +} + /** * set_mtrr - update mtrrs on all processors * @reg: mtrr in question @@ -304,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base, int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, char increment) { - int i; + int i, replace, error; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; - int error; + unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; @@ -328,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size, return -ENOSYS; } + if (!size) { + printk(KERN_WARNING "mtrr: zero sized request\n"); + return -EINVAL; + } + if (base & size_or_mask || size & size_or_mask) { printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } error = -EINVAL; + replace = -1; /* No CPU hotplug when we change MTRR entries */ lock_cpu_hotplug(); @@ -341,21 +352,28 
@@ int mtrr_add_page(unsigned long base, unsigned long size, mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, &ltype); - if (base >= lbase + lsize) - continue; - if ((base < lbase) && (base + size <= lbase)) + if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) continue; /* At this point we know there is some kind of overlap/enclosure */ - if ((base < lbase) || (base + size > lbase + lsize)) { + if (base < lbase || base + size - 1 > lbase + lsize - 1) { + if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { + /* New region encloses an existing region */ + if (type == ltype) { + replace = replace == -1 ? i : -2; + continue; + } + else if (types_compatible(type, ltype)) + continue; + } printk(KERN_WARNING "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%x000\n", base, size, lbase, + " 0x%lx000,0x%lx000\n", base, size, lbase, lsize); goto out; } /* New region is enclosed by an existing region */ if (ltype != type) { - if (type == MTRR_TYPE_UNCACHABLE) + if (types_compatible(type, ltype)) continue; printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), @@ -368,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size, goto out; } /* Search for an empty MTRR */ - i = mtrr_if->get_free_region(base, size); + i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); - usage_table[i] = 1; + if (likely(replace < 0)) + usage_table[i] = 1; + else { + usage_table[i] = usage_table[replace] + !!increment; + if (unlikely(replace != i)) { + set_mtrr(replace, 0, 0, 0); + usage_table[replace] = 0; + } + } } else printk(KERN_INFO "mtrr: no more MTRRs available\n"); error = i; @@ -459,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; + unsigned long lbase, lsize; int error = -EINVAL; if
(!mtrr_if) @@ -561,7 +586,7 @@ static void __init init_ifs(void) struct mtrr_value { mtrr_type ltype; unsigned long lbase; - unsigned int lsize; + unsigned long lsize; }; static struct mtrr_value * mtrr_state; diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index 99c9f268204..d61ea9db6cf 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -43,15 +43,16 @@ struct mtrr_ops { void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type); - int (*get_free_region) (unsigned long base, unsigned long size); - + unsigned long *size, mtrr_type * type); + int (*get_free_region)(unsigned long base, unsigned long size, + int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, unsigned int type); int (*have_wrcomb)(void); }; -extern int generic_get_free_region(unsigned long base, unsigned long size); +extern int generic_get_free_region(unsigned long base, unsigned long size, + int replace_reg); extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); @@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { unsigned long flags; - unsigned long deftype_lo; - unsigned long deftype_hi; unsigned long cr4val; - unsigned long ccr3; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; }; struct mtrr_var_range { - unsigned long base_lo; - unsigned long base_hi; - unsigned long mask_lo; - unsigned long mask_hi; + u32 base_lo; + u32 base_hi; + u32 mask_lo; + u32 mask_hi; }; void set_mtrr_done(struct set_mtrr_context *ctxt); @@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if; extern unsigned int num_var_ranges; void mtrr_state_warn(void); -char *mtrr_attrib_to_str(int x); +const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); -- cgit v1.2.3-70-g09d2 From ba10650a880c2df23bd1db6c0570ddb66f389641 
Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: alloc_gdt() static Make the needlessly global alloc_gdt() static. (against) pda-percpu-init Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 68bcb687019..1b34c56f812 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -609,7 +609,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) return regs; } -__cpuinit int alloc_gdt(int cpu) +static __cpuinit int alloc_gdt(int cpu) { struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); struct desc_struct *gdt; -- cgit v1.2.3-70-g09d2 From 79929fd1c1887d2a057cbb80d487a2e2f1c01a02 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: Convert more absolute symbols to section relative o Convert more absolute symbols to section relative to keep the theme in vmlinux.lds.S file and to avoid problem if kernel is relocated. o Also put a message so that in future people can be aware of it and avoid introducing absolute symbols. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/vmlinux.lds.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 877dc5cfe3a..25581e87c60 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -8,6 +8,12 @@ * put it inside the section definition. */ +/* Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. 
Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ #define LOAD_OFFSET __PAGE_OFFSET #include @@ -65,11 +71,11 @@ SECTIONS CONSTRUCTORS } :data - __start_paravirtprobe = .; .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) { + __start_paravirtprobe = .; *(.paravirtprobe) + __stop_paravirtprobe = .; } - __stop_paravirtprobe = .; . = ALIGN(4096); .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { @@ -172,11 +178,11 @@ SECTIONS *(.altinstr_replacement) } . = ALIGN(4); - __start_parainstructions = .; .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __start_parainstructions = .; *(.parainstructions) + __stop_parainstructions = .; } - __stop_parainstructions = .; /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } -- cgit v1.2.3-70-g09d2 From fd6d7d26897dec834d0b9fbdc59819b0332a1257 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: introduce the mechanism of disabling cpu hotplug control Add 'enable_cpu_hotplug' flag and when cleared, the hotplug control file ("online") will not be added under /sys/devices/system/cpu/cpuX/ Next patch doing PCI quirks will use this. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/topology.c | 6 +++++- include/asm-i386/cpu.h | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 07d6da36a82..844c08fdb22 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -40,14 +40,18 @@ int arch_register_cpu(int num) * restrictions and assumptions in kernel. 
This basically * doesnt add a control file, one cannot attempt to offline * BSP. + * + * Also certain PCI quirks require not to enable hotplug control + * for all CPU's. */ - if (!num) + if (!num || !enable_cpu_hotplug) cpu_devices[num].cpu.no_control = 1; return register_cpu(&cpu_devices[num].cpu, num); } #ifdef CONFIG_HOTPLUG_CPU +int enable_cpu_hotplug = 1; void arch_unregister_cpu(int num) { return unregister_cpu(&cpu_devices[num].cpu); diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h index b1bc7b1b64b..9d914e1e4aa 100644 --- a/include/asm-i386/cpu.h +++ b/include/asm-i386/cpu.h @@ -13,6 +13,9 @@ struct i386_cpu { extern int arch_register_cpu(int num); #ifdef CONFIG_HOTPLUG_CPU extern void arch_unregister_cpu(int); +extern int enable_cpu_hotplug; +#else +#define enable_cpu_hotplug 0 #endif DECLARE_PER_CPU(int, cpu_state); -- cgit v1.2.3-70-g09d2 From 72486f1f8f0a2bc828b9d30cf4690cf2dd6807fc Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: change the 'no_control' field to 'hotpluggable' in the struct cpu Change the 'no_control' field in the cpu struct to a more positive and better term 'hotpluggable'. And change(/cleanup) the logic accordingly. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/topology.c | 4 ++-- arch/ia64/kernel/topology.c | 8 ++++---- arch/powerpc/kernel/sysfs.c | 8 ++++---- drivers/base/cpu.c | 6 +++--- include/linux/cpu.h | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 844c08fdb22..79cf608e14c 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -44,8 +44,8 @@ int arch_register_cpu(int num) * Also certain PCI quirks require not to enable hotplug control * for all CPU's. 
*/ - if (!num || !enable_cpu_hotplug) - cpu_devices[num].cpu.no_control = 1; + if (num && enable_cpu_hotplug) + cpu_devices[num].cpu.hotpluggable = 1; return register_cpu(&cpu_devices[num].cpu, num); } diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 5629b45e89c..687500ddb4b 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -31,11 +31,11 @@ int arch_register_cpu(int num) { #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU) /* - * If CPEI cannot be re-targetted, and this is - * CPEI target, then dont create the control file + * If CPEI can be re-targetted or if this is not + * CPEI target, then it is hotpluggable */ - if (!can_cpei_retarget() && is_cpu_cpei_target(num)) - sysfs_cpus[num].cpu.no_control = 1; + if (can_cpei_retarget() || !is_cpu_cpei_target(num)) + sysfs_cpus[num].cpu.hotpluggable = 1; map_cpu_to_node(num, node_cpuid[num].nid); #endif diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 22123a0d541..63ed265b7f0 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -239,7 +239,7 @@ static void unregister_cpu_online(unsigned int cpu) struct cpu *c = &per_cpu(cpu_devices, cpu); struct sys_device *s = &c->sysdev; - BUG_ON(c->no_control); + BUG_ON(!c->hotpluggable); if (!firmware_has_feature(FW_FEATURE_ISERIES) && cpu_has_feature(CPU_FTR_SMT)) @@ -424,10 +424,10 @@ static int __init topology_init(void) * CPU. For instance, the boot cpu might never be valid * for hotplugging. 
*/ - if (!ppc_md.cpu_die) - c->no_control = 1; + if (ppc_md.cpu_die) + c->hotpluggable = 1; - if (cpu_online(cpu) || (c->no_control == 0)) { + if (cpu_online(cpu) || c->hotpluggable) { register_cpu(c, cpu); sysdev_create_file(&c->sysdev, &attr_physical_id); diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 1f745f12f94..7fd095efaeb 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -104,8 +104,8 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL); /* * register_cpu - Setup a driverfs device for a CPU. - * @cpu - Callers can set the cpu->no_control field to 1, to indicate not to - * generate a control file in sysfs for this CPU. + * @cpu - cpu->hotpluggable field set to 1 will generate a control file in + * sysfs for this CPU. * @num - CPU number to use when creating the device. * * Initialize and register the CPU device. @@ -119,7 +119,7 @@ int __devinit register_cpu(struct cpu *cpu, int num) error = sysdev_register(&cpu->sysdev); - if (!error && !cpu->no_control) + if (!error && cpu->hotpluggable) register_cpu_control(cpu); if (!error) cpu_sys_devices[num] = &cpu->sysdev; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f02d71bf689..ad90340e7db 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -27,7 +27,7 @@ struct cpu { int node_id; /* The node which contains the CPU */ - int no_control; /* Should the sysfs control file be created? */ + int hotpluggable; /* creates sysfs control file if hotpluggable */ struct sys_device sysdev; }; -- cgit v1.2.3-70-g09d2 From b0d0a4ba45760b10ecee9035ed45b442c1a6cc84 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] x86: fix the irqbalance quirk for E7320/E7520/E7525 Move the irqbalance quirks for E7320/E7520/E7525(Errata 23 in http://download.intel.com/design/chipsets/specupdt/30304203.pdf) to early quirks. 
And add a PCI quirk for these platforms to check(which happens very late during the boot) if the APIC routing is indeed set to default flat mode. This fixes the breakage(in x86_64) of this quirk due to cpu hotplug which selects physical mode instead of the logical flat(as needed for this errata workaround). Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/acpi/earlyquirk.c | 21 +++++++++++++++++ arch/i386/kernel/quirks.c | 46 +++++++++++++++++++++++++++++--------- arch/i386/kernel/smpboot.c | 7 ++++++ arch/x86_64/kernel/early-quirks.c | 13 +++++++++++ arch/x86_64/kernel/smpboot.c | 8 +++++++ include/asm-i386/genapic.h | 2 +- include/asm-i386/irq.h | 2 ++ include/asm-x86_64/proto.h | 1 + 8 files changed, 88 insertions(+), 12 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index c9841692bb7..4b60af7f91d 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_ACPI @@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device) return 0; } +static void check_intel(void) +{ + u16 vendor, device; + + vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); + + if (vendor != PCI_VENDOR_ID_INTEL) + return; + + device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); +#ifdef CONFIG_SMP + if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || + device == PCI_DEVICE_ID_INTEL_E7520_MCH || + device == PCI_DEVICE_ID_INTEL_E7525_MCH) + quirk_intel_irqbalance(); +#endif +} + void __init check_acpi_pci(void) { int num, slot, func; @@ -60,6 +79,8 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; + check_intel(); + /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index 
9f6ab1789bb..a01320a7b63 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -3,10 +3,23 @@ */ #include #include +#include +#include +#include #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) +static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) +{ +#ifdef CONFIG_X86_64 + if (genapic != &apic_flat) + panic("APIC mode must be flat on this system\n"); +#elif defined(CONFIG_X86_GENERICARCH) + if (genapic != &apic_default) + panic("APIC mode must be default(flat) on this system. Use apic=default\n"); +#endif +} -static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +void __init quirk_intel_irqbalance(void) { u8 config, rev; u32 word; @@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) * based platforms. * Disable SW irqbalance/affinity on those platforms. */ - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); if (rev > 0x9) return; printk(KERN_INFO "Intel E7520/7320/7525 detected."); - /* enable access to config space*/ - pci_read_config_byte(dev, 0xf4, &config); - pci_write_config_byte(dev, 0xf4, config|0x2); + /* enable access to config space */ + config = read_pci_config_byte(0, 0, 0, 0xf4); + write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); /* read xTPR register */ - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + word = read_pci_config_16(0, 0, 0x40, 0x4c); if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); @@ -37,14 +50,25 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; +#endif +#ifdef CONFIG_HOTPLUG_CPU + printk(KERN_INFO "Disabling cpu hotplug control\n"); + enable_cpu_hotplug = 0; +#endif +#ifdef CONFIG_X86_64 + /* force the genapic selection to flat mode so that + * interrupts can be redirected to more than one CPU. 
+ */ + genapic_force = &apic_flat; #endif } - /* put back the original value for config space*/ + /* put back the original value for config space */ if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); + write_pci_config_byte(0, 0, 0, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); + #endif diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index cd7de9c9654..346f27f4c79 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1482,6 +1483,12 @@ int __devinit __cpu_up(unsigned int cpu) cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); + +#ifdef CONFIG_X86_GENERICARCH + if (num_online_cpus() > 8 && genapic == &apic_default) + panic("Default flat APIC routing can't be used with > 8 cpus\n"); +#endif + return 0; } diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index fb0c6da41b7..829698f6d04 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -71,6 +71,18 @@ static void ati_bugs(void) { } +static void intel_bugs(void) +{ + u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); + +#ifdef CONFIG_SMP + if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || + device == PCI_DEVICE_ID_INTEL_E7520_MCH || + device == PCI_DEVICE_ID_INTEL_E7525_MCH) + 
quirk_intel_irqbalance(); +#endif +} + struct chipset { u16 vendor; void (*f)(void); @@ -80,6 +92,7 @@ static struct chipset early_qrk[] = { { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, { PCI_VENDOR_ID_VIA, via_bugs }, { PCI_VENDOR_ID_ATI, ati_bugs }, + { PCI_VENDOR_ID_INTEL, intel_bugs}, {} }; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 62c2e747af5..4c161c208d5 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -60,6 +60,7 @@ #include #include #include +#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -1167,6 +1168,13 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); + + if (num_online_cpus() > 8 && genapic == &apic_flat) { + printk(KERN_WARNING + "flat APIC routing can't be used with > 8 cpus\n"); + BUG(); + } + err = 0; return err; diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index 8ffbb0f0745..fd2be593b06 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -122,6 +122,6 @@ struct genapic { APICFUNC(phys_pkg_id) \ } -extern struct genapic *genapic; +extern struct genapic *genapic, apic_default; #endif diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 9e15ce0006e..11761cdaae1 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -37,6 +37,8 @@ static __inline__ int irq_canonicalize(int irq) extern int irqbalance_disable(char *str); #endif +extern void quirk_intel_irqbalance(void); + #ifdef CONFIG_HOTPLUG_CPU extern void fixup_irqs(cpumask_t map); #endif diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 21fa5afa232..6d324b83897 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -87,6 +87,7 @@ extern void syscall32_cpu_init(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); extern void early_quirks(void); +extern void quirk_intel_irqbalance(void); extern void 
check_efer(void); extern int unhandled_signal(struct task_struct *tsk, int sig); -- cgit v1.2.3-70-g09d2 From e1cccf48b182dd743c3c83a4fdf8dc570a43b393 Mon Sep 17 00:00:00 2001 From: Artiom Myaskouvskey Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: call efi_get_time during suspend Function efi_get_time called not only during init kernel phase but also during suspend (from get_cmos_time). When it is called from get_cmos_time the corresponding runtime service should be called in virtual and not in physical mode. Signed-off-by: Artiom Myaskouvskey Signed-off-by: Andi Kleen Cc: "Narayanan, Chandramouli" Cc: "Jiossy, Rami" Cc: "Satt, Shai" Cc: Andi Kleen Cc: Matt Domsch Signed-off-by: Andrew Morton --- arch/i386/kernel/efi.c | 17 ++++++++++++----- include/linux/efi.h | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 8b40648d0ef..b92c7f0a358 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime) return 0; } /* - * This should only be used during kernel init and before runtime - * services have been remapped, therefore, we'll need to call in physical - * mode. Note, this call isn't used later, so mark it __init. + * This is used during kernel init before runtime + * services have been remapped and also during suspend, therefore, + * we'll need to call both in physical and virtual modes. 
*/ -inline unsigned long __init efi_get_time(void) +inline unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; efi_time_cap_t cap; - status = phys_efi_get_time(&eft, &cap); + if (efi.get_time) { + /* if we are in virtual mode use remapped function */ + status = efi.get_time(&eft, &cap); + } else { + /* we are in physical mode */ + status = phys_efi_get_time(&eft, &cap); + } + if (status != EFI_SUCCESS) printk("Oops: efitime: can't read time status: 0x%lx\n",status); diff --git a/include/linux/efi.h b/include/linux/efi.h index 66d621dbcb6..91ecf49fbf2 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -300,7 +300,7 @@ extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource); -extern unsigned long __init efi_get_time(void); +extern unsigned long efi_get_time(void); extern int __init efi_set_rtc_mmss(unsigned long nowtime); extern struct efi_memory_map memmap; -- cgit v1.2.3-70-g09d2 From 956fb53197f82257974f1f9835485aeeef4510b3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: handle a negative return value The Coverity checker noted that bad things might happen if find_isa_irq_apic() returned -1. 
[akpm@osdl.org: add debugging checks] Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Cc: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/io_apic.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 993150f206e..7bfd6c3ec87 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -2179,9 +2179,15 @@ static inline void unlock_ExtINT_logic(void) unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) + if (apic == -1) { + WARN_ON_ONCE(1); return; + } entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); -- cgit v1.2.3-70-g09d2 From 7e95b593a1aeb6fe1d3904e799d23a45261f2c19 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: Make irq_vector static irq_vector[] can now become static. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Acked-by: Eric W. Biederman Acked-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 7bfd6c3ec87..56f571c9fc0 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1241,7 +1241,7 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. 
*/ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; static int __assign_irq_vector(int irq) { -- cgit v1.2.3-70-g09d2 From 538f188e03c821c93b355c9fc346806cdd34e286 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: i386 add Intel BTS cpufeature bit and detection (take 2) Here is a small patch for i386 which adds a cpufeature flag and detection code for Intel's Branch Trace Store (BTS) feature. This feature can be found on Intel P4 and Core 2 processors among others. It can also be used by perfmon. changelog: - add CPU_FEATURE_BTS - add Branch Trace Store detection signed-off-by: stephane eranian Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/intel.c | 2 ++ include/asm-i386/cpufeature.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 3ae795e9056..56fe2658495 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -199,6 +199,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_ds) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_bit(X86_FEATURE_BTS, c->x86_capability); if (!(l1 & (1<<12))) set_bit(X86_FEATURE_PEBS, c->x86_capability); } diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h index 4c83e059228..3f92b94e0d7 100644 --- a/include/asm-i386/cpufeature.h +++ b/include/asm-i386/cpufeature.h @@ -74,6 +74,7 @@ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* 
Streaming SIMD Extensions-3 */ @@ -138,6 +139,7 @@ #define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) #define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) +#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) #endif /* __ASM_I386_CPUFEATURE_H */ -- cgit v1.2.3-70-g09d2 From f990fff427d68af3e4e1d16fe799c106abc0bf53 Mon Sep 17 00:00:00 2001 From: Karsten Wiese Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] x86: Regard MSRs in lapic_suspend()/lapic_resume() Read/Write APIC_LVTPC and APIC_LVTTHMR only, if get_maxlvt() returns certain values. This is done like everywhere else in i386/kernel/apic.c, so I guess it's correct. Suspends/Resumes to disk fine and eliminates an smp_error_interrupt() here on a K8. AK: ported to x86-64 too Signed-off-by: Karsten Wiese Signed-off-by: Andi Kleen --- arch/i386/kernel/apic.c | 22 ++++++++++++++++++---- arch/x86_64/kernel/apic.c | 22 ++++++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 2fd4b7d927c..776d9be26af 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -647,23 +647,30 @@ static struct { static int lapic_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); apic_pm_state.apic_spiv = apic_read(APIC_SPIV); apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict =
apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif local_irq_save(flags); disable_local_APIC(); @@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev) { unsigned int l, h; unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + local_irq_save(flags); /* @@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); apic_write(APIC_TMICT, apic_pm_state.apic_tmict); diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 5c468971e64..f0b00d8731c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -459,23 +459,30 @@ static struct { static int lapic_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); apic_pm_state.apic_spiv = apic_read(APIC_SPIV); apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); 
apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); @@ -486,10 +493,13 @@ static int lapic_resume(struct sys_device *dev) { unsigned int l, h; unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_BASE; @@ -503,8 +513,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); apic_write(APIC_TMICT, apic_pm_state.apic_tmict); -- cgit v1.2.3-70-g09d2 From bf7e6a196318316e921f357557fca9d11d15f486 Mon Sep 17 00:00:00 2001 From: Artiom Myaskouvskey Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: Preserve EFI run time regions with memmap parameter When using memmap kernel parameter in EFI boot we should also add to memory map memory regions of runtime services to enable their mapping later. 
AK: merged and cleaned up the patch Signed-off-by: Artiom Myaskouvskey Signed-off-by: Andi Kleen --- arch/i386/kernel/e820.c | 60 +++++++++++++++++++++++++++++++++++-------------- arch/i386/mm/init.c | 2 -- include/linux/efi.h | 1 + 3 files changed, 44 insertions(+), 19 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index b704790f796..2f7d0a92fd7 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -742,29 +742,55 @@ void __init print_memory_map(char *who) } } -void __init limit_regions(unsigned long long size) +static __init __always_inline void efi_limit_regions(unsigned long long size) { unsigned long long current_addr = 0; + efi_memory_desc_t *md, *next_md; + void *p, *p1; + int i, j; + + j = 0; + p1 = memmap.map; + for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + md = p; + next_md = p1; + current_addr = md->phys_addr + + PFN_PHYS(md->num_pages); + if (is_available_memory(md)) { + if (md->phys_addr >= size) continue; + memcpy(next_md, md, memmap.desc_size); + if (current_addr >= size) { + next_md->num_pages -= + PFN_UP(current_addr-size); + } + p1 += memmap.desc_size; + next_md = p1; + j++; + } else if ((md->attribute & EFI_MEMORY_RUNTIME) == + EFI_MEMORY_RUNTIME) { + /* In order to make runtime services + * available we have to include runtime + * memory regions in memory map */ + memcpy(next_md, md, memmap.desc_size); + p1 += memmap.desc_size; + next_md = p1; + j++; + } + } + memmap.nr_map = j; + memmap.map_end = memmap.map + + (memmap.nr_map * memmap.desc_size); +} + +void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr; int i; print_memory_map("limit_regions start"); if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map, i = 0; p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - current_addr = md->phys_addr + (md->num_pages << 12); - if (md->type == EFI_CONVENTIONAL_MEMORY) { - if 
(current_addr >= size) { - md->num_pages -= - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); - memmap.nr_map = i + 1; - return; - } - } - } + efi_limit_regions(size); + return; } for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 167416155ee..f4dd048187f 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -192,8 +192,6 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } -extern int is_available_memory(efi_memory_desc_t *); - int page_is_ram(unsigned long pagenr) { int i; diff --git a/include/linux/efi.h b/include/linux/efi.h index 91ecf49fbf2..df1c91855f0 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -302,6 +302,7 @@ extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource); extern unsigned long efi_get_time(void); extern int __init efi_set_rtc_mmss(unsigned long nowtime); +extern int is_available_memory(efi_memory_desc_t * md); extern struct efi_memory_map memmap; /** -- cgit v1.2.3-70-g09d2 From 6df0532eef0187c293d3ab1d4c158f92e8f24f8a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: remove duplicate printk We do the exact same printk about a dozen lines above with no intermediate printk's. 
Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index e4758095d87..41cfea57232 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) f_vide(); rdtscl(d2); d = d2-d; - - /* Knock these two lines out if it debugs out ok */ - printk(KERN_INFO "AMD K6 stepping B detected - "); - /* -- cut here -- */ + if (d > 20*K6_BUG_LOOP) printk("system stability may be impaired when more than 32 MB are used.\n"); else -- cgit v1.2.3-70-g09d2 From 0741f4d207a644482d7a040f05cd264c98cf7ee8 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] x86: add sysctl for kstack_depth_to_print Add sysctl for kstack_depth_to_print. This lets users change the amount of raw stack data printed in dump_stack() without having to reboot. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- Documentation/sysctl/kernel.txt | 8 ++++++++ arch/i386/kernel/traps.c | 2 +- arch/x86_64/kernel/traps.c | 2 +- include/asm-x86_64/stacktrace.h | 2 ++ kernel/sysctl.c | 9 +++++++++ 5 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 0bc7f1e3c9e..5922e84d913 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -27,6 +27,7 @@ show up in /proc/sys/kernel: - hotplug - java-appletviewer [ binfmt_java, obsolete ] - java-interpreter [ binfmt_java, obsolete ] +- kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] - modprobe ==> Documentation/kmod.txt - msgmax @@ -170,6 +171,13 @@ This flag controls the L2 cache of G3 processor boards. 
If ============================================================== +kstack_depth_to_print: (X86 only) + +Controls the number of words to print when dumping the raw +kernel stack. + +============================================================== + osrelease, ostype & version: # cat osrelease diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 7b2f9f02208..1d48a75fa33 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -91,7 +91,7 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); -static int kstack_depth_to_print = 24; +int kstack_depth_to_print = 24; #ifdef CONFIG_STACK_UNWIND static int call_trace = 1; #else diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 264db33476a..75ceccee178 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -108,7 +108,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_enable_no_resched(); } -static int kstack_depth_to_print = 12; +int kstack_depth_to_print = 12; #ifdef CONFIG_STACK_UNWIND static int call_trace = 1; #else diff --git a/include/asm-x86_64/stacktrace.h b/include/asm-x86_64/stacktrace.h index 5eb9799bef7..6f0b5459430 100644 --- a/include/asm-x86_64/stacktrace.h +++ b/include/asm-x86_64/stacktrace.h @@ -1,6 +1,8 @@ #ifndef _ASM_STACKTRACE_H #define _ASM_STACKTRACE_H 1 +extern int kstack_depth_to_print; + /* Generic stack tracer with callbacks */ struct stacktrace_ops { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e569f4792..6fc5e17086f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_X86 #include +#include #endif #if defined(CONFIG_SYSCTL) @@ -707,6 +708,14 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kstack_depth_to_print", + .data = 
&kstack_depth_to_print, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif #if defined(CONFIG_MMU) { -- cgit v1.2.3-70-g09d2 From a36df98ab1cdd8a9e7daa4c1b5c48ffa2ad6ea09 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Dec 2006 02:14:12 +0100 Subject: [PATCH] i386: touch softlockup during backtracing Sometimes the soft watchdog fires after we're done oopsing. See http://projects.info-pull.com/mokb/MOKB-25-11-2006.html for an example. AK: changed to touch_nmi_watchdog() Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 1d48a75fa33..86d8476be4f 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_EISA #include @@ -248,6 +249,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long*)context->previous_esp; if (!stack) break; + touch_nmi_watchdog(); } } EXPORT_SYMBOL(dump_trace); -- cgit v1.2.3-70-g09d2 From 359ad0d4015a9ab39243f2ebc4eb07915bd618b2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] unwinder: more sanity checks in Dwarf2 unwinder Tighten the requirements on both input to and output from the Dwarf2 unwinder. 
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 7 +++++++ arch/x86_64/kernel/traps.c | 7 +++++++ include/asm-i386/unwind.h | 12 ++++-------- include/asm-x86_64/unwind.h | 8 ++------ kernel/unwind.c | 16 +++++++++++++++- 5 files changed, 35 insertions(+), 15 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 86d8476be4f..c447807e2a4 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -161,12 +161,19 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data) { struct ops_and_data *oad = (struct ops_and_data *)data; int n = 0; + unsigned long sp = UNW_SP(info); + if (arch_unw_user_mode(info)) + return -1; while (unwind(info) == 0 && UNW_PC(info)) { n++; oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); } return n; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 9864d195c40..4fdd162f0be 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -225,12 +225,19 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) { struct ops_and_data *oad = (struct ops_and_data *)context; int n = 0; + unsigned long sp = UNW_SP(info); + if (arch_unw_user_mode(info)) + return -1; while (unwind(info) == 0 && UNW_PC(info)) { n++; oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); } return n; } diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index 601fc67bd77..aa2c931e30d 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -79,17 +79,13 @@ extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *, void *arg), void *arg); -static inline int 
arch_unw_user_mode(const struct unwind_frame_info *info) +static inline int arch_unw_user_mode(/*const*/ struct unwind_frame_info *info) { -#if 0 /* This can only work when selector register and EFLAGS saves/restores - are properly annotated (and tracked in UNW_REGISTER_INFO). */ - return user_mode_vm(&info->regs); -#else - return info->regs.eip < PAGE_OFFSET + return user_mode_vm(&info->regs) + || info->regs.eip < PAGE_OFFSET || (info->regs.eip >= __fix_to_virt(FIX_VDSO) - && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE) + && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE) || info->regs.esp < PAGE_OFFSET; -#endif } #else diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h index 2e7ff10fd77..2f6349e4871 100644 --- a/include/asm-x86_64/unwind.h +++ b/include/asm-x86_64/unwind.h @@ -87,14 +87,10 @@ extern int arch_unwind_init_running(struct unwind_frame_info *, static inline int arch_unw_user_mode(const struct unwind_frame_info *info) { -#if 0 /* This can only work when selector register saves/restores - are properly annotated (and tracked in UNW_REGISTER_INFO). 
*/ - return user_mode(&info->regs); -#else - return (long)info->regs.rip >= 0 + return user_mode(&info->regs) + || (long)info->regs.rip >= 0 || (info->regs.rip >= VSYSCALL_START && info->regs.rip < VSYSCALL_END) || (long)info->regs.rsp >= 0; -#endif } #else diff --git a/kernel/unwind.c b/kernel/unwind.c index af48168a3af..7e721f10410 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -95,6 +95,7 @@ static const struct { typedef unsigned long uleb128_t; typedef signed long sleb128_t; +#define sleb128abs __builtin_labs static struct unwind_table { struct { @@ -787,7 +788,7 @@ int unwind(struct unwind_frame_info *frame) #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) const u32 *fde = NULL, *cie = NULL; const u8 *ptr = NULL, *end = NULL; - unsigned long pc = UNW_PC(frame) - frame->call_frame; + unsigned long pc = UNW_PC(frame) - frame->call_frame, sp; unsigned long startLoc = 0, endLoc = 0, cfa; unsigned i; signed ptrType = -1; @@ -936,6 +937,9 @@ int unwind(struct unwind_frame_info *frame) state.dataAlign = get_sleb128(&ptr, end); if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) cie = NULL; + else if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign)) + return -EPERM; else { retAddrReg = state.version <= 1 ? 
*ptr++ : get_uleb128(&ptr, end); /* skip augmentation */ @@ -968,6 +972,8 @@ int unwind(struct unwind_frame_info *frame) #ifdef CONFIG_FRAME_POINTER unsigned long top, bottom; + if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long)) + return -EPERM; top = STACK_TOP(frame->task); bottom = STACK_BOTTOM(frame->task); # if FRAME_RETADDR_OFFSET < 0 @@ -1018,6 +1024,7 @@ int unwind(struct unwind_frame_info *frame) || state.regs[retAddrReg].where == Nowhere || state.cfa.reg >= ARRAY_SIZE(reg_info) || reg_info[state.cfa.reg].width != sizeof(unsigned long) + || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long) || state.cfa.offs % sizeof(unsigned long)) return -EIO; /* update frame */ @@ -1038,6 +1045,8 @@ int unwind(struct unwind_frame_info *frame) #else # define CASES CASE(8); CASE(16); CASE(32); CASE(64) #endif + pc = UNW_PC(frame); + sp = UNW_SP(frame); for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { if (REG_INVALID(i)) { if (state.regs[i].where == Nowhere) @@ -1118,6 +1127,11 @@ int unwind(struct unwind_frame_info *frame) } } + if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign) + || (pc == UNW_PC(frame) && sp == UNW_SP(frame))) + return -EIO; + return 0; #undef CASES #undef FRAME_REG -- cgit v1.2.3-70-g09d2 From 9cfa5b5dfafcfe64c1a48906f243cdd302f82471 Mon Sep 17 00:00:00 2001 From: Burman Yan Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] x86-64: replace kmalloc+memset with kzalloc in MTRR code Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/if.c | 3 +-- arch/i386/kernel/cpu/mtrr/main.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c index 9753bc6a1f3..5ae1705eafa 100644 --- a/arch/i386/kernel/cpu/mtrr/if.c +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size, max = num_var_ranges; if 
(fcount == NULL) { - fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); + fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL); if (!fcount) return -ENOMEM; - memset(fcount, 0, max * sizeof *fcount); FILE_FCOUNT(file) = fcount; } if (!page) { diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index aeea23e8a05..16bb7ea8714 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -596,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state) int i; int size = num_var_ranges * sizeof(struct mtrr_value); - mtrr_state = kmalloc(size,GFP_ATOMIC); - if (mtrr_state) - memset(mtrr_state,0,size); - else + mtrr_state = kzalloc(size,GFP_ATOMIC); + if (!mtrr_state) return -ENOMEM; for (i = 0; i < num_var_ranges; i++) { -- cgit v1.2.3-70-g09d2 From f475ff352c5e05d473c462b97c3a13a5b803af5a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] x86-64: remove unused variable Remove unused variable in msr_write(). Reported by D Binderman . Cc: H. 
Peter Anvin Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/msr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index a773f776c9e..fd45059c908 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -195,7 +195,6 @@ static ssize_t msr_write(struct file *file, const char __user *buf, { const u32 __user *tmp = (const u32 __user *)buf; u32 data[2]; - size_t rv; u32 reg = *ppos; int cpu = iminor(file->f_dentry->d_inode); int err; @@ -203,7 +202,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf, if (count % 8) return -EINVAL; /* Invalid chunk size */ - for (rv = 0; count; count -= 8) { + for (; count; count -= 8) { if (copy_from_user(&data, tmp, 8)) return -EFAULT; err = do_wrmsr(cpu, reg, data[0], data[1]); -- cgit v1.2.3-70-g09d2 From d7fb02712818643bab79a6b3cb8270a747d0227b Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] x86-64: remove remaining pc98 code Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 23 ++--------------------- arch/i386/kernel/mpparse.c | 2 -- include/asm-i386/mpspec_def.h | 2 -- 3 files changed, 2 insertions(+), 25 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 56f571c9fc0..7f015a71ab5 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -842,8 +842,7 @@ static int __init find_isa_irq_pin(int irq, int type) if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -862,8 +861,7 @@ static int __init find_isa_irq_apic(int 
irq, int type) if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -993,12 +991,6 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) (0) -/* NEC98 interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_NEC98_trigger(idx) (0) -#define default_NEC98_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -1033,11 +1025,6 @@ static int __init MPBIOS_polarity(int idx) polarity = default_MCA_polarity(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - polarity = default_NEC98_polarity(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1107,11 +1094,6 @@ static int MPBIOS_trigger(int idx) trigger = default_MCA_trigger(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - trigger = default_NEC98_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1173,7 +1155,6 @@ static int pin_2_irq(int idx, int apic, int pin) case MP_BUS_ISA: /* ISA pin */ case MP_BUS_EISA: case MP_BUS_MCA: - case MP_BUS_NEC98: { irq = mp_irqs[idx].mpc_srcbusirq; break; diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 442aaf8c77e..2ce67228dff 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m) mp_current_pci_id++; } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; - } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; } else { printk(KERN_WARNING "Unknown bustype %s - 
ignoring\n", str); } diff --git a/include/asm-i386/mpspec_def.h b/include/asm-i386/mpspec_def.h index 76feedf85a8..13bafb16e7a 100644 --- a/include/asm-i386/mpspec_def.h +++ b/include/asm-i386/mpspec_def.h @@ -97,7 +97,6 @@ struct mpc_config_bus #define BUSTYPE_TC "TC" #define BUSTYPE_VME "VME" #define BUSTYPE_XPRESS "XPRESS" -#define BUSTYPE_NEC98 "NEC98" struct mpc_config_ioapic { @@ -182,7 +181,6 @@ enum mp_bustype { MP_BUS_EISA, MP_BUS_PCI, MP_BUS_MCA, - MP_BUS_NEC98 }; #endif -- cgit v1.2.3-70-g09d2 From 116780fc04d9f6cd3ceeab0251681f1dfda53367 Mon Sep 17 00:00:00 2001 From: Burman Yan Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] i386: replace kmalloc+memset with kzalloc Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/intel_cacheinfo.c | 11 +++-------- arch/i386/kernel/mca.c | 13 ++++--------- arch/i386/kernel/pci-dma.c | 6 ++---- arch/i386/mach-voyager/voyager_cat.c | 6 ++---- include/asm-i386/thread_info.h | 10 +--------- 5 files changed, 12 insertions(+), 34 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index 5c43be47587..80b4c5d421b 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) if (num_cache_leaves == 0) return -ENOENT; - cpuid4_info[cpu] = kmalloc( + cpuid4_info[cpu] = kzalloc( sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); if (unlikely(cpuid4_info[cpu] == NULL)) return -ENOMEM; - memset(cpuid4_info[cpu], 0, - sizeof(struct _cpuid4_info) * num_cache_leaves); oldmask = current->cpus_allowed; retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); @@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) return -ENOENT; /* Allocate all required memory */ - cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); + cache_kobject[cpu] = 
kzalloc(sizeof(struct kobject), GFP_KERNEL); if (unlikely(cache_kobject[cpu] == NULL)) goto err_out; - memset(cache_kobject[cpu], 0, sizeof(struct kobject)); - index_kobject[cpu] = kmalloc( + index_kobject[cpu] = kzalloc( sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); if (unlikely(index_kobject[cpu] == NULL)) goto err_out; - memset(index_kobject[cpu], 0, - sizeof(struct _index_kobject) * num_cache_leaves); return 0; diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c index eb57a851789..b83672b8952 100644 --- a/arch/i386/kernel/mca.c +++ b/arch/i386/kernel/mca.c @@ -283,10 +283,9 @@ static int __init mca_init(void) bus->f.mca_transform_memory = mca_dummy_transform_memory; /* get the motherboard device */ - mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); if(unlikely(!mca_dev)) goto out_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); /* * We do not expect many MCA interrupts during initialization, @@ -310,11 +309,9 @@ static int __init mca_init(void) mca_dev->slot = MCA_MOTHERBOARD; mca_register_device(MCA_PRIMARY_BUS, mca_dev); - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); - /* Put motherboard into video setup mode, read integrated video * POS registers, and turn motherboard setup off. 
@@ -349,10 +346,9 @@ static int __init mca_init(void) } if(which_scsi) { /* found a scsi card */ - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); for(j = 0; j < 8; j++) mca_dev->pos[j] = pos[j]; @@ -378,10 +374,9 @@ static int __init mca_init(void) if(!mca_read_and_store_pos(pos)) continue; - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); for(j=0; j<8; j++) mca_dev->pos[j]=pos[j]; diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c index 5c8c6ef1fc5..41af692c158 100644 --- a/arch/i386/kernel/pci-dma.c +++ b/arch/i386/kernel/pci-dma.c @@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, if (!mem_base) goto out; - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); if (!dev->dma_mem) goto out; - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!dev->dma_mem->bitmap) goto free1_out; - memset(dev->dma_mem->bitmap, 0, bitmap_size); dev->dma_mem->virt_base = mem_base; dev->dma_mem->device_base = device_addr; diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c index f50c6c6ad68..943a9473b13 100644 --- a/arch/i386/mach-voyager/voyager_cat.c +++ b/arch/i386/mach-voyager/voyager_cat.c @@ -776,7 +776,7 @@ voyager_cat_init(void) for(asic=0; asic < (*modpp)->num_asics; asic++) { int j; voyager_asic_t *asicp = *asicpp - = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ + = kzalloc(sizeof(voyager_asic_t), 
GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ voyager_sp_table_t *sp_table; voyager_at_t *asic_table; voyager_jtt_t *jtag_table; @@ -785,7 +785,6 @@ voyager_cat_init(void) printk("**WARNING** kmalloc failure in cat_init\n"); continue; } - memset(asicp, 0, sizeof(voyager_asic_t)); asicpp = &(asicp->next); asicp->asic_location = asic; sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); @@ -851,8 +850,7 @@ voyager_cat_init(void) #endif { - struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL); - memset(res, 0, sizeof(struct resource)); + struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL); res->name = kmalloc(128, GFP_KERNEL); sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); res->start = qic_addr; diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index 54d6d7aea93..46d32ad9208 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -95,15 +95,7 @@ static inline struct thread_info *current_thread_info(void) /* thread information allocation */ #ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) \ - ({ \ - struct thread_info *ret; \ - \ - ret = kmalloc(THREAD_SIZE, GFP_KERNEL); \ - if (ret) \ - memset(ret, 0, THREAD_SIZE); \ - ret; \ - }) +#define alloc_thread_info(tsk) kzalloc(THREAD_SIZE, GFP_KERNEL) #else #define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) #endif -- cgit v1.2.3-70-g09d2 From 6bedb2ccb02dcc70ffc8eb76df71c746378190ad Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] x86-64: don't use set_irq_regs() We don't need to setup _irq_regs in smp_xxx_interrupt (except apic timer). These handlers run with irqs disabled and do not call functions which need "struct pt_regs". 
Signed-off-by: Oleg Nesterov Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Cc: Andi Kleen Acked-By: David Howells Signed-off-by: Andrew Morton --- arch/i386/kernel/smp.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 31e5c6573aa..1b080ab8a49 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu) fastcall void smp_invalidate_interrupt(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); unsigned long cpu; cpu = get_cpu(); @@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - set_irq_regs(old_regs); } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, @@ -607,14 +605,11 @@ void smp_send_stop(void) */ fastcall void smp_reschedule_interrupt(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); ack_APIC_irq(); - set_irq_regs(old_regs); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) mb(); atomic_inc(&call_data->finished); } - set_irq_regs(old_regs); } /* -- cgit v1.2.3-70-g09d2 From b65780e123ba9b762276482bbfb52836e4d41fd9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] unwinder: move .eh_frame to RODATA The contents of the .eh_frame section are never written to, so the section may as well benefit from CONFIG_DEBUG_RODATA. Diff-ed against firstfloor tree.
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 9 --------- arch/x86_64/kernel/vmlinux.lds.S | 9 --------- include/asm-generic/vmlinux.lds.h | 17 ++++++++++++----- kernel/unwind.c | 2 +- 4 files changed, 13 insertions(+), 24 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 25581e87c60..56e6ad5cb04 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -102,15 +102,6 @@ SECTIONS _edata = .; /* End of data section */ } -#ifdef CONFIG_STACK_UNWIND - . = ALIGN(4); - .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { - __start_unwind = .; - *(.eh_frame) - __end_unwind = .; - } -#endif - . = ALIGN(THREAD_SIZE); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index d9534e750d4..6a1f8f491e5 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -51,15 +51,6 @@ SECTIONS RODATA -#ifdef CONFIG_STACK_UNWIND - . = ALIGN(8); - .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { - __start_unwind = .; - *(.eh_frame) - __end_unwind = .; - } -#endif - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 9f4747780da..4d4c62d1105 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -119,8 +119,7 @@ *(__ksymtab_strings) \ } \ \ - /* Unwind data binary search table */ \ - EH_FRAME_HDR \ + EH_FRAME \ \ /* Built-in module parameters. */ \ __param : AT(ADDR(__param) - LOAD_OFFSET) { \ @@ -162,15 +161,23 @@ VMLINUX_SYMBOL(__kprobes_text_end) = .; #ifdef CONFIG_STACK_UNWIND - /* Unwind data binary search table */ -#define EH_FRAME_HDR \ +#define EH_FRAME \ + /* Unwind data binary search table */ \ + . 
= ALIGN(8); \ .eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_unwind_hdr) = .; \ *(.eh_frame_hdr) \ VMLINUX_SYMBOL(__end_unwind_hdr) = .; \ + } \ + /* Unwind data */ \ + . = ALIGN(8); \ + .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_unwind) = .; \ + *(.eh_frame) \ + VMLINUX_SYMBOL(__end_unwind) = .; \ } #else -#define EH_FRAME_HDR +#define EH_FRAME #endif /* DWARF debug sections. diff --git a/kernel/unwind.c b/kernel/unwind.c index 08645aa7c2d..09c26132924 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -19,7 +19,7 @@ #include #include -extern char __start_unwind[], __end_unwind[]; +extern const char __start_unwind[], __end_unwind[]; extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; #define MAX_STACK_DEPTH 8 -- cgit v1.2.3-70-g09d2 From d9408cefe677636bc1c100fdcfac0b2ab9ff87bf Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] i386: Clean up smp_tune_scheduling() - remove the write-only local variable "bandwidth" - don't set "max_cache_size" in the (cachesize < 0) case: that's already handled in kernel/sched.c:measure_migration_cost() Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/smpboot.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 346f27f4c79..b4e6f32de45 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1130,34 +1130,15 @@ exit: } #endif -static void smp_tune_scheduling (void) +static void smp_tune_scheduling(void) { unsigned long cachesize; /* kB */ - unsigned long bandwidth = 350; /* MB/s */ - /* - * Rough estimation for SMP scheduling, this is the number of - * cycles it takes for a fully memory-limited process to flush - * the SMP-local cache. 
- * - * (For a P5 this pretty much means we will choose another idle - * CPU almost always at wakeup time (this is due to the small - * L1 cache), on PIIs it's around 50-100 usecs, depending on - * the cache size) - */ - if (!cpu_khz) { - /* - * this basically disables processor-affinity - * scheduling on SMP without a TSC. - */ - return; - } else { + if (cpu_khz) { cachesize = boot_cpu_data.x86_cache_size; - if (cachesize == -1) { - cachesize = 16; /* Pentiums, 2x8kB cache */ - bandwidth = 100; - } - max_cache_size = cachesize * 1024; + + if (cachesize > 0) + max_cache_size = cachesize * 1024; } } -- cgit v1.2.3-70-g09d2 From a120586873d3d64de93bd6d593d237e131994e58 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 6 Dec 2006 20:32:37 -0800 Subject: [PATCH] Allow NULL pointers in percpu_free The patch (as824b) makes percpu_free() ignore NULL arguments, as one would expect for a deallocation routine. (Note that free_percpu is #defined as percpu_free in include/linux/percpu.h.) A few callers are updated to remove now-unneeded tests for NULL. A few other callers already seem to assume that passing a NULL pointer to percpu_free() is okay! The patch also removes an unnecessary NULL check in percpu_depopulate(). 
Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/cstate.c | 6 ++---- block/blktrace.c | 3 +-- mm/allocpercpu.c | 9 +++++---- net/ipv6/af_inet6.c | 6 ++---- 4 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c index 4664b55f623..12e937c1ce4 100644 --- a/arch/i386/kernel/acpi/cstate.c +++ b/arch/i386/kernel/acpi/cstate.c @@ -156,10 +156,8 @@ static int __init ffh_cstate_init(void) static void __exit ffh_cstate_exit(void) { - if (cpu_cstate_entry) { - free_percpu(cpu_cstate_entry); - cpu_cstate_entry = NULL; - } + free_percpu(cpu_cstate_entry); + cpu_cstate_entry = NULL; } arch_initcall(ffh_cstate_init); diff --git a/block/blktrace.c b/block/blktrace.c index 74e02c04b2d..d3679dd1d22 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -394,8 +394,7 @@ err: if (bt) { if (bt->dropped_file) debugfs_remove(bt->dropped_file); - if (bt->sequence) - free_percpu(bt->sequence); + free_percpu(bt->sequence); if (bt->rchan) relay_close(bt->rchan); kfree(bt); diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index eaa9abeea53..b2486cf887a 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -17,10 +17,9 @@ void percpu_depopulate(void *__pdata, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); - if (pdata->ptrs[cpu]) { - kfree(pdata->ptrs[cpu]); - pdata->ptrs[cpu] = NULL; - } + + kfree(pdata->ptrs[cpu]); + pdata->ptrs[cpu] = NULL; } EXPORT_SYMBOL_GPL(percpu_depopulate); @@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask); */ void percpu_free(void *__pdata) { + if (unlikely(!__pdata)) + return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 87c8f54872b..e5cd83b2205 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -720,10 +720,8 @@ snmp6_mib_free(void *ptr[2]) { if (ptr == 
NULL) return; - if (ptr[0]) - free_percpu(ptr[0]); - if (ptr[1]) - free_percpu(ptr[1]); + free_percpu(ptr[0]); + free_percpu(ptr[1]); ptr[0] = ptr[1] = NULL; } -- cgit v1.2.3-70-g09d2 From e94b1766097d53e6f3ccfb36c8baa562ffeda3fc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Dec 2006 20:33:17 -0800 Subject: [PATCH] slab: remove SLAB_KERNEL SLAB_KERNEL is an alias of GFP_KERNEL. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/sysenter.c | 2 +- arch/ia64/ia32/binfmt_elf32.c | 8 ++++---- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/kernel/vdso.c | 2 +- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/sh/kernel/vsyscall/vsyscall.c | 2 +- arch/x86_64/ia32/ia32_binfmt.c | 2 +- arch/x86_64/ia32/syscall32.c | 2 +- drivers/atm/he.c | 4 ++-- drivers/base/dmapool.c | 2 +- drivers/dma/ioatdma.c | 4 ++-- drivers/ieee1394/hosts.c | 2 +- drivers/ieee1394/ohci1394.c | 8 ++++---- drivers/ieee1394/pcilynx.c | 2 +- drivers/ieee1394/raw1394.c | 10 +++++----- drivers/infiniband/hw/ehca/ehca_av.c | 2 +- drivers/infiniband/hw/ehca/ehca_cq.c | 2 +- drivers/infiniband/hw/ehca/ehca_main.c | 2 +- drivers/infiniband/hw/ehca/ehca_mrmw.c | 4 ++-- drivers/infiniband/hw/ehca/ehca_pd.c | 2 +- drivers/infiniband/hw/ehca/ehca_qp.c | 2 +- drivers/input/touchscreen/ads7846.c | 2 +- drivers/isdn/gigaset/bas-gigaset.c | 14 +++++++------- drivers/isdn/gigaset/usb-gigaset.c | 6 +++--- drivers/media/dvb/cinergyT2/cinergyT2.c | 2 +- drivers/mtd/devices/m25p80.c | 2 +- drivers/scsi/ipr.c | 2 +- drivers/spi/spi.c | 4 ++-- drivers/spi/spi_bitbang.c | 2 +- drivers/usb/core/hub.c | 4 ++-- drivers/usb/gadget/gmidi.c | 2 +- drivers/usb/gadget/goku_udc.c | 2 +- drivers/usb/gadget/inode.c | 6 +++--- drivers/usb/gadget/net2280.c | 2 +- drivers/usb/gadget/omap_udc.c | 2 +- drivers/usb/gadget/zero.c | 2 +- drivers/usb/host/hc_crisv10.c | 2 +- drivers/usb/host/ohci-pnx4008.c | 2 +- 
drivers/usb/input/acecad.c | 2 +- drivers/usb/input/usbtouchscreen.c | 2 +- drivers/usb/misc/usbtest.c | 28 ++++++++++++++-------------- drivers/usb/net/rndis_host.c | 2 +- drivers/usb/net/usbnet.c | 4 ++-- fs/adfs/super.c | 2 +- fs/affs/super.c | 2 +- fs/afs/super.c | 2 +- fs/befs/linuxvfs.c | 2 +- fs/bfs/inode.c | 2 +- fs/block_dev.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/cifs/misc.c | 4 ++-- fs/cifs/transport.c | 4 ++-- fs/coda/inode.c | 2 +- fs/dnotify.c | 2 +- fs/ecryptfs/crypto.c | 2 +- fs/ecryptfs/file.c | 2 +- fs/ecryptfs/inode.c | 4 ++-- fs/ecryptfs/keystore.c | 2 +- fs/ecryptfs/main.c | 4 ++-- fs/ecryptfs/super.c | 2 +- fs/efs/super.c | 2 +- fs/eventpoll.c | 4 ++-- fs/exec.c | 2 +- fs/ext2/super.c | 2 +- fs/fat/cache.c | 2 +- fs/fat/inode.c | 2 +- fs/fcntl.c | 2 +- fs/freevxfs/vxfs_inode.c | 4 ++-- fs/fuse/dev.c | 2 +- fs/fuse/inode.c | 2 +- fs/hfs/super.c | 2 +- fs/hfsplus/super.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/inode.c | 2 +- fs/isofs/inode.c | 2 +- fs/jffs2/super.c | 2 +- fs/locks.c | 2 +- fs/minix/inode.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/nfs/direct.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/pagelist.c | 2 +- fs/openpromfs/inode.c | 2 +- fs/proc/inode.c | 2 +- fs/qnx4/inode.c | 2 +- fs/reiserfs/super.c | 2 +- fs/romfs/inode.c | 2 +- fs/smbfs/inode.c | 2 +- fs/smbfs/request.c | 2 +- fs/sysv/inode.c | 2 +- fs/udf/super.c | 2 +- fs/ufs/super.c | 2 +- include/linux/fs.h | 2 +- include/linux/rmap.h | 2 +- include/linux/slab.h | 1 - include/linux/taskstats_kern.h | 2 +- ipc/mqueue.c | 2 +- kernel/delayacct.c | 2 +- kernel/fork.c | 6 +++--- kernel/taskstats.c | 2 +- kernel/user.c | 2 +- mm/mempolicy.c | 2 +- mm/mmap.c | 4 ++-- mm/shmem.c | 2 +- mm/slab.c | 2 +- net/decnet/dn_table.c | 2 +- net/ipv4/fib_hash.c | 4 ++-- net/ipv4/fib_trie.c | 4 ++-- net/socket.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- security/keys/key.c | 2 +- security/selinux/hooks.c | 2 +- security/selinux/ss/avtab.c | 2 +- 114 files changed, 164 insertions(+), 165 deletions(-) (limited to 
'arch/i386/kernel') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 713ba39d32c..0bbacd0ec17 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -132,7 +132,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) goto up_fail; } - vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { ret = -ENOMEM; goto up_fail; diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c index daa6b91bc92..578737ec762 100644 --- a/arch/ia64/ia32/binfmt_elf32.c +++ b/arch/ia64/ia32/binfmt_elf32.c @@ -91,7 +91,7 @@ ia64_elf32_init (struct pt_regs *regs) * it with privilege level 3 because the IVE uses non-privileged accesses to these * tables. IA-32 segmentation is used to protect against IA-32 accesses to them. */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = current->mm; @@ -117,7 +117,7 @@ ia64_elf32_init (struct pt_regs *regs) * code is locked in specific gate page, which is pointed by pretcode * when setup_frame_ia32 */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = current->mm; @@ -142,7 +142,7 @@ ia64_elf32_init (struct pt_regs *regs) * Install LDT as anonymous memory. This gives us all-zero segment descriptors * until a task modifies them via modify_ldt(). 
*/ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = current->mm; @@ -214,7 +214,7 @@ ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack) bprm->loader += stack_base; bprm->exec += stack_base; - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!mpnt) return -ENOMEM; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 3aaede0d698..e2321536ee4 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2302,7 +2302,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index ff87a5cba39..56dc2024220 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -156,7 +156,7 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. 
*/ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = current->mm; @@ -175,7 +175,7 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = current->mm; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index c913ad5cad2..a4b28c73bba 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -264,7 +264,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, /* Allocate a VMA structure and fill it up */ - vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma == NULL) { rc = -ENOMEM; goto fail_mmapsem; diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index c7d010749a1..7edfcc9d285 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -48,7 +48,7 @@ spufs_alloc_inode(struct super_block *sb) { struct spufs_inode_info *ei; - ei = kmem_cache_alloc(spufs_inode_cache, SLAB_KERNEL); + ei = kmem_cache_alloc(spufs_inode_cache, GFP_KERNEL); if (!ei) return NULL; diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 075d6cc1a2d..deb46941f31 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -97,7 +97,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, goto up_fail; } - vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { ret = -ENOMEM; goto up_fail; diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 
82ef182de6a..932a62ad6c8 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -351,7 +351,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, bprm->loader += stack_base; bprm->exec += stack_base; - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!mpnt) return -ENOMEM; diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c index 3a01329473a..3e5ed20cba4 100644 --- a/arch/x86_64/ia32/syscall32.c +++ b/arch/x86_64/ia32/syscall32.c @@ -49,7 +49,7 @@ int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) struct mm_struct *mm = current->mm; int ret; - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) return -ENOMEM; diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 2a2f0fc2288..ec8a7a633e6 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -820,7 +820,7 @@ he_init_group(struct he_dev *he_dev, int group) void *cpuaddr; #ifdef USE_RBPS_POOL - cpuaddr = pci_pool_alloc(he_dev->rbps_pool, SLAB_KERNEL|SLAB_DMA, &dma_handle); + cpuaddr = pci_pool_alloc(he_dev->rbps_pool, GFP_KERNEL|SLAB_DMA, &dma_handle); if (cpuaddr == NULL) return -ENOMEM; #else @@ -884,7 +884,7 @@ he_init_group(struct he_dev *he_dev, int group) void *cpuaddr; #ifdef USE_RBPL_POOL - cpuaddr = pci_pool_alloc(he_dev->rbpl_pool, SLAB_KERNEL|SLAB_DMA, &dma_handle); + cpuaddr = pci_pool_alloc(he_dev->rbpl_pool, GFP_KERNEL|SLAB_DMA, &dma_handle); if (cpuaddr == NULL) return -ENOMEM; #else diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c index fa4675254f6..dbe0735f8c9 100644 --- a/drivers/base/dmapool.c +++ b/drivers/base/dmapool.c @@ -126,7 +126,7 @@ dma_pool_create (const char *name, struct device *dev, } else if (allocation < size) return NULL; - if (!(retval = kmalloc (sizeof *retval, SLAB_KERNEL))) + if (!(retval = kmalloc (sizeof *retval, GFP_KERNEL))) return retval; 
strlcpy (retval->name, name, sizeof retval->name); diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 0358419a0e4..8e872610461 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -636,10 +636,10 @@ static int ioat_self_test(struct ioat_device *device) dma_cookie_t cookie; int err = 0; - src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL); + src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); if (!src) return -ENOMEM; - dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL); + dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); if (!dest) { kfree(src); return -ENOMEM; diff --git a/drivers/ieee1394/hosts.c b/drivers/ieee1394/hosts.c index 8f4378a1631..b935e08695a 100644 --- a/drivers/ieee1394/hosts.c +++ b/drivers/ieee1394/hosts.c @@ -123,7 +123,7 @@ struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra, int i; int hostnum = 0; - h = kzalloc(sizeof(*h) + extra, SLAB_KERNEL); + h = kzalloc(sizeof(*h) + extra, GFP_KERNEL); if (!h) return NULL; diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c index 6e8ea9110c4..eae97d8dcf0 100644 --- a/drivers/ieee1394/ohci1394.c +++ b/drivers/ieee1394/ohci1394.c @@ -1225,7 +1225,7 @@ static int ohci_iso_recv_init(struct hpsb_iso *iso) int ctx; int ret = -ENOMEM; - recv = kmalloc(sizeof(*recv), SLAB_KERNEL); + recv = kmalloc(sizeof(*recv), GFP_KERNEL); if (!recv) return -ENOMEM; @@ -1918,7 +1918,7 @@ static int ohci_iso_xmit_init(struct hpsb_iso *iso) int ctx; int ret = -ENOMEM; - xmit = kmalloc(sizeof(*xmit), SLAB_KERNEL); + xmit = kmalloc(sizeof(*xmit), GFP_KERNEL); if (!xmit) return -ENOMEM; @@ -3021,7 +3021,7 @@ alloc_dma_rcv_ctx(struct ti_ohci *ohci, struct dma_rcv_ctx *d, return -ENOMEM; } - d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, SLAB_KERNEL, d->prg_bus+i); + d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, GFP_KERNEL, d->prg_bus+i); OHCI_DMA_ALLOC("pool dma_rcv prg[%d]", i); if (d->prg_cpu[i] != NULL) { @@ -3117,7 +3117,7 @@ 
alloc_dma_trm_ctx(struct ti_ohci *ohci, struct dma_trm_ctx *d, OHCI_DMA_ALLOC("dma_rcv prg pool"); for (i = 0; i < d->num_desc; i++) { - d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, SLAB_KERNEL, d->prg_bus+i); + d->prg_cpu[i] = pci_pool_alloc(d->prg_pool, GFP_KERNEL, d->prg_bus+i); OHCI_DMA_ALLOC("pool dma_trm prg[%d]", i); if (d->prg_cpu[i] != NULL) { diff --git a/drivers/ieee1394/pcilynx.c b/drivers/ieee1394/pcilynx.c index 0a7412e27eb..9cab1d66147 100644 --- a/drivers/ieee1394/pcilynx.c +++ b/drivers/ieee1394/pcilynx.c @@ -1428,7 +1428,7 @@ static int __devinit add_card(struct pci_dev *dev, struct i2c_algo_bit_data i2c_adapter_data; error = -ENOMEM; - i2c_ad = kmalloc(sizeof(*i2c_ad), SLAB_KERNEL); + i2c_ad = kmalloc(sizeof(*i2c_ad), GFP_KERNEL); if (!i2c_ad) FAIL("failed to allocate I2C adapter memory"); memcpy(i2c_ad, &bit_ops, sizeof(struct i2c_adapter)); diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c index 47f6a4e29b4..bf71e069eaf 100644 --- a/drivers/ieee1394/raw1394.c +++ b/drivers/ieee1394/raw1394.c @@ -112,7 +112,7 @@ static struct pending_request *__alloc_pending_request(gfp_t flags) static inline struct pending_request *alloc_pending_request(void) { - return __alloc_pending_request(SLAB_KERNEL); + return __alloc_pending_request(GFP_KERNEL); } static void free_pending_request(struct pending_request *req) @@ -1737,7 +1737,7 @@ static int arm_register(struct file_info *fi, struct pending_request *req) return (-EINVAL); } /* addr-list-entry for fileinfo */ - addr = kmalloc(sizeof(*addr), SLAB_KERNEL); + addr = kmalloc(sizeof(*addr), GFP_KERNEL); if (!addr) { req->req.length = 0; return (-ENOMEM); @@ -2103,7 +2103,7 @@ static int write_phypacket(struct file_info *fi, struct pending_request *req) static int get_config_rom(struct file_info *fi, struct pending_request *req) { int ret = sizeof(struct raw1394_request); - quadlet_t *data = kmalloc(req->req.length, SLAB_KERNEL); + quadlet_t *data = kmalloc(req->req.length, GFP_KERNEL); int 
status; if (!data) @@ -2133,7 +2133,7 @@ static int get_config_rom(struct file_info *fi, struct pending_request *req) static int update_config_rom(struct file_info *fi, struct pending_request *req) { int ret = sizeof(struct raw1394_request); - quadlet_t *data = kmalloc(req->req.length, SLAB_KERNEL); + quadlet_t *data = kmalloc(req->req.length, GFP_KERNEL); if (!data) return -ENOMEM; if (copy_from_user(data, int2ptr(req->req.sendb), req->req.length)) { @@ -2779,7 +2779,7 @@ static int raw1394_open(struct inode *inode, struct file *file) { struct file_info *fi; - fi = kzalloc(sizeof(*fi), SLAB_KERNEL); + fi = kzalloc(sizeof(*fi), GFP_KERNEL); if (!fi) return -ENOMEM; diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c index 214e2fdddee..0d6e2c4bb24 100644 --- a/drivers/infiniband/hw/ehca/ehca_av.c +++ b/drivers/infiniband/hw/ehca/ehca_av.c @@ -57,7 +57,7 @@ struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, ib_device); - av = kmem_cache_alloc(av_cache, SLAB_KERNEL); + av = kmem_cache_alloc(av_cache, GFP_KERNEL); if (!av) { ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p", pd, ah_attr); diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index 458fe19648a..93995b658d9 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -134,7 +134,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, if (cqe >= 0xFFFFFFFF - 64 - additional_cqe) return ERR_PTR(-EINVAL); - my_cq = kmem_cache_alloc(cq_cache, SLAB_KERNEL); + my_cq = kmem_cache_alloc(cq_cache, GFP_KERNEL); if (!my_cq) { ehca_err(device, "Out of memory for ehca_cq struct device=%p", device); diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 3d1c1c53503..cc47e4c13a1 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ 
b/drivers/infiniband/hw/ehca/ehca_main.c @@ -108,7 +108,7 @@ static struct kmem_cache *ctblk_cache = NULL; void *ehca_alloc_fw_ctrlblock(void) { - void *ret = kmem_cache_zalloc(ctblk_cache, SLAB_KERNEL); + void *ret = kmem_cache_zalloc(ctblk_cache, GFP_KERNEL); if (!ret) ehca_gen_err("Out of memory for ctblk"); return ret; diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index abce676c0ae..0a5e2214cc5 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -53,7 +53,7 @@ static struct ehca_mr *ehca_mr_new(void) { struct ehca_mr *me; - me = kmem_cache_alloc(mr_cache, SLAB_KERNEL); + me = kmem_cache_alloc(mr_cache, GFP_KERNEL); if (me) { memset(me, 0, sizeof(struct ehca_mr)); spin_lock_init(&me->mrlock); @@ -72,7 +72,7 @@ static struct ehca_mw *ehca_mw_new(void) { struct ehca_mw *me; - me = kmem_cache_alloc(mw_cache, SLAB_KERNEL); + me = kmem_cache_alloc(mw_cache, GFP_KERNEL); if (me) { memset(me, 0, sizeof(struct ehca_mw)); spin_lock_init(&me->mwlock); diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c index 2c3cdc6f7b3..d5345e5b3cd 100644 --- a/drivers/infiniband/hw/ehca/ehca_pd.c +++ b/drivers/infiniband/hw/ehca/ehca_pd.c @@ -50,7 +50,7 @@ struct ib_pd *ehca_alloc_pd(struct ib_device *device, { struct ehca_pd *pd; - pd = kmem_cache_alloc(pd_cache, SLAB_KERNEL); + pd = kmem_cache_alloc(pd_cache, GFP_KERNEL); if (!pd) { ehca_err(device, "device=%p context=%p out of memory", device, context); diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 8682aa50c70..c6c9cef203e 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -450,7 +450,7 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd, if (pd->uobject && udata) context = pd->uobject->context; - my_qp = kmem_cache_alloc(qp_cache, SLAB_KERNEL); + my_qp = kmem_cache_alloc(qp_cache, GFP_KERNEL); if (!my_qp) { 
ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd); return ERR_PTR(-ENOMEM); diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index f56d6a0f062..0517c7387d6 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -189,7 +189,7 @@ static int ads7846_read12_ser(struct device *dev, unsigned command) { struct spi_device *spi = to_spi_device(dev); struct ads7846 *ts = dev_get_drvdata(dev); - struct ser_req *req = kzalloc(sizeof *req, SLAB_KERNEL); + struct ser_req *req = kzalloc(sizeof *req, GFP_KERNEL); int status; int sample; int i; diff --git a/drivers/isdn/gigaset/bas-gigaset.c b/drivers/isdn/gigaset/bas-gigaset.c index 5857e7e23f8..63b629b1cdb 100644 --- a/drivers/isdn/gigaset/bas-gigaset.c +++ b/drivers/isdn/gigaset/bas-gigaset.c @@ -2218,21 +2218,21 @@ static int gigaset_probe(struct usb_interface *interface, * - three for the different uses of the default control pipe * - three for each isochronous pipe */ - if (!(ucs->urb_int_in = usb_alloc_urb(0, SLAB_KERNEL)) || - !(ucs->urb_cmd_in = usb_alloc_urb(0, SLAB_KERNEL)) || - !(ucs->urb_cmd_out = usb_alloc_urb(0, SLAB_KERNEL)) || - !(ucs->urb_ctrl = usb_alloc_urb(0, SLAB_KERNEL))) + if (!(ucs->urb_int_in = usb_alloc_urb(0, GFP_KERNEL)) || + !(ucs->urb_cmd_in = usb_alloc_urb(0, GFP_KERNEL)) || + !(ucs->urb_cmd_out = usb_alloc_urb(0, GFP_KERNEL)) || + !(ucs->urb_ctrl = usb_alloc_urb(0, GFP_KERNEL))) goto allocerr; for (j = 0; j < 2; ++j) { ubc = cs->bcs[j].hw.bas; for (i = 0; i < BAS_OUTURBS; ++i) if (!(ubc->isoouturbs[i].urb = - usb_alloc_urb(BAS_NUMFRAMES, SLAB_KERNEL))) + usb_alloc_urb(BAS_NUMFRAMES, GFP_KERNEL))) goto allocerr; for (i = 0; i < BAS_INURBS; ++i) if (!(ubc->isoinurbs[i] = - usb_alloc_urb(BAS_NUMFRAMES, SLAB_KERNEL))) + usb_alloc_urb(BAS_NUMFRAMES, GFP_KERNEL))) goto allocerr; } @@ -2246,7 +2246,7 @@ static int gigaset_probe(struct usb_interface *interface, (endpoint->bEndpointAddress) & 0x0f), ucs->int_in_buf, 3, 
read_int_callback, cs, endpoint->bInterval); - if ((rc = usb_submit_urb(ucs->urb_int_in, SLAB_KERNEL)) != 0) { + if ((rc = usb_submit_urb(ucs->urb_int_in, GFP_KERNEL)) != 0) { dev_err(cs->dev, "could not submit interrupt URB: %s\n", get_usb_rcmsg(rc)); goto error; diff --git a/drivers/isdn/gigaset/usb-gigaset.c b/drivers/isdn/gigaset/usb-gigaset.c index af89ce188f2..04f2ad7ba8b 100644 --- a/drivers/isdn/gigaset/usb-gigaset.c +++ b/drivers/isdn/gigaset/usb-gigaset.c @@ -763,7 +763,7 @@ static int gigaset_probe(struct usb_interface *interface, goto error; } - ucs->bulk_out_urb = usb_alloc_urb(0, SLAB_KERNEL); + ucs->bulk_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!ucs->bulk_out_urb) { dev_err(cs->dev, "Couldn't allocate bulk_out_urb\n"); retval = -ENOMEM; @@ -774,7 +774,7 @@ static int gigaset_probe(struct usb_interface *interface, atomic_set(&ucs->busy, 0); - ucs->read_urb = usb_alloc_urb(0, SLAB_KERNEL); + ucs->read_urb = usb_alloc_urb(0, GFP_KERNEL); if (!ucs->read_urb) { dev_err(cs->dev, "No free urbs available\n"); retval = -ENOMEM; @@ -797,7 +797,7 @@ static int gigaset_probe(struct usb_interface *interface, gigaset_read_int_callback, cs->inbuf + 0, endpoint->bInterval); - retval = usb_submit_urb(ucs->read_urb, SLAB_KERNEL); + retval = usb_submit_urb(ucs->read_urb, GFP_KERNEL); if (retval) { dev_err(cs->dev, "Could not submit URB (error %d)\n", -retval); goto error; diff --git a/drivers/media/dvb/cinergyT2/cinergyT2.c b/drivers/media/dvb/cinergyT2/cinergyT2.c index 206c13e47a0..9123147e376 100644 --- a/drivers/media/dvb/cinergyT2/cinergyT2.c +++ b/drivers/media/dvb/cinergyT2/cinergyT2.c @@ -287,7 +287,7 @@ static int cinergyt2_alloc_stream_urbs (struct cinergyt2 *cinergyt2) int i; cinergyt2->streambuf = usb_buffer_alloc(cinergyt2->udev, STREAM_URB_COUNT*STREAM_BUF_SIZE, - SLAB_KERNEL, &cinergyt2->streambuf_dmahandle); + GFP_KERNEL, &cinergyt2->streambuf_dmahandle); if (!cinergyt2->streambuf) { dprintk(1, "failed to alloc consistent stream memory area, bailing 
out!\n"); return -ENOMEM; diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index ef4a731ca5c..334e078ffaf 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -451,7 +451,7 @@ static int __devinit m25p_probe(struct spi_device *spi) return -ENODEV; } - flash = kzalloc(sizeof *flash, SLAB_KERNEL); + flash = kzalloc(sizeof *flash, GFP_KERNEL); if (!flash) return -ENOMEM; diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index ccd4dafce8e..b318500785e 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -6940,7 +6940,7 @@ static int __devinit ipr_alloc_cmd_blks(struct ipr_ioa_cfg *ioa_cfg) return -ENOMEM; for (i = 0; i < IPR_NUM_CMD_BLKS; i++) { - ipr_cmd = pci_pool_alloc (ioa_cfg->ipr_cmd_pool, SLAB_KERNEL, &dma_addr); + ipr_cmd = pci_pool_alloc (ioa_cfg->ipr_cmd_pool, GFP_KERNEL, &dma_addr); if (!ipr_cmd) { ipr_free_cmd_blks(ioa_cfg); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index c3c0626f550..09f2c74a40c 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -360,7 +360,7 @@ spi_alloc_master(struct device *dev, unsigned size) if (!dev) return NULL; - master = kzalloc(size + sizeof *master, SLAB_KERNEL); + master = kzalloc(size + sizeof *master, GFP_KERNEL); if (!master) return NULL; @@ -607,7 +607,7 @@ static int __init spi_init(void) { int status; - buf = kmalloc(SPI_BUFSIZ, SLAB_KERNEL); + buf = kmalloc(SPI_BUFSIZ, GFP_KERNEL); if (!buf) { status = -ENOMEM; goto err0; diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c index 08c1c57c612..57289b61d0b 100644 --- a/drivers/spi/spi_bitbang.c +++ b/drivers/spi/spi_bitbang.c @@ -196,7 +196,7 @@ int spi_bitbang_setup(struct spi_device *spi) return -EINVAL; if (!cs) { - cs = kzalloc(sizeof *cs, SLAB_KERNEL); + cs = kzalloc(sizeof *cs, GFP_KERNEL); if (!cs) return -ENOMEM; spi->controller_state = cs; diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 0a46acf557a..77c05be5241 100644 --- a/drivers/usb/core/hub.c +++ 
b/drivers/usb/core/hub.c @@ -2371,7 +2371,7 @@ check_highspeed (struct usb_hub *hub, struct usb_device *udev, int port1) struct usb_qualifier_descriptor *qual; int status; - qual = kmalloc (sizeof *qual, SLAB_KERNEL); + qual = kmalloc (sizeof *qual, GFP_KERNEL); if (qual == NULL) return; @@ -2922,7 +2922,7 @@ static int config_descriptors_changed(struct usb_device *udev) if (len < le16_to_cpu(udev->config[index].desc.wTotalLength)) len = le16_to_cpu(udev->config[index].desc.wTotalLength); } - buf = kmalloc (len, SLAB_KERNEL); + buf = kmalloc (len, GFP_KERNEL); if (buf == NULL) { dev_err(&udev->dev, "no mem to re-read configs after reset\n"); /* assume the worst */ diff --git a/drivers/usb/gadget/gmidi.c b/drivers/usb/gadget/gmidi.c index 64554acad63..31351826f2b 100644 --- a/drivers/usb/gadget/gmidi.c +++ b/drivers/usb/gadget/gmidi.c @@ -1236,7 +1236,7 @@ autoconf_fail: /* ok, we made sense of the hardware ... */ - dev = kzalloc(sizeof(*dev), SLAB_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { return -ENOMEM; } diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c index a3076da3f4e..805a9826842 100644 --- a/drivers/usb/gadget/goku_udc.c +++ b/drivers/usb/gadget/goku_udc.c @@ -1864,7 +1864,7 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id) } /* alloc, and start init */ - dev = kmalloc (sizeof *dev, SLAB_KERNEL); + dev = kmalloc (sizeof *dev, GFP_KERNEL); if (dev == NULL){ pr_debug("enomem %s\n", pci_name(pdev)); retval = -ENOMEM; diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index 86924f9cdd7..3fb1044a4db 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -412,7 +412,7 @@ ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) /* FIXME readahead for O_NONBLOCK and poll(); careful with ZLPs */ value = -ENOMEM; - kbuf = kmalloc (len, SLAB_KERNEL); + kbuf = kmalloc (len, GFP_KERNEL); if (unlikely (!kbuf)) goto free1; @@ -456,7 +456,7 @@ 
ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) /* FIXME writebehind for O_NONBLOCK and poll(), qlen = 1 */ value = -ENOMEM; - kbuf = kmalloc (len, SLAB_KERNEL); + kbuf = kmalloc (len, GFP_KERNEL); if (!kbuf) goto free1; if (copy_from_user (kbuf, buf, len)) { @@ -1898,7 +1898,7 @@ dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) buf += 4; length -= 4; - kbuf = kmalloc (length, SLAB_KERNEL); + kbuf = kmalloc (length, GFP_KERNEL); if (!kbuf) return -ENOMEM; if (copy_from_user (kbuf, buf, length)) { diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c index 0b590831582..3024c679e38 100644 --- a/drivers/usb/gadget/net2280.c +++ b/drivers/usb/gadget/net2280.c @@ -2861,7 +2861,7 @@ static int net2280_probe (struct pci_dev *pdev, const struct pci_device_id *id) } /* alloc, and start init */ - dev = kzalloc (sizeof *dev, SLAB_KERNEL); + dev = kzalloc (sizeof *dev, GFP_KERNEL); if (dev == NULL){ retval = -ENOMEM; goto done; diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index 48a09fd89d1..030d87c28c2 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -2581,7 +2581,7 @@ omap_udc_setup(struct platform_device *odev, struct otg_transceiver *xceiv) /* UDC_PULLUP_EN gates the chip clock */ // OTG_SYSCON_1_REG |= DEV_IDLE_EN; - udc = kzalloc(sizeof(*udc), SLAB_KERNEL); + udc = kzalloc(sizeof(*udc), GFP_KERNEL); if (!udc) return -ENOMEM; diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c index 0f809dd6849..40710ea1b49 100644 --- a/drivers/usb/gadget/zero.c +++ b/drivers/usb/gadget/zero.c @@ -1190,7 +1190,7 @@ autoconf_fail: /* ok, we made sense of the hardware ... 
*/ - dev = kzalloc(sizeof(*dev), SLAB_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; spin_lock_init (&dev->lock); diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c index 396dc69d4b4..7fd872aa654 100644 --- a/drivers/usb/host/hc_crisv10.c +++ b/drivers/usb/host/hc_crisv10.c @@ -188,7 +188,7 @@ static DEFINE_TIMER(bulk_eot_timer, NULL, 0, 0); #define CHECK_ALIGN(x) if (((__u32)(x)) & 0x00000003) \ {panic("Alignment check (DWORD) failed at %s:%s:%d\n", __FILE__, __FUNCTION__, __LINE__);} -#define SLAB_FLAG (in_interrupt() ? GFP_ATOMIC : SLAB_KERNEL) +#define SLAB_FLAG (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) #define KMALLOC_FLAG (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) /* Most helpful debugging aid */ diff --git a/drivers/usb/host/ohci-pnx4008.c b/drivers/usb/host/ohci-pnx4008.c index 2dbb7741490..7f26f9bdbaf 100644 --- a/drivers/usb/host/ohci-pnx4008.c +++ b/drivers/usb/host/ohci-pnx4008.c @@ -134,7 +134,7 @@ static int isp1301_attach(struct i2c_adapter *adap, int addr, int kind) { struct i2c_client *c; - c = (struct i2c_client *)kzalloc(sizeof(*c), SLAB_KERNEL); + c = (struct i2c_client *)kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return -ENOMEM; diff --git a/drivers/usb/input/acecad.c b/drivers/usb/input/acecad.c index 0096373b5f9..909138e5aa0 100644 --- a/drivers/usb/input/acecad.c +++ b/drivers/usb/input/acecad.c @@ -152,7 +152,7 @@ static int usb_acecad_probe(struct usb_interface *intf, const struct usb_device_ if (!acecad || !input_dev) goto fail1; - acecad->data = usb_buffer_alloc(dev, 8, SLAB_KERNEL, &acecad->data_dma); + acecad->data = usb_buffer_alloc(dev, 8, GFP_KERNEL, &acecad->data_dma); if (!acecad->data) goto fail1; diff --git a/drivers/usb/input/usbtouchscreen.c b/drivers/usb/input/usbtouchscreen.c index 49704d4ed0e..7f3c57da9bc 100644 --- a/drivers/usb/input/usbtouchscreen.c +++ b/drivers/usb/input/usbtouchscreen.c @@ -680,7 +680,7 @@ static int usbtouch_probe(struct usb_interface *intf, 
type->process_pkt = usbtouch_process_pkt; usbtouch->data = usb_buffer_alloc(udev, type->rept_size, - SLAB_KERNEL, &usbtouch->data_dma); + GFP_KERNEL, &usbtouch->data_dma); if (!usbtouch->data) goto out_free; diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index ea04dccdc65..fb321864a92 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -213,7 +213,7 @@ static struct urb *simple_alloc_urb ( if (bytes < 0) return NULL; - urb = usb_alloc_urb (0, SLAB_KERNEL); + urb = usb_alloc_urb (0, GFP_KERNEL); if (!urb) return urb; usb_fill_bulk_urb (urb, udev, pipe, NULL, bytes, simple_callback, NULL); @@ -223,7 +223,7 @@ static struct urb *simple_alloc_urb ( urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; if (usb_pipein (pipe)) urb->transfer_flags |= URB_SHORT_NOT_OK; - urb->transfer_buffer = usb_buffer_alloc (udev, bytes, SLAB_KERNEL, + urb->transfer_buffer = usb_buffer_alloc (udev, bytes, GFP_KERNEL, &urb->transfer_dma); if (!urb->transfer_buffer) { usb_free_urb (urb); @@ -315,7 +315,7 @@ static int simple_io ( init_completion (&completion); if (usb_pipeout (urb->pipe)) simple_fill_buf (urb); - if ((retval = usb_submit_urb (urb, SLAB_KERNEL)) != 0) + if ((retval = usb_submit_urb (urb, GFP_KERNEL)) != 0) break; /* NOTE: no timeouts; can't be broken out of by interrupt */ @@ -374,7 +374,7 @@ alloc_sglist (int nents, int max, int vary) unsigned i; unsigned size = max; - sg = kmalloc (nents * sizeof *sg, SLAB_KERNEL); + sg = kmalloc (nents * sizeof *sg, GFP_KERNEL); if (!sg) return NULL; @@ -382,7 +382,7 @@ alloc_sglist (int nents, int max, int vary) char *buf; unsigned j; - buf = kzalloc (size, SLAB_KERNEL); + buf = kzalloc (size, GFP_KERNEL); if (!buf) { free_sglist (sg, i); return NULL; @@ -428,7 +428,7 @@ static int perform_sglist ( (udev->speed == USB_SPEED_HIGH) ? 
(INTERRUPT_RATE << 3) : INTERRUPT_RATE, - sg, nents, 0, SLAB_KERNEL); + sg, nents, 0, GFP_KERNEL); if (retval) break; @@ -855,7 +855,7 @@ test_ctrl_queue (struct usbtest_dev *dev, struct usbtest_param *param) * as with bulk/intr sglists, sglen is the queue depth; it also * controls which subtests run (more tests than sglen) or rerun. */ - urb = kcalloc(param->sglen, sizeof(struct urb *), SLAB_KERNEL); + urb = kcalloc(param->sglen, sizeof(struct urb *), GFP_KERNEL); if (!urb) return -ENOMEM; for (i = 0; i < param->sglen; i++) { @@ -981,7 +981,7 @@ test_ctrl_queue (struct usbtest_dev *dev, struct usbtest_param *param) if (!u) goto cleanup; - reqp = usb_buffer_alloc (udev, sizeof *reqp, SLAB_KERNEL, + reqp = usb_buffer_alloc (udev, sizeof *reqp, GFP_KERNEL, &u->setup_dma); if (!reqp) goto cleanup; @@ -1067,7 +1067,7 @@ static int unlink1 (struct usbtest_dev *dev, int pipe, int size, int async) * FIXME want additional tests for when endpoint is STALLing * due to errors, or is just NAKing requests. 
*/ - if ((retval = usb_submit_urb (urb, SLAB_KERNEL)) != 0) { + if ((retval = usb_submit_urb (urb, GFP_KERNEL)) != 0) { dev_dbg (&dev->intf->dev, "submit fail %d\n", retval); return retval; } @@ -1251,7 +1251,7 @@ static int ctrl_out (struct usbtest_dev *dev, if (length < 1 || length > 0xffff || vary >= length) return -EINVAL; - buf = kmalloc(length, SLAB_KERNEL); + buf = kmalloc(length, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -1403,7 +1403,7 @@ static struct urb *iso_alloc_urb ( maxp *= 1 + (0x3 & (le16_to_cpu(desc->wMaxPacketSize) >> 11)); packets = (bytes + maxp - 1) / maxp; - urb = usb_alloc_urb (packets, SLAB_KERNEL); + urb = usb_alloc_urb (packets, GFP_KERNEL); if (!urb) return urb; urb->dev = udev; @@ -1411,7 +1411,7 @@ static struct urb *iso_alloc_urb ( urb->number_of_packets = packets; urb->transfer_buffer_length = bytes; - urb->transfer_buffer = usb_buffer_alloc (udev, bytes, SLAB_KERNEL, + urb->transfer_buffer = usb_buffer_alloc (udev, bytes, GFP_KERNEL, &urb->transfer_dma); if (!urb->transfer_buffer) { usb_free_urb (urb); @@ -1900,7 +1900,7 @@ usbtest_probe (struct usb_interface *intf, const struct usb_device_id *id) } #endif - dev = kzalloc(sizeof(*dev), SLAB_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; info = (struct usbtest_info *) id->driver_info; @@ -1910,7 +1910,7 @@ usbtest_probe (struct usb_interface *intf, const struct usb_device_id *id) dev->intf = intf; /* cacheline-aligned scratch for i/o */ - if ((dev->buf = kmalloc (TBUF_SIZE, SLAB_KERNEL)) == NULL) { + if ((dev->buf = kmalloc (TBUF_SIZE, GFP_KERNEL)) == NULL) { kfree (dev); return -ENOMEM; } diff --git a/drivers/usb/net/rndis_host.c b/drivers/usb/net/rndis_host.c index c2a28d88ef3..99f26b3e502 100644 --- a/drivers/usb/net/rndis_host.c +++ b/drivers/usb/net/rndis_host.c @@ -469,7 +469,7 @@ static void rndis_unbind(struct usbnet *dev, struct usb_interface *intf) struct rndis_halt *halt; /* try to clear any rndis state/activity (no i/o from stack!) 
*/ - halt = kcalloc(1, sizeof *halt, SLAB_KERNEL); + halt = kcalloc(1, sizeof *halt, GFP_KERNEL); if (halt) { halt->msg_type = RNDIS_MSG_HALT; halt->msg_len = ccpu2(sizeof *halt); diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c index 327f9755567..6e39e998825 100644 --- a/drivers/usb/net/usbnet.c +++ b/drivers/usb/net/usbnet.c @@ -179,9 +179,9 @@ static int init_status (struct usbnet *dev, struct usb_interface *intf) period = max ((int) dev->status->desc.bInterval, (dev->udev->speed == USB_SPEED_HIGH) ? 7 : 3); - buf = kmalloc (maxp, SLAB_KERNEL); + buf = kmalloc (maxp, GFP_KERNEL); if (buf) { - dev->interrupt = usb_alloc_urb (0, SLAB_KERNEL); + dev->interrupt = usb_alloc_urb (0, GFP_KERNEL); if (!dev->interrupt) { kfree (buf); return -ENOMEM; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 9ade139086f..52eb10ca654 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -217,7 +217,7 @@ static kmem_cache_t *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, SLAB_KERNEL); + ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/super.c b/fs/affs/super.c index 5ea72c3a16c..81c73ec09f6 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -71,7 +71,7 @@ static kmem_cache_t * affs_inode_cachep; static struct inode *affs_alloc_inode(struct super_block *sb) { struct affs_inode_info *ei; - ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, SLAB_KERNEL); + ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->vfs_inode.i_version = 1; diff --git a/fs/afs/super.c b/fs/afs/super.c index 67d1f5c819e..c6ead009bf7 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -412,7 +412,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb) struct afs_vnode *vnode; vnode = 
(struct afs_vnode *) - kmem_cache_alloc(afs_inode_cachep, SLAB_KERNEL); + kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL); if (!vnode) return NULL; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 07f7144f0e2..995348df94a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb) { struct befs_inode_info *bi; bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep, - SLAB_KERNEL); + GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index ed27ffb3459..2e45123c8f7 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -233,7 +233,7 @@ static kmem_cache_t * bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) { struct bfs_inode_info *bi; - bi = kmem_cache_alloc(bfs_inode_cachep, SLAB_KERNEL); + bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/block_dev.c b/fs/block_dev.c index 36c0e7af9d0..063506705f2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -239,7 +239,7 @@ static kmem_cache_t * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) { - struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, SLAB_KERNEL); + struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 84976cdbe71..84168629cea 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -245,7 +245,7 @@ static struct inode * cifs_alloc_inode(struct super_block *sb) { struct cifsInodeInfo *cifs_inode; - cifs_inode = kmem_cache_alloc(cifs_inode_cachep, SLAB_KERNEL); + cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; cifs_inode->cifsAttrs = 0x20; /* default */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8355daff504..aedf683f011 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -153,7 +153,7 @@ 
cifs_buf_get(void) albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ ret_buf = - (struct smb_hdr *) mempool_alloc(cifs_req_poolp, SLAB_KERNEL | GFP_NOFS); + (struct smb_hdr *) mempool_alloc(cifs_req_poolp, GFP_KERNEL | GFP_NOFS); /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ @@ -192,7 +192,7 @@ cifs_small_buf_get(void) albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ ret_buf = - (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, SLAB_KERNEL | GFP_NOFS); + (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, GFP_KERNEL | GFP_NOFS); if (ret_buf) { /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7514237cf31..1f727765a8e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -51,7 +51,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) } temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp, - SLAB_KERNEL | GFP_NOFS); + GFP_KERNEL | GFP_NOFS); if (temp == NULL) return temp; else { @@ -118,7 +118,7 @@ AllocOplockQEntry(struct inode * pinode, __u16 fid, struct cifsTconInfo * tcon) return NULL; } temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep, - SLAB_KERNEL); + GFP_KERNEL); if (temp == NULL) return temp; else { diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 88d12332116..50cedd2617d 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -43,7 +43,7 @@ static kmem_cache_t * coda_inode_cachep; static struct inode *coda_alloc_inode(struct super_block *sb) { struct coda_inode_info *ei; - ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, SLAB_KERNEL); + ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL); if (!ei) return NULL; memset(&ei->c_fid, 0, sizeof(struct CodaFid)); diff --git a/fs/dnotify.c 
b/fs/dnotify.c index 2b0442db67e..e778b1737b7 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -77,7 +77,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) inode = filp->f_dentry->d_inode; if (!S_ISDIR(inode->i_mode)) return -ENOTDIR; - dn = kmem_cache_alloc(dn_cache, SLAB_KERNEL); + dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); if (dn == NULL) return -ENOMEM; spin_lock(&inode->i_lock); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 776b2eed371..7196f50fe15 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -628,7 +628,7 @@ int ecryptfs_decrypt_page(struct file *file, struct page *page) num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size; base_extent = (page->index * num_extents_per_page); lower_page_virt = kmem_cache_alloc(ecryptfs_lower_page_cache, - SLAB_KERNEL); + GFP_KERNEL); if (!lower_page_virt) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Error getting page for encrypted " diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a92ef05eff8..42099e779a5 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -250,7 +250,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) int lower_flags; /* Released in ecryptfs_release or end of function if failure */ - file_info = kmem_cache_alloc(ecryptfs_file_info_cache, SLAB_KERNEL); + file_info = kmem_cache_alloc(ecryptfs_file_info_cache, GFP_KERNEL); ecryptfs_set_file_private(file, file_info); if (!file_info) { ecryptfs_printk(KERN_ERR, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 70911412044..8a1945a84c3 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -369,7 +369,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, BUG_ON(!atomic_read(&lower_dentry->d_count)); ecryptfs_set_dentry_private(dentry, kmem_cache_alloc(ecryptfs_dentry_info_cache, - SLAB_KERNEL)); + GFP_KERNEL)); if (!ecryptfs_dentry_to_private(dentry)) { rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Out of memory whilst 
attempting " @@ -795,7 +795,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) /* Released at out_free: label */ ecryptfs_set_file_private(&fake_ecryptfs_file, kmem_cache_alloc(ecryptfs_file_info_cache, - SLAB_KERNEL)); + GFP_KERNEL)); if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) { rc = -ENOMEM; goto out; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index c3746f56d16..745c0f1bfbb 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -207,7 +207,7 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or * at end of function upon failure */ auth_tok_list_item = - kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, SLAB_KERNEL); + kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, GFP_KERNEL); if (!auth_tok_list_item) { ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); rc = -ENOMEM; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index a78d87d14ba..a2c6ccbce30 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -378,7 +378,7 @@ ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) /* Released in ecryptfs_put_super() */ ecryptfs_set_superblock_private(sb, kmem_cache_alloc(ecryptfs_sb_info_cache, - SLAB_KERNEL)); + GFP_KERNEL)); if (!ecryptfs_superblock_to_private(sb)) { ecryptfs_printk(KERN_WARNING, "Out of memory\n"); rc = -ENOMEM; @@ -402,7 +402,7 @@ ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) /* through deactivate_super(sb) from get_sb_nodev() */ ecryptfs_set_dentry_private(sb->s_root, kmem_cache_alloc(ecryptfs_dentry_info_cache, - SLAB_KERNEL)); + GFP_KERNEL)); if (!ecryptfs_dentry_to_private(sb->s_root)) { ecryptfs_printk(KERN_ERR, "dentry_info_cache alloc failed\n"); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 825757ae486..eaa5daaf106 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -50,7 +50,7 @@ static struct inode 
*ecryptfs_alloc_inode(struct super_block *sb) struct inode *inode = NULL; ecryptfs_inode = kmem_cache_alloc(ecryptfs_inode_info_cache, - SLAB_KERNEL); + GFP_KERNEL); if (unlikely(!ecryptfs_inode)) goto out; ecryptfs_init_crypt_stat(&ecryptfs_inode->crypt_stat); diff --git a/fs/efs/super.c b/fs/efs/super.c index b3f50651eb6..69b15a996cf 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -57,7 +57,7 @@ static kmem_cache_t * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; - ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, SLAB_KERNEL); + ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ae228ec54e9..f5c88435c6b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -961,7 +961,7 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, struct epitem *epi = ep_item_from_epqueue(pt); struct eppoll_entry *pwq; - if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, SLAB_KERNEL))) { + if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; @@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct ep_pqueue epq; error = -ENOMEM; - if (!(epi = kmem_cache_alloc(epi_cache, SLAB_KERNEL))) + if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) goto eexit_1; /* Item initialization follow here ... 
*/ diff --git a/fs/exec.c b/fs/exec.c index d993ea1a81a..2092bd20746 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -404,7 +404,7 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->loader += stack_base; bprm->exec += stack_base; - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!mpnt) return -ENOMEM; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d8b9abd95d0..85c237e7385 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -140,7 +140,7 @@ static kmem_cache_t * ext2_inode_cachep; static struct inode *ext2_alloc_inode(struct super_block *sb) { struct ext2_inode_info *ei; - ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL); + ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); if (!ei) return NULL; #ifdef CONFIG_EXT2_FS_POSIX_ACL diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 82cc4f59e3b..8c272278455 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -63,7 +63,7 @@ void fat_cache_destroy(void) static inline struct fat_cache *fat_cache_alloc(struct inode *inode) { - return kmem_cache_alloc(fat_cache_cachep, SLAB_KERNEL); + return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL); } static inline void fat_cache_free(struct fat_cache *cache) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 78945b53b0f..b58fd0c9f3c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -482,7 +482,7 @@ static kmem_cache_t *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; - ei = kmem_cache_alloc(fat_inode_cachep, SLAB_KERNEL); + ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/fcntl.c b/fs/fcntl.c index e4f26165f12..c03dc9cb21c 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -567,7 +567,7 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap int result = 0; if (on) { - new = kmem_cache_alloc(fasync_cache, 
SLAB_KERNEL); + new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); if (!new) return -ENOMEM; } diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 4786d51ad3b..d2dd0d70007 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -103,7 +103,7 @@ vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino) struct vxfs_inode_info *vip; struct vxfs_dinode *dip; - if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, SLAB_KERNEL))) + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) goto fail; dip = (struct vxfs_dinode *)(bp->b_data + offset); memcpy(vip, dip, sizeof(*vip)); @@ -145,7 +145,7 @@ __vxfs_iget(ino_t ino, struct inode *ilistp) struct vxfs_dinode *dip; caddr_t kaddr = (char *)page_address(pp); - if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, SLAB_KERNEL))) + if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) goto fail; dip = (struct vxfs_dinode *)(kaddr + offset); memcpy(vip, dip, sizeof(*vip)); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 66571eafbb1..8c15139f275 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -41,7 +41,7 @@ static void fuse_request_init(struct fuse_req *req) struct fuse_req *fuse_request_alloc(void) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL); + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); if (req) fuse_request_init(req); return req; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fc420357037..e039e2047cc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -46,7 +46,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) struct inode *inode; struct fuse_inode *fi; - inode = kmem_cache_alloc(fuse_inode_cachep, SLAB_KERNEL); + inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL); if (!inode) return NULL; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 85b17b3fa4a..ffc6409132c 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -145,7 +145,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) { 
struct hfs_inode_info *i; - i = kmem_cache_alloc(hfs_inode_cachep, SLAB_KERNEL); + i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 194eede52fa..4a0c70c76c8 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -440,7 +440,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) { struct hfsplus_inode_info *i; - i = kmem_cache_alloc(hfsplus_inode_cachep, SLAB_KERNEL); + i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7f4756963d0..36e52173a54 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -522,7 +522,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; - p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); + p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); if (unlikely(!p)) { hugetlbfs_inc_free_inodes(sbinfo); return NULL; diff --git a/fs/inode.c b/fs/inode.c index 26cdb115ce6..dd15984d51a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -109,7 +109,7 @@ static struct inode *alloc_inode(struct super_block *sb) if (sb->s_op->alloc_inode) inode = sb->s_op->alloc_inode(sb); else - inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); if (inode) { struct address_space * const mapping = &inode->i_data; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index c34b862cdbf..4b6381cd2cf 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -62,7 +62,7 @@ static kmem_cache_t *isofs_inode_cachep; static struct inode *isofs_alloc_inode(struct super_block *sb) { struct iso_inode_info *ei; - ei = kmem_cache_alloc(isofs_inode_cachep, SLAB_KERNEL); + ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git 
a/fs/jffs2/super.c b/fs/jffs2/super.c index bc4b8106a49..77be534ce42 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -33,7 +33,7 @@ static kmem_cache_t *jffs2_inode_cachep; static struct inode *jffs2_alloc_inode(struct super_block *sb) { struct jffs2_inode_info *ei; - ei = (struct jffs2_inode_info *)kmem_cache_alloc(jffs2_inode_cachep, SLAB_KERNEL); + ei = (struct jffs2_inode_info *)kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/locks.c b/fs/locks.c index e0b6a80649a..a7b97d50c1e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -147,7 +147,7 @@ static kmem_cache_t *filelock_cache __read_mostly; /* Allocate an empty lock structure. */ static struct file_lock *locks_alloc_lock(void) { - return kmem_cache_alloc(filelock_cache, SLAB_KERNEL); + return kmem_cache_alloc(filelock_cache, GFP_KERNEL); } static void locks_release_private(struct file_lock *fl) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 1e36bae4d0e..ce532c2deda 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -56,7 +56,7 @@ static kmem_cache_t * minix_inode_cachep; static struct inode *minix_alloc_inode(struct super_block *sb) { struct minix_inode_info *ei; - ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, SLAB_KERNEL); + ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 72dad552aa0..ed84d899220 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -45,7 +45,7 @@ static kmem_cache_t * ncp_inode_cachep; static struct inode *ncp_alloc_inode(struct super_block *sb) { struct ncp_inode_info *ei; - ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, SLAB_KERNEL); + ei = (struct ncp_inode_info *)kmem_cache_alloc(ncp_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bdfabf854a5..769fd0a0c77 
100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -143,7 +143,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; - dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); + dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); if (!dreq) return NULL; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 08cc4c5919a..6b53aae4ed2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1080,7 +1080,7 @@ void nfs4_clear_inode(struct inode *inode) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL); + nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 829af323f28..a1561a820ab 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -26,7 +26,7 @@ static inline struct nfs_page * nfs_page_alloc(void) { struct nfs_page *p; - p = kmem_cache_alloc(nfs_page_cachep, SLAB_KERNEL); + p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->wb_list); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 592a6402e85..911d1bcfc56 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -336,7 +336,7 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) { struct op_inode_info *oi; - oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL); + oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL); if (!oi) return NULL; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 49dfb2ab783..b24cdb2f17c 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -88,7 +88,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); + ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return 
NULL; ei->pid = NULL; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 5a41db2a218..5b943eb11d7 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -520,7 +520,7 @@ static kmem_cache_t *qnx4_inode_cachep; static struct inode *qnx4_alloc_inode(struct super_block *sb) { struct qnx4_inode_info *ei; - ei = kmem_cache_alloc(qnx4_inode_cachep, SLAB_KERNEL); + ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 17249994110..32332516d65 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -496,7 +496,7 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; ei = (struct reiserfs_inode_info *) - kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL); + kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index ddcd9e1ef28..d1b455f9b66 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -555,7 +555,7 @@ static kmem_cache_t * romfs_inode_cachep; static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *ei; - ei = (struct romfs_inode_info *)kmem_cache_alloc(romfs_inode_cachep, SLAB_KERNEL); + ei = (struct romfs_inode_info *)kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 2c122ee83ad..22161710348 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -55,7 +55,7 @@ static kmem_cache_t *smb_inode_cachep; static struct inode *smb_alloc_inode(struct super_block *sb) { struct smb_inode_info *ei; - ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, SLAB_KERNEL); + ei = (struct smb_inode_info *)kmem_cache_alloc(smb_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index 0fb74697abc..3eb1402191b 100644 
--- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -61,7 +61,7 @@ static struct smb_request *smb_do_alloc_request(struct smb_sb_info *server, struct smb_request *req; unsigned char *buf = NULL; - req = kmem_cache_alloc(req_cachep, SLAB_KERNEL); + req = kmem_cache_alloc(req_cachep, GFP_KERNEL); VERBOSE("allocating request: %p\n", req); if (!req) goto out; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index d63c5e48b05..a6ca12b747c 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -307,7 +307,7 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) { struct sysv_inode_info *si; - si = kmem_cache_alloc(sysv_inode_cachep, SLAB_KERNEL); + si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL); if (!si) return NULL; return &si->vfs_inode; diff --git a/fs/udf/super.c b/fs/udf/super.c index 1aea6a4f9a4..e50f24221de 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -112,7 +112,7 @@ static kmem_cache_t * udf_inode_cachep; static struct inode *udf_alloc_inode(struct super_block *sb) { struct udf_inode_info *ei; - ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, SLAB_KERNEL); + ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL); if (!ei) return NULL; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index ec79e3091d1..85a88c0c5e6 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1209,7 +1209,7 @@ static kmem_cache_t * ufs_inode_cachep; static struct inode *ufs_alloc_inode(struct super_block *sb) { struct ufs_inode_info *ei; - ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, SLAB_KERNEL); + ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->vfs_inode.i_version = 1; diff --git a/include/linux/fs.h b/include/linux/fs.h index a8039c8d8cb..94b831b8157 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1483,7 +1483,7 @@ extern void __init vfs_caches_init(unsigned long); extern struct kmem_cache *names_cachep; -#define __getname() 
kmem_cache_alloc(names_cachep, SLAB_KERNEL) +#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) #ifndef CONFIG_AUDITSYSCALL #define putname(name) __putname(name) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index db2c1df4fef..61c2ab634b0 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -34,7 +34,7 @@ extern kmem_cache_t *anon_vma_cachep; static inline struct anon_vma *anon_vma_alloc(void) { - return kmem_cache_alloc(anon_vma_cachep, SLAB_KERNEL); + return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); } static inline void anon_vma_free(struct anon_vma *anon_vma) diff --git a/include/linux/slab.h b/include/linux/slab.h index 34b046ea88f..639f65efa46 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -19,7 +19,6 @@ typedef struct kmem_cache kmem_cache_t; #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ /* flags for kmem_cache_alloc() */ -#define SLAB_KERNEL GFP_KERNEL #define SLAB_DMA GFP_DMA /* flags to pass to kmem_cache_create(). 
diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index 6562a2050a2..f81a5af8a4f 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -35,7 +35,7 @@ static inline void taskstats_tgid_alloc(struct task_struct *tsk) return; /* No problem if kmem_cache_zalloc() fails */ - stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); spin_lock_irq(&tsk->sighand->siglock); if (!sig->stats) { diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7c274002c9f..813bb941342 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -224,7 +224,7 @@ static struct inode *mqueue_alloc_inode(struct super_block *sb) { struct mqueue_inode_info *ei; - ei = kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL); + ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 66a0ea48751..70e9ec60308 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -41,7 +41,7 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { - tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5678e6c61ef..711aa5f10da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -237,7 +237,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -319,7 +319,7 @@ static inline void mm_free_pgd(struct mm_struct * mm) __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) 
(kmem_cache_free(mm_cachep, (mm))) #include @@ -621,7 +621,7 @@ static struct files_struct *alloc_files(void) struct files_struct *newf; struct fdtable *fdt; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d3d28919d4b..1b2b326cf70 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -425,7 +425,7 @@ void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) *mycpu = raw_smp_processor_id(); *ptidstats = NULL; - tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + tmp = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); if (!tmp) return; diff --git a/kernel/user.c b/kernel/user.c index 220e586127a..c1f93c164c9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid) if (!up) { struct user_struct *new; - new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); + new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7b69c90cfd..ad864f8708b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1326,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { int sz = ksize(old->v.zonelist); - new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); + new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); if (!new->v.zonelist) { kmem_cache_free(policy_cache, new); return ERR_PTR(-ENOMEM); diff --git a/mm/mmap.c b/mm/mmap.c index 7b40abd7cba..7be110e98d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return -ENOMEM; @@ -2057,7 +2057,7 @@ struct 
vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_start < new_vma->vm_end) *vmap = new_vma; } else { - new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; pol = mpol_copy(vma_policy(vma)); diff --git a/mm/shmem.c b/mm/shmem.c index 4959535fc14..bdaecfdaabd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2263,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *p; - p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); + p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); if (!p) return NULL; return &p->vfs_inode; diff --git a/mm/slab.c b/mm/slab.c index 9f34b4946fb..1f374c1df01 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2237,7 +2237,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, align = ralign; /* Get cache's description obj. 
*/ - cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); if (!cachep) goto oops; diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index bdbc3f43166..101e5ccaf09 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -590,7 +590,7 @@ create: replace: err = -ENOBUFS; - new_f = kmem_cache_alloc(dn_hash_kmem, SLAB_KERNEL); + new_f = kmem_cache_alloc(dn_hash_kmem, GFP_KERNEL); if (new_f == NULL) goto out; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 107bb6cbb0b..4463443e42c 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -485,13 +485,13 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) goto out; err = -ENOBUFS; - new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (new_fa == NULL) goto out; new_f = NULL; if (!f) { - new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL); + new_f = kmem_cache_alloc(fn_hash_kmem, GFP_KERNEL); if (new_f == NULL) goto out_free_new_fa; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d17990ec724..6be6caf1af3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1187,7 +1187,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) u8 state; err = -ENOBUFS; - new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (new_fa == NULL) goto out; @@ -1232,7 +1232,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) goto out; err = -ENOBUFS; - new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (new_fa == NULL) goto out; diff --git a/net/socket.c b/net/socket.c index e8db54702a6..4f417c2ddc1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -236,7 +236,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) { struct socket_alloc *ei; - ei = 
kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL); + ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL); if (!ei) return NULL; init_waitqueue_head(&ei->socket.wait); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 49dba5febbb..df753d0a884 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -143,7 +143,7 @@ static struct inode * rpc_alloc_inode(struct super_block *sb) { struct rpc_inode *rpci; - rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, SLAB_KERNEL); + rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL); if (!rpci) return NULL; return &rpci->vfs_inode; diff --git a/security/keys/key.c b/security/keys/key.c index 70eacbe5abd..157bac658bf 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -285,7 +285,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, } /* allocate and initialise the key and its description */ - key = kmem_cache_alloc(key_jar, SLAB_KERNEL); + key = kmem_cache_alloc(key_jar, GFP_KERNEL); if (!key) goto no_memory_2; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 78f98fe084e..ac1aeed0b28 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -181,7 +181,7 @@ static int inode_alloc_security(struct inode *inode) struct task_security_struct *tsec = current->security; struct inode_security_struct *isec; - isec = kmem_cache_alloc(sel_inode_cache, SLAB_KERNEL); + isec = kmem_cache_alloc(sel_inode_cache, GFP_KERNEL); if (!isec) return -ENOMEM; diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index d049c7acbc8..2dfc6134c2c 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -36,7 +36,7 @@ avtab_insert_node(struct avtab *h, int hvalue, struct avtab_key *key, struct avtab_datum *datum) { struct avtab_node * newnode; - newnode = kmem_cache_alloc(avtab_node_cachep, SLAB_KERNEL); + newnode = kmem_cache_alloc(avtab_node_cachep, GFP_KERNEL); if (newnode == NULL) return NULL; 
memset(newnode, 0, sizeof(struct avtab_node)); -- cgit v1.2.3-70-g09d2 From 7dfb71030f7636a0d65200158113c37764552f93 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 6 Dec 2006 20:34:23 -0800 Subject: [PATCH] Add include/linux/freezer.h and move definitions from sched.h Move process freezing functions from include/linux/sched.h to freezer.h, so that modifications to the freezer or the kernel configuration don't require recompiling just about everything. [akpm@osdl.org: fix ueagle driver] Signed-off-by: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/signal.c | 1 + arch/avr32/kernel/signal.c | 2 +- arch/frv/kernel/signal.c | 2 +- arch/h8300/kernel/signal.c | 2 +- arch/i386/kernel/io_apic.c | 1 + arch/m32r/kernel/signal.c | 2 +- arch/powerpc/kernel/signal_32.c | 2 +- arch/sh/kernel/signal.c | 1 + arch/sh64/kernel/signal.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/char/hvc_console.c | 1 + drivers/edac/edac_mc.c | 1 + drivers/ieee1394/nodemgr.c | 1 + drivers/input/gameport/gameport.c | 1 + drivers/input/serio/serio.c | 1 + drivers/macintosh/therm_adt746x.c | 1 + drivers/macintosh/via-pmu.c | 2 +- drivers/macintosh/windfarm_core.c | 1 + drivers/md/md.c | 2 +- drivers/media/dvb/dvb-core/dvb_frontend.c | 2 +- drivers/media/video/msp3400-driver.c | 2 +- drivers/media/video/tvaudio.c | 1 + drivers/media/video/video-buf-dvb.c | 2 +- drivers/media/video/vivi.c | 1 + drivers/mfd/ucb1x00-ts.c | 2 +- drivers/net/irda/stir4200.c | 1 + drivers/net/wireless/airo.c | 1 + drivers/pcmcia/cs.c | 1 + drivers/pnp/pnpbios/core.c | 1 + drivers/usb/atm/ueagle-atm.c | 2 + drivers/usb/core/hub.c | 1 + drivers/usb/gadget/file_storage.c | 2 +- drivers/usb/storage/usb.c | 2 +- drivers/w1/w1.c | 1 + fs/afs/kafsasyncd.c | 1 + fs/afs/kafstimod.c | 1 + fs/cifs/cifsfs.c | 1 + fs/cifs/connect.c | 1 + fs/jbd/journal.c | 2 +- fs/jbd2/journal.c | 2 +- fs/jffs/intrep.c | 1 + fs/jffs2/background.c | 1 + 
fs/jfs/jfs_logmgr.c | 2 +- fs/jfs/jfs_txnmgr.c | 2 +- fs/lockd/clntproc.c | 1 + fs/xfs/linux-2.6/xfs_buf.c | 1 + fs/xfs/linux-2.6/xfs_super.c | 1 + include/linux/freezer.h | 84 +++++++++++++++++++++++++++++++ include/linux/sched.h | 81 ----------------------------- init/do_mounts_initrd.c | 1 + kernel/audit.c | 1 + kernel/power/disk.c | 1 + kernel/power/main.c | 1 + kernel/power/process.c | 1 + kernel/power/user.c | 1 + kernel/rtmutex-tester.c | 1 + kernel/sched.c | 2 +- kernel/signal.c | 1 + mm/pdflush.c | 1 + mm/vmscan.c | 1 + net/rxrpc/krxiod.c | 1 + net/rxrpc/krxsecd.c | 1 + net/rxrpc/krxtimod.c | 1 + net/sunrpc/svcsock.c | 1 + 64 files changed, 147 insertions(+), 101 deletions(-) create mode 100644 include/linux/freezer.h (limited to 'arch/i386/kernel') diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 48cf7fffddf..f38a60a03b8 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 33096651c24..0ec14854a20 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index b8a5882b862..85baeae9666 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 7787f70a05b..02955604d76 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 3b7a63e0ed1..44c5a3206b2 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -34,6 +34,7 @@ #include #include #include 
+#include #include #include diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index b60cea4aeba..092ea86bb07 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 320353f0926..e4ebe1a6228 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #endif #include diff --git a/arch/sh/kernel/signal.c b/arch/sh/kernel/signal.c index 50d7c4993be..bb1c480a59c 100644 --- a/arch/sh/kernel/signal.c +++ b/arch/sh/kernel/signal.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/sh64/kernel/signal.c b/arch/sh64/kernel/signal.c index 9e2ffc45c0e..1666d3efb52 100644 --- a/arch/sh64/kernel/signal.c +++ b/arch/sh64/kernel/signal.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index f2904f67af4..e45eaa26411 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c index 9902ffad3b1..cc2cd46bedc 100644 --- a/drivers/char/hvc_console.c +++ b/drivers/char/hvc_console.c @@ -38,6 +38,7 @@ #include #include #include +#include #include diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 75e9e38330f..1b4fc922180 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c index 8e7b83f8448..e829c9336b3 100644 --- a/drivers/ieee1394/nodemgr.c +++ b/drivers/ieee1394/nodemgr.c @@ -15,6 +15,7 @@ #include #include 
#include +#include #include #include "csr.h" diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index a0af97efe6a..79dfb4b25c9 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -23,6 +23,7 @@ #include #include /* HZ */ #include +#include /*#include */ diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c index 211943f85cb..5f1d4032fd5 100644 --- a/drivers/input/serio/serio.c +++ b/drivers/input/serio/serio.c @@ -35,6 +35,7 @@ #include #include #include +#include MODULE_AUTHOR("Vojtech Pavlik "); MODULE_DESCRIPTION("Serio abstraction core"); diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c index 13b953ae8eb..3d3bf1643e7 100644 --- a/drivers/macintosh/therm_adt746x.c +++ b/drivers/macintosh/therm_adt746x.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index e63ea1c1f3c..c8558d4ed50 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c index ab3faa702d5..e947af982f9 100644 --- a/drivers/macintosh/windfarm_core.c +++ b/drivers/macintosh/windfarm_core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include diff --git a/drivers/md/md.c b/drivers/md/md.c index 8cbf9c9df1c..6c4345bde07 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -39,10 +39,10 @@ #include #include #include /* for invalidate_bdev */ -#include #include #include #include +#include #include diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c index a2ab2eebfc6..e85972222ab 100644 --- a/drivers/media/dvb/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include 
#include #include diff --git a/drivers/media/video/msp3400-driver.c b/drivers/media/video/msp3400-driver.c index cf43df3fe70..e1b56dc13c3 100644 --- a/drivers/media/video/msp3400-driver.c +++ b/drivers/media/video/msp3400-driver.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include "msp3400-driver.h" /* ---------------------------------------------------------------------- */ diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c index fcaef4bf828..d506dfaa45a 100644 --- a/drivers/media/video/tvaudio.c +++ b/drivers/media/video/tvaudio.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/drivers/media/video/video-buf-dvb.c b/drivers/media/video/video-buf-dvb.c index f53edf1923b..fcc5467e763 100644 --- a/drivers/media/video/video-buf-dvb.c +++ b/drivers/media/video/video-buf-dvb.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/media/video/vivi.c b/drivers/media/video/vivi.c index 3c8dc72dc8e..9986de5cb3d 100644 --- a/drivers/media/video/vivi.c +++ b/drivers/media/video/vivi.c @@ -36,6 +36,7 @@ #include #include #include +#include /* Wake up at about 30 fps */ #define WAKE_NUMERATOR 30 diff --git a/drivers/mfd/ucb1x00-ts.c b/drivers/mfd/ucb1x00-ts.c index 82938ad6ddb..ce1a4810821 100644 --- a/drivers/mfd/ucb1x00-ts.c +++ b/drivers/mfd/ucb1x00-ts.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c index 3b4c4787593..c14a74634fd 100644 --- a/drivers/net/irda/stir4200.c +++ b/drivers/net/irda/stir4200.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index efcdaf1c5f7..44a22701da9 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "airo.h" diff 
--git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c index f9cd831a3f3..606a4674033 100644 --- a/drivers/pcmcia/cs.c +++ b/drivers/pcmcia/cs.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 81a6c83d89a..81186f479a3 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index f2d196fa1e8..dae4ef1e8fe 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -64,6 +64,8 @@ #include #include #include +#include + #include #include "usbatm.h" diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 77c05be5241..2651c2e2a89 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index 8b975d15538..c98316ce838 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -250,7 +250,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c index b401084b3d2..70644506651 100644 --- a/drivers/usb/storage/usb.c +++ b/drivers/usb/storage/usb.c @@ -49,7 +49,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c index de3e9791f80..63c07243993 100644 --- a/drivers/w1/w1.c +++ b/drivers/w1/w1.c @@ -31,6 +31,7 @@ #include #include #include +#include #include diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index f09a794f248..615df2407cb 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "cell.h" #include "server.h" #include "volume.h" diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c 
index 65bc05ab818..694344e4d3c 100644 --- a/fs/afs/kafstimod.c +++ b/fs/afs/kafstimod.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "cell.h" #include "volume.h" #include "kafstimod.h" diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e6b5866e500..71bc87a37fc 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 71f77914ce9..2caca06b4ba 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include "cifspdu.h" diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index a8774bed20b..10fff944393 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 50356019ae3..44fc32bfd7f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index 4a543e11497..478a74e2e9d 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "intrep.h" #include "jffs_fm.h" diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index ff2a872e80e..6eb3daebd56 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "nodelist.h" diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index b89c9aba046..5065baa530b 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -67,7 +67,7 @@ #include #include /* for sync_blockdev() */ #include -#include +#include #include #include #include "jfs_incore.h" diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 81f6f04af19..d558e51b0df 100644 --- 
a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 3d84f600b63..50643b6a555 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index eef4a0ba11e..b971237c5a9 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -32,6 +32,7 @@ #include #include #include +#include STATIC kmem_zone_t *xfs_buf_zone; STATIC kmem_shaker_t xfs_buf_shake; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index de05abbbe7f..b93265b7c79 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -56,6 +56,7 @@ #include #include #include +#include STATIC struct quotactl_ops xfs_quotactl_operations; STATIC struct super_operations xfs_super_operations; diff --git a/include/linux/freezer.h b/include/linux/freezer.h new file mode 100644 index 00000000000..266373f7444 --- /dev/null +++ b/include/linux/freezer.h @@ -0,0 +1,84 @@ +/* Freezer declarations */ + +#ifdef CONFIG_PM +/* + * Check if a process has been frozen + */ +static inline int frozen(struct task_struct *p) +{ + return p->flags & PF_FROZEN; +} + +/* + * Check if there is a request to freeze a process + */ +static inline int freezing(struct task_struct *p) +{ + return p->flags & PF_FREEZE; +} + +/* + * Request that a process be frozen + * FIXME: SMP problem. We may not modify other process' flags! 
+ */ +static inline void freeze(struct task_struct *p) +{ + p->flags |= PF_FREEZE; +} + +/* + * Sometimes we may need to cancel the previous 'freeze' request + */ +static inline void do_not_freeze(struct task_struct *p) +{ + p->flags &= ~PF_FREEZE; +} + +/* + * Wake up a frozen process + */ +static inline int thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + wake_up_process(p); + return 1; + } + return 0; +} + +/* + * freezing is complete, mark process as frozen + */ +static inline void frozen_process(struct task_struct *p) +{ + p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN; +} + +extern void refrigerator(void); +extern int freeze_processes(void); +extern void thaw_processes(void); + +static inline int try_to_freeze(void) +{ + if (freezing(current)) { + refrigerator(); + return 1; + } else + return 0; +} +#else +static inline int frozen(struct task_struct *p) { return 0; } +static inline int freezing(struct task_struct *p) { return 0; } +static inline void freeze(struct task_struct *p) { BUG(); } +static inline int thaw_process(struct task_struct *p) { return 1; } +static inline void frozen_process(struct task_struct *p) { BUG(); } + +static inline void refrigerator(void) {} +static inline int freeze_processes(void) { BUG(); return 0; } +static inline void thaw_processes(void) {} + +static inline int try_to_freeze(void) { return 0; } + + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index acfd2e15c5f..837a012f573 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1618,87 +1618,6 @@ extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls); extern void normalize_rt_tasks(void); -#ifdef CONFIG_PM -/* - * Check if a process has been frozen - */ -static inline int frozen(struct task_struct *p) -{ - return p->flags & PF_FROZEN; -} - -/* - * Check if there is a request to freeze a process - */ -static inline int freezing(struct task_struct *p) -{ - return p->flags & PF_FREEZE; 
-} - -/* - * Request that a process be frozen - * FIXME: SMP problem. We may not modify other process' flags! - */ -static inline void freeze(struct task_struct *p) -{ - p->flags |= PF_FREEZE; -} - -/* - * Sometimes we may need to cancel the previous 'freeze' request - */ -static inline void do_not_freeze(struct task_struct *p) -{ - p->flags &= ~PF_FREEZE; -} - -/* - * Wake up a frozen process - */ -static inline int thaw_process(struct task_struct *p) -{ - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - wake_up_process(p); - return 1; - } - return 0; -} - -/* - * freezing is complete, mark process as frozen - */ -static inline void frozen_process(struct task_struct *p) -{ - p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN; -} - -extern void refrigerator(void); -extern int freeze_processes(void); -extern void thaw_processes(void); - -static inline int try_to_freeze(void) -{ - if (freezing(current)) { - refrigerator(); - return 1; - } else - return 0; -} -#else -static inline int frozen(struct task_struct *p) { return 0; } -static inline int freezing(struct task_struct *p) { return 0; } -static inline void freeze(struct task_struct *p) { BUG(); } -static inline int thaw_process(struct task_struct *p) { return 1; } -static inline void frozen_process(struct task_struct *p) { BUG(); } - -static inline void refrigerator(void) {} -static inline int freeze_processes(void) { BUG(); return 0; } -static inline void thaw_processes(void) {} - -static inline int try_to_freeze(void) { return 0; } - -#endif /* CONFIG_PM */ #endif /* __KERNEL__ */ #endif diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 919a80cb322..2cfd7cb36e7 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "do_mounts.h" diff --git a/kernel/audit.c b/kernel/audit.c index 98106f6078b..d9b690ac684 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "audit.h" 
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f5079231383..53b3b57c022 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "power.h" diff --git a/kernel/power/main.c b/kernel/power/main.c index 873228c71da..6096c71b182 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "power.h" diff --git a/kernel/power/process.c b/kernel/power/process.c index 72e72d2c61e..29be608e834 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -13,6 +13,7 @@ #include #include #include +#include /* * Timeout for stopping processes diff --git a/kernel/power/user.c b/kernel/power/user.c index a63b25c63b4..26c66941c00 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -22,6 +22,7 @@ #include #include #include +#include #include diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 6dcea9dd8c9..015fc633c96 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "rtmutex.h" diff --git a/kernel/sched.c b/kernel/sched.c index 3399701c680..12fdbef1d9b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 8e19d278548..bc972d7e631 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/pdflush.c b/mm/pdflush.c index b02102feeb4..8ce0900dc95 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -21,6 +21,7 @@ #include // Prototypes pdflush_operation() #include #include +#include /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 2a6a79f6813..f6616e81fac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include diff --git a/net/rxrpc/krxiod.c 
b/net/rxrpc/krxiod.c index dada34a77b2..49effd92144 100644 --- a/net/rxrpc/krxiod.c +++ b/net/rxrpc/krxiod.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c index cea4eb5e249..3ab0f77409f 100644 --- a/net/rxrpc/krxsecd.c +++ b/net/rxrpc/krxsecd.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c index 3e7466900bd..9a9b6132dba 100644 --- a/net/rxrpc/krxtimod.c +++ b/net/rxrpc/krxtimod.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 64ca1f61dd9..1c68956824e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 6cfd76a26d9fe2ba54b9d496a48c1d9285e5c5ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:37:22 -0800 Subject: [PATCH] lockdep: name some old style locks Name some of the remaning 'old_style_spin_init' locks Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/traps.c | 2 +- include/asm-i386/rwsem.h | 4 ++-- include/linux/init_task.h | 2 +- include/linux/mutex.h | 2 +- include/linux/rtmutex.h | 2 +- include/linux/rwsem-spinlock.h | 3 ++- include/linux/sunrpc/sched.h | 4 ++-- kernel/acct.c | 3 ++- kernel/irq/handle.c | 2 +- net/sunrpc/svcauth.c | 3 ++- security/keys/process_keys.c | 2 +- 11 files changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index fe9c5e8e7e6..3124f1b04d6 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -452,7 +452,7 @@ void die(const char * str, struct pt_regs * regs, long err) u32 lock_owner; int lock_owner_depth; } die = { - .lock = 
SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h index bc598d6388e..041906f3c6d 100644 --- a/include/asm-i386/rwsem.h +++ b/include/asm-i386/rwsem.h @@ -75,8 +75,8 @@ struct rw_semaphore { #define __RWSEM_INITIALIZER(name) \ -{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \ - __RWSEM_DEP_MAP_INIT(name) } +{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \ + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 33c5daacc74..733790d4f7d 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -73,7 +73,7 @@ extern struct nsproxy init_nsproxy; #define INIT_NSPROXY(nsproxy) { \ .count = ATOMIC_INIT(1), \ - .nslock = SPIN_LOCK_UNLOCKED, \ + .nslock = __SPIN_LOCK_UNLOCKED(nsproxy.nslock), \ .uts_ns = &init_uts_ns, \ .namespace = NULL, \ INIT_IPC_NS(ipc_ns) \ diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 27c48daa318..b2b91c47756 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -94,7 +94,7 @@ do { \ #define __MUTEX_INITIALIZER(lockname) \ { .count = ATOMIC_INIT(1) \ - , .wait_lock = SPIN_LOCK_UNLOCKED \ + , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 5d41dee82f8..b0090e9f788 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -63,7 +63,7 @@ struct hrtimer_sleeper; #endif #define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = SPIN_LOCK_UNLOCKED \ + { .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, 
mutexname.wait_lock) \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index ae1fcadd598..813cee13da0 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -44,7 +44,8 @@ struct rw_semaphore { #endif #define __RWSEM_INITIALIZER(name) \ -{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } +{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ + __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index f399c138f79..0746c3b16f3 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -222,7 +222,7 @@ struct rpc_wait_queue { #ifndef RPC_DEBUG # define RPC_WAITQ_INIT(var,qname) { \ - .lock = SPIN_LOCK_UNLOCKED, \ + .lock = __SPIN_LOCK_UNLOCKED(var.lock), \ .tasks = { \ [0] = LIST_HEAD_INIT(var.tasks[0]), \ [1] = LIST_HEAD_INIT(var.tasks[1]), \ @@ -231,7 +231,7 @@ struct rpc_wait_queue { } #else # define RPC_WAITQ_INIT(var,qname) { \ - .lock = SPIN_LOCK_UNLOCKED, \ + .lock = __SPIN_LOCK_UNLOCKED(var.lock), \ .tasks = { \ [0] = LIST_HEAD_INIT(var.tasks[0]), \ [1] = LIST_HEAD_INIT(var.tasks[1]), \ diff --git a/kernel/acct.c b/kernel/acct.c index 0aad5ca36a8..dc12db8600e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -89,7 +89,8 @@ struct acct_glbs { struct timer_list timer; }; -static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; +static struct acct_glbs acct_globals __cacheline_aligned = + {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; /* * Called whenever the timer says to check the free space. 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a681912bc89..aff1f0fabb0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index ee9bb1522d5..c7bb5f7f21a 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -119,7 +119,8 @@ EXPORT_SYMBOL(svc_auth_unregister); #define DN_HASHMASK (DN_HASHMAX-1) static struct hlist_head auth_domain_table[DN_HASHMAX]; -static spinlock_t auth_domain_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t auth_domain_lock = + __SPIN_LOCK_UNLOCKED(auth_domain_lock); void auth_domain_put(struct auth_domain *dom) { diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 32150cf7c37..b6f86808475 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(key_session_mutex); struct key_user root_key_user = { .usage = ATOMIC_INIT(3), .consq = LIST_HEAD_INIT(root_key_user.consq), - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = 0, -- cgit v1.2.3-70-g09d2 From 19e5d9c0d2194b4b47189cbec2921cbf72b0bd1c Mon Sep 17 00:00:00 2001 From: Henry Nestler Date: Wed, 6 Dec 2006 20:37:45 -0800 Subject: [PATCH] initrd: remove unused false condition for initrd_start After LOADER_TYPE && INITRD_START are true, the short if-condition for INITRD_START can never be false. Remove unused code from the else condition. 
Signed-off-by: Henry Nestler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/kernel/setup.c | 2 +- arch/i386/kernel/setup.c | 3 +-- arch/m32r/kernel/setup.c | 4 +--- arch/m32r/mm/discontig.c | 4 +--- arch/sh/kernel/setup.c | 3 +-- arch/sh64/kernel/setup.c | 4 +--- arch/x86_64/kernel/setup.c | 3 +-- 7 files changed, 7 insertions(+), 16 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c index a8c61dac1ce..1a5eb6c301c 100644 --- a/arch/frv/kernel/setup.c +++ b/arch/frv/kernel/setup.c @@ -947,7 +947,7 @@ static void __init setup_linux_memory(void) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (low_top_pfn << PAGE_SHIFT)) { reserve_bootmem(INITRD_START, INITRD_SIZE); - initrd_start = INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start + INITRD_SIZE; } else { diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 141041dde74..97bb869307b 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1162,8 +1162,7 @@ void __init setup_bootmem_allocator(void) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { reserve_bootmem(INITRD_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start+INITRD_SIZE; } else { diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c index 0e7778be33c..936205f7aba 100644 --- a/arch/m32r/kernel/setup.c +++ b/arch/m32r/kernel/setup.c @@ -196,9 +196,7 @@ static unsigned long __init setup_memory(void) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { reserve_bootmem(INITRD_START, INITRD_SIZE); - initrd_start = INITRD_START ? 
- INITRD_START + PAGE_OFFSET : 0; - + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start + INITRD_SIZE; printk("initrd:start[%08lx],size[%08lx]\n", initrd_start, INITRD_SIZE); diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index abb34ccd598..c7efdb0aefc 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -105,9 +105,7 @@ unsigned long __init setup_memory(void) if (INITRD_START + INITRD_SIZE <= PFN_PHYS(max_low_pfn)) { reserve_bootmem_node(NODE_DATA(0), INITRD_START, INITRD_SIZE); - initrd_start = INITRD_START ? - INITRD_START + PAGE_OFFSET : 0; - + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start + INITRD_SIZE; printk("initrd:start[%08lx],size[%08lx]\n", initrd_start, INITRD_SIZE); diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 696ca75752d..f8dd6b7bfab 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -332,8 +332,7 @@ void __init setup_arch(char **cmdline_p) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { reserve_bootmem_node(NODE_DATA(0), INITRD_START+__MEMORY_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET + __MEMORY_START : 0; + initrd_start = INITRD_START + PAGE_OFFSET + __MEMORY_START; initrd_end = initrd_start + INITRD_SIZE; } else { printk("initrd extends beyond end of memory " diff --git a/arch/sh64/kernel/setup.c b/arch/sh64/kernel/setup.c index ffb310e33ce..b9e7d54d7b8 100644 --- a/arch/sh64/kernel/setup.c +++ b/arch/sh64/kernel/setup.c @@ -243,9 +243,7 @@ void __init setup_arch(char **cmdline_p) if (INITRD_START + INITRD_SIZE <= (PFN_PHYS(last_pfn))) { reserve_bootmem_node(NODE_DATA(0), INITRD_START + __MEMORY_START, INITRD_SIZE); - initrd_start = - (long) INITRD_START ? 
INITRD_START + PAGE_OFFSET + __MEMORY_START : 0; - + initrd_start = (long) INITRD_START + PAGE_OFFSET + __MEMORY_START; initrd_end = initrd_start + INITRD_SIZE; } else { printk("initrd extends beyond end of memory " diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fc944b5e8f4..f12f266f3e9 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -471,8 +471,7 @@ void __init setup_arch(char **cmdline_p) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { reserve_bootmem_generic(INITRD_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start+INITRD_SIZE; } else { -- cgit v1.2.3-70-g09d2 From b4c6c34a530b4d1c626f4ac0a884e0a9b849378c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 6 Dec 2006 20:38:11 -0800 Subject: [PATCH] kprobes: enable booster on the preemptible kernel When we are unregistering a kprobe-booster, we can't release its instruction buffer immediately on a preemptible kernel, because some processes might have been preempted while executing in the buffer. The freeze_processes() and thaw_processes() functions can clear most processes out of the buffer. There are still some non-frozen threads that have the PF_NOFREEZE flag. If those threads are sleeping (not preempted) at a known place outside the buffer, we can ensure that freeing is safe. However, this check routine takes a long time to run. So, this patch introduces a garbage collection mechanism for insn_slot. It also adds a "dirty" flag to free_insn_slot for efficiency. The "clean" instruction slots (dirty flag is cleared) are released immediately, but the "dirty" slots, which are used by boosted kprobes, are marked as garbage. collect_garbage_slots() will be invoked to release "dirty" slots if there are more than INSNS_PER_PAGE garbage slots or if there are no unused slots. 
Cc: "Keshavamurthy, Anil S" Cc: Ananth N Mavinakayanahalli Cc: "bibo,mao" Cc: Prasanna S Panchamukhi Cc: Yumiko Sugita Cc: Satoshi Oshima Cc: Hideo Aoki Signed-off-by: Masami Hiramatsu Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 4 +- arch/ia64/kernel/kprobes.c | 2 +- arch/powerpc/kernel/kprobes.c | 2 +- arch/s390/kernel/kprobes.c | 2 +- arch/x86_64/kernel/kprobes.c | 2 +- include/linux/kprobes.h | 2 +- kernel/kprobes.c | 117 ++++++++++++++++++++++++++++++++++-------- 7 files changed, 103 insertions(+), 28 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index fc79e1e859c..af1d5334499 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); mutex_unlock(&kprobe_mutex); } @@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 1; ss_probe: -#ifndef CONFIG_PREEMPT +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) if (p->ainsn.boostable == 1 && !p->post_handler){ /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 51217d63285..4d592ee9300 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -481,7 +481,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); + free_insn_slot(p->ainsn.insn, 0); mutex_unlock(&kprobe_mutex); } /* diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 7b8d12b9026..4657563f881 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c 
@@ -85,7 +85,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); + free_insn_slot(p->ainsn.insn, 0); mutex_unlock(&kprobe_mutex); } diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 67914fe7f31..576368c4f60 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -200,7 +200,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); + free_insn_slot(p->ainsn.insn, 0); mutex_unlock(&kprobe_mutex); } diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index ac241567e68..209c8c0bec7 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -224,7 +224,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); + free_insn_slot(p->ainsn.insn, 0); mutex_unlock(&kprobe_mutex); } diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ac4c0559f75..769be39b968 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -165,7 +165,7 @@ extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern kprobe_opcode_t *get_insn_slot(void); -extern void free_insn_slot(kprobe_opcode_t *slot); +extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); extern void kprobes_inc_nmissed_count(struct kprobe *p); /* Get the kprobe at this addr (if any) - called with preemption disabled */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 610c837ad9e..17ec4afb099 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -83,9 +84,36 @@ struct kprobe_insn_page { kprobe_opcode_t 
*insns; /* Page of instruction slots */ char slot_used[INSNS_PER_PAGE]; int nused; + int ngarbage; }; static struct hlist_head kprobe_insn_pages; +static int kprobe_garbage_slots; +static int collect_garbage_slots(void); + +static int __kprobes check_safety(void) +{ + int ret = 0; +#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) + ret = freeze_processes(); + if (ret == 0) { + struct task_struct *p, *q; + do_each_thread(p, q) { + if (p != current && p->state == TASK_RUNNING && + p->pid != 0) { + printk("Check failed: %s is running\n",p->comm); + ret = -1; + goto loop_end; + } + } while_each_thread(p, q); + } +loop_end: + thaw_processes(); +#else + synchronize_sched(); +#endif + return ret; +} /** * get_insn_slot() - Find a slot on an executable page for an instruction. @@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct kprobe_insn_page *kip; struct hlist_node *pos; + retry: hlist_for_each(pos, &kprobe_insn_pages) { kip = hlist_entry(pos, struct kprobe_insn_page, hlist); if (kip->nused < INSNS_PER_PAGE) { @@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } } - /* All out of space. Need to allocate a new page. Use slot 0.*/ + /* If there are any garbage slots, collect it and try again. */ + if (kprobe_garbage_slots && collect_garbage_slots() == 0) { + goto retry; + } + /* All out of space. Need to allocate a new page. Use slot 0. */ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); if (!kip) { return NULL; @@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) memset(kip->slot_used, 0, INSNS_PER_PAGE); kip->slot_used[0] = 1; kip->nused = 1; + kip->ngarbage = 0; return kip->insns; } -void __kprobes free_insn_slot(kprobe_opcode_t *slot) +/* Return 1 if all garbages are collected, otherwise 0. */ +static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) +{ + kip->slot_used[idx] = 0; + kip->nused--; + if (kip->nused == 0) { + /* + * Page is no longer in use. 
Free it unless + * it's the last one. We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. + */ + hlist_del(&kip->hlist); + if (hlist_empty(&kprobe_insn_pages)) { + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, + &kprobe_insn_pages); + } else { + module_free(NULL, kip->insns); + kfree(kip); + } + return 1; + } + return 0; +} + +static int __kprobes collect_garbage_slots(void) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos, *next; + + /* Ensure no-one is preepmted on the garbages */ + if (check_safety() != 0) + return -EAGAIN; + + hlist_for_each_safe(pos, next, &kprobe_insn_pages) { + int i; + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->ngarbage == 0) + continue; + kip->ngarbage = 0; /* we will collect all garbages */ + for (i = 0; i < INSNS_PER_PAGE; i++) { + if (kip->slot_used[i] == -1 && + collect_one_slot(kip, i)) + break; + } + } + kprobe_garbage_slots = 0; + return 0; +} + +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; - kip->slot_used[i] = 0; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. 
- */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } + if (dirty) { + kip->slot_used[i] = -1; + kip->ngarbage++; + } else { + collect_one_slot(kip, i); } - return; + break; } } + if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { + collect_garbage_slots(); + } } #endif -- cgit v1.2.3-70-g09d2 From a38a44c1a93078fc5fadc4ac2df8dea4697069e2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Dec 2006 20:38:16 -0800 Subject: [PATCH] smp_call_function_single() check that local interrupts are enabled smp_call_function_single() can deadlock if the caller disabled local interrupts (the target CPU could be spinning on call_lock). Check for that. Why on earth do these functions use spin_lock_bh()?? Cc: "Randy.Dunlap" Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smp.c | 4 ++++ arch/x86_64/kernel/smp.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 31e5c6573aa..9827cf927ec 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -699,6 +699,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, put_cpu(); return -EBUSY; } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 9f74c883568..32f4d7e2a06 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -379,6 +379,10 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, put_cpu(); return 0; } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + 
spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); -- cgit v1.2.3-70-g09d2 From 02316067852187b8bec781bec07410e91af79627 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:38:17 -0800 Subject: [PATCH] hotplug CPU: clean up hotcpu_notifier() use There was lots of #ifdef noise in the kernel due to hotcpu_notifier(fn, prio) not correctly marking 'fn' as used in the !HOTPLUG_CPU case, and thus generating compiler warnings of unused symbols, hence forcing people to add #ifdefs. the compiler can skip truly unused functions just fine: text data bss dec hex filename 1624412 728710 3674856 6027978 5bfaca vmlinux.before 1624412 728710 3674856 6027978 5bfaca vmlinux.after [akpm@osdl.org: topology.c fix] Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/mcheck/therm_throt.c | 2 -- arch/i386/kernel/cpuid.c | 2 -- arch/i386/kernel/microcode.c | 2 -- arch/i386/kernel/msr.c | 2 -- arch/ia64/kernel/palinfo.c | 2 -- arch/ia64/kernel/salinfo.c | 2 -- arch/s390/appldata/appldata_base.c | 2 -- arch/x86_64/kernel/mce.c | 2 -- arch/x86_64/kernel/mce_amd.c | 4 ++-- arch/x86_64/kernel/vsyscall.c | 2 -- block/ll_rw_blk.c | 4 ---- drivers/base/topology.c | 2 -- drivers/cpufreq/cpufreq.c | 2 -- fs/buffer.c | 2 -- include/linux/cpu.h | 6 +++--- kernel/cpuset.c | 4 ---- kernel/profile.c | 3 +-- kernel/sched.c | 3 --- kernel/workqueue.c | 2 -- lib/radix-tree.c | 2 -- mm/page_alloc.c | 4 ---- mm/swap.c | 2 ++ mm/vmscan.c | 2 -- net/core/dev.c | 2 -- net/core/flow.c | 2 -- 25 files changed, 8 insertions(+), 56 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index bad8b442070..065005c3f16 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -116,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct 
sys_device *sys_dev) return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); } -#ifdef CONFIG_HOTPLUG_CPU static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); @@ -153,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier = { .notifier_call = thermal_throttle_cpu_callback, }; -#endif /* CONFIG_HOTPLUG_CPU */ static __init int thermal_throttle_init_device(void) { diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index ab0c327e79d..23b2cc748d4 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -167,7 +167,6 @@ static int cpuid_device_create(int i) return err; } -#ifdef CONFIG_HOTPLUG_CPU static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -187,7 +186,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = { .notifier_call = cpuid_class_cpu_callback, }; -#endif /* !CONFIG_HOTPLUG_CPU */ static int __init cpuid_init(void) { diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index 23f5984d065..972346604f9 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = { .resume = mc_sysdev_resume, }; -#ifdef CONFIG_HOTPLUG_CPU static __cpuinit int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { @@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) static struct notifier_block mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -#endif static int __init microcode_init (void) { diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index a773f776c9e..7763c67ca28 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -250,7 +250,6 @@ static int msr_device_create(int i) return err; } -#ifdef 
CONFIG_HOTPLUG_CPU static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -271,7 +270,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -#endif static int __init msr_init(void) { diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index 0b546e2b36a..c4c10a0b99d 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -952,7 +952,6 @@ remove_palinfo_proc_entries(unsigned int hcpu) } } -#ifdef CONFIG_HOTPLUG_CPU static int palinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -974,7 +973,6 @@ static struct notifier_block palinfo_cpu_notifier = .notifier_call = palinfo_cpu_callback, .priority = 0, }; -#endif static int __init palinfo_init(void) diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index e63b8ca5344..fd607ca51a8 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -575,7 +575,6 @@ static struct file_operations salinfo_data_fops = { .write = salinfo_log_write, }; -#ifdef CONFIG_HOTPLUG_CPU static int __devinit salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { @@ -620,7 +619,6 @@ static struct notifier_block salinfo_cpu_notifier = .notifier_call = salinfo_cpu_callback, .priority = 0, }; -#endif /* CONFIG_HOTPLUG_CPU */ static int __init salinfo_init(void) diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index 67d5cf9cba8..b8c23729026 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -561,7 +561,6 @@ appldata_offline_cpu(int cpu) spin_unlock(&appldata_timer_lock); } -#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit appldata_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) @@ -582,7 +581,6 @@ appldata_cpu_notify(struct notifier_block *self, static struct notifier_block appldata_nb = { .notifier_call = 
appldata_cpu_notify, }; -#endif /* * appldata_init() diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index c7587fc3901..bc863c464a1 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -641,7 +641,6 @@ static __cpuinit int mce_create_device(unsigned int cpu) return err; } -#ifdef CONFIG_HOTPLUG_CPU static void mce_remove_device(unsigned int cpu) { int i; @@ -674,7 +673,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; -#endif static __init int mce_init_device(void) { diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 883fe747f64..fa09debad4b 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -551,7 +551,6 @@ out: return err; } -#ifdef CONFIG_HOTPLUG_CPU /* * let's be hotplug friendly. * in case of multiple core processors, the first core always takes ownership @@ -594,12 +593,14 @@ static void threshold_remove_bank(unsigned int cpu, int bank) sprintf(name, "threshold_bank%i", bank); +#ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; } +#endif /* remove all sibling symlinks before unregistering */ for_each_cpu_mask(i, b->cpus) { @@ -656,7 +657,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb, static struct notifier_block threshold_cpu_notifier = { .notifier_call = threshold_cpu_callback, }; -#endif /* CONFIG_HOTPLUG_CPU */ static __init int threshold_init_device(void) { diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 630036c06c7..3785e495473 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -275,7 +275,6 @@ static void __cpuinit cpu_vsyscall_init(void *arg) vsyscall_set_cpu(raw_smp_processor_id()); } -#ifdef CONFIG_HOTPLUG_CPU 
static int __cpuinit cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { @@ -284,7 +283,6 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); return NOTIFY_DONE; } -#endif static void __init map_vsyscall(void) { diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index a4ff3271d4a..31512cd9f3a 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3459,8 +3459,6 @@ static void blk_done_softirq(struct softirq_action *h) } } -#ifdef CONFIG_HOTPLUG_CPU - static int blk_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -3486,8 +3484,6 @@ static struct notifier_block __devinitdata blk_cpu_notifier = { .notifier_call = blk_cpu_notify, }; -#endif /* CONFIG_HOTPLUG_CPU */ - /** * blk_complete_request - end I/O on a request * @req: the request being processed diff --git a/drivers/base/topology.c b/drivers/base/topology.c index 3d12b85b096..067a9e8bc37 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -108,7 +108,6 @@ static int __cpuinit topology_add_dev(unsigned int cpu) return rc; } -#ifdef CONFIG_HOTPLUG_CPU static void __cpuinit topology_remove_dev(unsigned int cpu) { struct sys_device *sys_dev = get_cpu_sysdev(cpu); @@ -136,7 +135,6 @@ static int __cpuinit topology_cpu_callback(struct notifier_block *nfb, } return rc ? 
NOTIFY_BAD : NOTIFY_OK; } -#endif static int __cpuinit topology_sysfs_init(void) { diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7a7c6e6dfe4..47ab42db122 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1537,7 +1537,6 @@ int cpufreq_update_policy(unsigned int cpu) } EXPORT_SYMBOL(cpufreq_update_policy); -#ifdef CONFIG_HOTPLUG_CPU static int cpufreq_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1577,7 +1576,6 @@ static struct notifier_block __cpuinitdata cpufreq_cpu_notifier = { .notifier_call = cpufreq_cpu_callback, }; -#endif /* CONFIG_HOTPLUG_CPU */ /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * diff --git a/fs/buffer.c b/fs/buffer.c index a8ca0ac2148..517860f2d75 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2972,7 +2972,6 @@ init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags) } } -#ifdef CONFIG_HOTPLUG_CPU static void buffer_exit_cpu(int cpu) { int i; @@ -2994,7 +2993,6 @@ static int buffer_cpu_notify(struct notifier_block *self, buffer_exit_cpu((unsigned long)hcpu); return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ void __init buffer_init(void) { diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f02d71bf689..71dc6ba4f73 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -89,9 +89,9 @@ int cpu_down(unsigned int cpu); #define lock_cpu_hotplug() do { } while (0) #define unlock_cpu_hotplug() do { } while (0) #define lock_cpu_hotplug_interruptible() 0 -#define hotcpu_notifier(fn, pri) do { } while (0) -#define register_hotcpu_notifier(nb) do { } while (0) -#define unregister_hotcpu_notifier(nb) do { } while (0) +#define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define register_hotcpu_notifier(nb) do { (void)(nb); } while (0) +#define unregister_hotcpu_notifier(nb) do { (void)(nb); } while (0) /* CPUs don't go offline once they're online w/o 
CONFIG_HOTPLUG_CPU */ static inline int cpu_is_offline(int cpu) { return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bd1e89c4c96..9b62b4c03ad 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2044,7 +2044,6 @@ out: return err; } -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) /* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, @@ -2108,9 +2107,7 @@ static void common_cpu_mem_hotplug_unplug(void) mutex_unlock(&callback_mutex); mutex_unlock(&manage_mutex); } -#endif -#ifdef CONFIG_HOTPLUG_CPU /* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent @@ -2127,7 +2124,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb, common_cpu_mem_hotplug_unplug(); return 0; } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* diff --git a/kernel/profile.c b/kernel/profile.c index 04fd84e8cdb..0961d93e1d9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -319,7 +319,6 @@ out: put_cpu(); } -#ifdef CONFIG_HOTPLUG_CPU static int __devinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { @@ -372,10 +371,10 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) +#define profile_cpu_callback NULL void profile_hits(int type, void *__pc, unsigned int nr_hits) { diff --git a/kernel/sched.c b/kernel/sched.c index 75a005ed4ed..c83f531c288 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6740,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, sched_smt_power_savings_store); #endif - -#ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. 
The domains * and groups cannot be updated in place without racing with the balancing @@ -6774,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_OK; } -#endif void __init sched_init_smp(void) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5484d6e045c..c5257316f4b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -655,7 +655,6 @@ int current_is_keventd(void) } -#ifdef CONFIG_HOTPLUG_CPU /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { @@ -738,7 +737,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#endif void init_workqueues(void) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e2cefabb5aa..d69ddbe4386 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -996,7 +996,6 @@ static __init void radix_tree_init_maxindex(void) height_to_maxindex[i] = __maxindex(i); } -#ifdef CONFIG_HOTPLUG_CPU static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -1016,7 +1015,6 @@ static int radix_tree_callback(struct notifier_block *nfb, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ void __init radix_tree_init(void) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2273952300d..27ec7a1b802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -701,7 +701,6 @@ void drain_node_pages(int nodeid) } #endif -#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { unsigned long flags; @@ -723,7 +722,6 @@ static void __drain_pages(unsigned int cpu) } } } -#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_PM @@ -2907,7 +2905,6 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#ifdef CONFIG_HOTPLUG_CPU static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -2922,7 +2919,6 @@ static int 
page_alloc_cpu_notify(struct notifier_block *self, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ void __init page_alloc_init(void) { diff --git a/mm/swap.c b/mm/swap.c index 017e72ca9bb..2ed7be39795 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -514,5 +514,7 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#ifdef CONFIG_HOTPLUG_CPU hotcpu_notifier(cpu_swap_callback, 0); +#endif } diff --git a/mm/vmscan.c b/mm/vmscan.c index f6616e81fac..093f5fe6dd7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1513,7 +1513,6 @@ out: } #endif -#ifdef CONFIG_HOTPLUG_CPU /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, @@ -1534,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ /* * This kswapd start function will be called by init and node-hot-add. 
diff --git a/net/core/dev.c b/net/core/dev.c index 59d058a3b50..e660cb57e42 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3340,7 +3340,6 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); -#ifdef CONFIG_HOTPLUG_CPU static int dev_cpu_callback(struct notifier_block *nfb, unsigned long action, void *ocpu) @@ -3384,7 +3383,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_NET_DMA /** diff --git a/net/core/flow.c b/net/core/flow.c index 104c25d00a1..d137f971f97 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -340,7 +340,6 @@ static void __devinit flow_cache_cpu_prepare(int cpu) tasklet_init(tasklet, flow_cache_flush_tasklet, 0); } -#ifdef CONFIG_HOTPLUG_CPU static int flow_cache_cpu(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -349,7 +348,6 @@ static int flow_cache_cpu(struct notifier_block *nfb, __flow_cache_shrink((unsigned long)hcpu, 0); return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ static int __init flow_cache_init(void) { -- cgit v1.2.3-70-g09d2 From cd6ed52568e161ce924593ebc798050a2d23cca0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 6 Dec 2006 20:40:06 -0800 Subject: [PATCH] arch/i386/kernel/reboot.c should #include Every file should #include the headers containing the prototypes for its global functions. 
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/reboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 84278e0093a..3514b4153f7 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 85916f8166b59eeac63d2b4f7f1df8de849334b4 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 6 Dec 2006 20:40:41 -0800 Subject: [PATCH] Kexec / Kdump: Unify elf note code The elf note saving code is currently duplicated over several architectures. This cleanup patch simply adds code to a common file and then replaces the arch-specific code with calls to the newly added code. The only drawback with this approach is that s390 doesn't fully support kexec-on-panic which for that arch leads to introduction of unused code. Signed-off-by: Magnus Damm Cc: Vivek Goyal Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/crash.c | 66 ++----------------------------------------- arch/powerpc/kernel/crash.c | 59 ++------------------------------------ arch/x86_64/kernel/crash.c | 69 ++------------------------------------------- include/linux/kexec.h | 1 + kernel/kexec.c | 56 ++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 188 deletions(-) (limited to 'arch/i386/kernel') diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 144b4328896..a5e0e990ea9 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -31,68 +31,6 @@ /* This keeps a track of which one is crashing cpu. 
*/ static int crashing_cpu; -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, &note, sizeof(note)); - buf += (sizeof(note) +3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, &note, sizeof(note)); -} - -static void crash_save_this_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= NR_CPUS)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new.
- */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); - final_note(buf); -} - -static void crash_save_self(struct pt_regs *regs) -{ - int cpu; - - cpu = safe_smp_processor_id(); - crash_save_this_cpu(regs, cpu); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; @@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self, crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; } - crash_save_this_cpu(regs, cpu); + crash_save_cpu(regs, cpu); disable_local_APIC(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs) #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); #endif - crash_save_self(regs); + crash_save_cpu(regs, safe_smp_processor_id()); } diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 89b03c8da9d..d3f2080d2ee 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -46,61 +46,6 @@ int crashing_cpu = -1; static cpumask_t cpus_in_crash = CPU_MASK_NONE; cpumask_t cpus_in_sr = CPU_MASK_NONE; -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, &note, sizeof(note)); - buf += (sizeof(note) +3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, &note, sizeof(note)); -} - -static void crash_save_this_cpu(struct pt_regs *regs, int cpu) -{ - struct
elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= NR_CPUS)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that that no need to invent something new. - */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); - final_note(buf); -} - #ifdef CONFIG_SMP static atomic_t enter_on_soft_reset = ATOMIC_INIT(0); @@ -113,7 +58,7 @@ void crash_ipi_callback(struct pt_regs *regs) hard_irq_disable(); if (!cpu_isset(cpu, cpus_in_crash)) - crash_save_this_cpu(regs, cpu); + crash_save_cpu(regs, cpu); cpu_set(cpu, cpus_in_crash); /* @@ -306,7 +251,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs) * such that another IPI will not be sent. */ crashing_cpu = smp_processor_id(); - crash_save_this_cpu(regs, crashing_cpu); + crash_save_cpu(regs, crashing_cpu); crash_kexec_prepare_cpus(crashing_cpu); cpu_set(crashing_cpu, cpus_in_crash); if (ppc_md.kexec_cpu_down) diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 3525f884af8..95a7a2c1313 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -28,71 +28,6 @@ /* This keeps a track of which one is crashing cpu. 
*/ static int crashing_cpu; -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, - void *data, size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, &note, sizeof(note)); - buf += (sizeof(note) +3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, &note, sizeof(note)); -} - -static void crash_save_this_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= NR_CPUS)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, no need to invent something new.
- */ - - buf = (u32*)per_cpu_ptr(crash_notes, cpu); - - if (!buf) - return; - - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); - final_note(buf); -} - -static void crash_save_self(struct pt_regs *regs) -{ - int cpu; - - cpu = smp_processor_id(); - crash_save_this_cpu(regs, cpu); -} - #ifdef CONFIG_SMP static atomic_t waiting_for_crash_ipi; @@ -117,7 +52,7 @@ static int crash_nmi_callback(struct notifier_block *self, return NOTIFY_STOP; local_irq_disable(); - crash_save_this_cpu(regs, cpu); + crash_save_cpu(regs, cpu); disable_local_APIC(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -196,5 +131,5 @@ void machine_crash_shutdown(struct pt_regs *regs) disable_IO_APIC(); - crash_save_self(regs); + crash_save_cpu(regs, smp_processor_id()); } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index a4ede62b339..e3abcec6c51 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -105,6 +105,7 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); +void crash_save_cpu(struct pt_regs *regs, int cpu); extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/kernel/kexec.c b/kernel/kexec.c index d43692cf232..afbbbe981be 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -1066,6 +1068,60 @@ void crash_kexec(struct pt_regs *regs) } } +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf +=
(note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, &note, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= NR_CPUS)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32*)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + final_note(buf); +} + static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ -- cgit v1.2.3-70-g09d2